Ver Fonte

Merge branch '2024-0515-rank-test'
master
.

罗俊辉 há 1 ano atrás
pai
commit
fa9e34849c

+ 3 - 3
app.py

@@ -2,13 +2,13 @@
 @author: luojunhui
 """
 from quart import Quart
-from applications.log import logging
+from applications.functions.log import logging
 from applications.routes import my_blueprint
 
 # 初始化 App
 app = Quart(__name__, static_folder='applications/static')
 logging(
-    code="1000",
+    code="0000",
     info="APP Initialization Complete",
     function="app"
 )
@@ -16,7 +16,7 @@ logging(
 # 注册蓝图
 app.register_blueprint(my_blueprint)
 logging(
-    code="1000",
+    code="0000",
     info="Blue Print Initialization Complete",
     function="app"
 )

+ 0 - 50
applications/functions/ask_kimi.py

@@ -1,50 +0,0 @@
-"""
-@author: luojunhui
-"""
-"""
-@author: luojunhui
-"""
-import json
-from openai import OpenAI
-
-
-def ask_kimi(question):
-    """
-    Ask Kimi for information
-    :param question: tiny text
-    :return: "{}"
-    """
-    single_title_prompt = """
-        我会给你一个视频标题,需要你帮我用你所学的知识来帮我分析出以下信息,信息我都写到 json 里面了
-        {
-            "key_words": [],  # 返回三个关键词
-            "search_keys": [], # 标题可能的搜索关键词,返回 3 个
-            "extra_keys": [], # 关心这个视频的用户还会关心哪些关键词, 返回 3 个
-            "theme": 标题的主题, 用一个词概括
-        }
-        只需要返回一个 json,key 和上面的一样,
-        我给你的标题是: 
-        """
-    client = OpenAI(
-        api_key='sk-tz1VaKqksTzk0F8HxlU4YVGwj7oa1g0c0puGNUZrdn9MDtzm',
-        base_url="https://api.moonshot.cn/v1"
-    )
-    chat_completion = client.chat.completions.create(
-        messages=[
-            {
-                "role": "user",
-                "content": single_title_prompt + question,
-            }
-        ],
-        model="moonshot-v1-8k",
-    )
-    response = chat_completion.choices[0].message.content
-    try:
-        response = json.loads(response)
-        return response
-    except:
-        return {}
-
-
-
-

+ 0 - 54
applications/functions/auto_white.py

@@ -1,54 +0,0 @@
-"""
-@author: luojunhui
-"""
-import json
-import requests
-
-
-def get_cookie():
-    """
-    获取 cookie
-    :return:
-    """
-    url = "https://admin.piaoquantv.com/manager/login?account=luojunhui&passWd=e10adc3949ba59abbe56e057f20f883e&muid=7"
-    payload = {}
-    headers = {
-        'accept': 'application/json, text/plain, */*',
-        'accept-language': 'en',
-        'priority': 'u=1, i',
-        'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"macOS"',
-        'sec-fetch-dest': 'empty',
-        'sec-fetch-mode': 'cors',
-        'sec-fetch-site': 'same-origin',
-        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
-    }
-    response = requests.request("GET", url, headers=headers, data=payload)
-    return response.cookies.values()[0]
-
-
-def auto_white(root_share_id):
-    """
-    自动加入白名单, 保证公众号百分百出广告
-    :param root_share_id:
-    :return:
-    """
-    url = "https://admin.piaoquantv.com/manager/ad/own/white/rootShare/save"
-    dd = {
-        "rootShareId": root_share_id,
-        "commit": "算法自动加入白名单--"
-    }
-    payload = json.dumps(dd)
-    cookie = get_cookie()
-    headers = {
-        'accept': 'application/json',
-        'accept-language': 'en',
-        'content-type': 'application/json;',
-        'cookie': "SESSION=" + cookie,
-        'origin': 'https://admin.piaoquantv.com',
-        'priority': 'u=1, i',
-        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
-    }
-    response = requests.request("POST", url, headers=headers, data=payload)
-    return response.json()['content']

+ 0 - 147
applications/functions/calculate.py

@@ -1,147 +0,0 @@
-"""
-@author: luojunhui
-"""
-import json
-import os
-
-from applications.log import logging
-from applications.functions.date import generate_daily_strings, five_days_before
-
-
-def read_single_file(filename):
-    """
-    :param filename:
-    """
-    with open(filename, encoding="utf-8") as f:
-        data = json.loads(f.read())
-    if data:
-        return data
-    else:
-        return {}
-
-
-def compute_similarity(file_1, file_2):
-    """
-    计算
-    :param file_1:
-    :param file_2:
-    :return:
-    """
-    data_1 = read_single_file(file_1)
-    data_2 = read_single_file(file_2)
-
-    def calculate_v1(d1, d2):
-        """
-        通过交并集来判断
-        :param d1:
-        :param d2:
-        :return:
-        """
-        f1_keys = set(d1["key_words"])
-        f2_keys = set(d2["key_words"])
-        keys_union = f1_keys | f2_keys
-        keys_intersection = f1_keys & f2_keys
-        f1_search_keys = set(d1["search_keys"])
-        f2_search_keys = set(d2["search_keys"])
-        search_keys_union = f1_search_keys | f2_search_keys
-        search_keys_intersection = f1_search_keys & f2_search_keys
-        f1_extra_keys = set(d1["extra_keys"])
-        f2_extra_keys = set(d2["extra_keys"])
-        extra_keys_union = f1_extra_keys | f2_extra_keys
-        extra_keys_intersection = f1_extra_keys & f2_extra_keys
-        score_1 = len(keys_intersection) / len(keys_union)
-        score_2 = len(search_keys_intersection) / len(search_keys_union)
-        score_3 = len(extra_keys_intersection) / len(extra_keys_union)
-        return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2
-
-    def calculate_v2(d1, d2):
-        """
-        计算方法 v2
-        :param d1:
-        :param d2:
-        :return:
-        """
-        score = 0
-        tone_1 = d1["tone"]
-        tone_2 = d2["tone"]
-        if tone_1 == tone_2:
-            score += 0.1
-        target_audience_1 = d1["target_audience"]
-        target_audience_2 = d2["target_audience"]
-        if target_audience_1 == target_audience_2:
-            score += 0.2
-        target_age_1 = d1["target_age"]
-        target_age_2 = d2["target_age"]
-        if target_age_1 == target_age_2:
-            score += 0.2
-        address_1 = d1["address"]
-        address_2 = d2["address"]
-        if address_1 == address_2:
-            score += 0.2
-        gender_1 = d1["theme"]
-        gender_2 = d2["theme"]
-        if gender_1 == gender_2:
-            score += 0.5
-        return score
-
-    if data_1 and data_2:
-        try:
-            score_1 = calculate_v1(data_1, data_2)
-            return score_1
-            # score_2 = calculate_v2(data_1, data_2)
-            # return score_1, score_2
-        except Exception as e:
-            return 0
-    else:
-        return 0
-
-
-def title_mix(title_p, dt, trace_id):
-    """
-    执行代码
-    :param trace_id: 请求唯一 id
-    :param title_p:
-    :param dt: dt
-    """
-    five_days_ago = five_days_before(ori_dt=dt)
-    days_list = generate_daily_strings(five_days_ago, dt)
-    L = []
-    for day_str in days_list:
-        json_path = os.path.join(os.getcwd(), 'applications', 'static', day_str)
-        # 处理标题信息
-        files = os.listdir(json_path)
-        for file in files:
-            if file.endswith(".json"):
-                L.append(os.path.join(json_path, file))
-    print("召回的视频量", len(L))
-    score_list_1 = []
-    # score_list_2 = []
-    for file in L:
-        file_name = file.split('/')[-1].replace(".json", "")
-        v_id = file_name.split('_')[1]
-        uid = file_name.split('_')[0]
-        # score1, score2 = compute_similarity(title_p, file)
-        score1 = compute_similarity(title_p, file)
-        score_list_1.append([score1, v_id, uid])
-        # score_list_2.append([score2, v_id, uid])
-
-    s1_list = sorted(score_list_1, key=lambda x: x[0], reverse=True)
-    # s2_list = sorted(score_list_2, key=lambda x: x[0], reverse=True)
-    title = title_p.split("/")[-1].replace(".json", "")
-    obj = {
-        "title": title,
-        "s1_vid": s1_list[0][1],
-        "s1_score": s1_list[0][0],
-        "s1_uid": s1_list[0][2],
-        # "s2_vid": s2_list[0][1],
-        # "s2_score": s2_list[0][0],
-        # "s2_uid": s2_list[0][2]
-    }
-    logging(
-        code="1003",
-        info="计算结果得分",
-        data=obj,
-        function="title_mix",
-        trace_id=trace_id
-    )
-    return obj

+ 241 - 34
applications/functions/common.py

@@ -1,50 +1,257 @@
+# encoding: utf-8
 """
 @author: luojunhui
 """
 import json
+import time
 import uuid
 import requests
+import pymysql
 import urllib.parse
 
-from applications.functions.auto_white import auto_white
+from applications.functions.log import logging
 
 
-def create_gzh_path(video_id, shared_uid):
+class Functions(object):
     """
-    :param video_id: 视频 id
-    :param shared_uid: 分享 id
+    通用工具代码
     """
-    root_share_id = str(uuid.uuid4())
-    url = f"pages/user-videos?id={video_id}&su={shared_uid}&fromGzh=1&rootShareId={root_share_id}&shareId={root_share_id}"
-    # 自动把 root_share_id 加入到白名单
-    auto_white(root_share_id)
-    return root_share_id, f"pages/category?jumpPage={urllib.parse.quote(url, safe='')}"
 
+    # 自动加入白名单逻辑
+    @classmethod
+    def auto_white(cls, root_share_id):
+        """
+        Add a root share id to the ad white list through the admin backend.
+        (Original note: guarantees the official account always shows ads.)
+        :param root_share_id: id sent as "rootShareId" in the save request
+        :return: the 'content' field of the JSON response
+        """
+
-def request_for_info(video_id):
-    """
-    请求数据
-    :param video_id:
-    :return:
-    """
-    url = "https://longvideoapi.piaoquantv.com/longvideoapi/openapi/video/batchSelectVideoInfo"
-    data = {
-        "videoIdList": [video_id]
-    }
-    header = {
-        "Content-Type": "application/json",
-    }
-    response = requests.post(url, headers=header, data=json.dumps(data))
-    return response.json()
-
-
-def choose_video(result):
+        def get_cookie():
+            """
+            Call the admin login URL and return a cookie value from the response.
+            :return: first value in the response cookie jar
+            """
+            # NOTE(review): the account name and password hash are hard-coded in
+            # this URL — consider loading them from configuration instead.
+            url = "https://admin.piaoquantv.com/manager/login?account=luojunhui&passWd=e10adc3949ba59abbe56e057f20f883e&muid=7"
+            payload = {}
+            headers = {
+                'accept': 'application/json, text/plain, */*',
+                'accept-language': 'en',
+                'priority': 'u=1, i',
+                'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
+                'sec-ch-ua-mobile': '?0',
+                'sec-ch-ua-platform': '"macOS"',
+                'sec-fetch-dest': 'empty',
+                'sec-fetch-mode': 'cors',
+                'sec-fetch-site': 'same-origin',
+                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
+            }
+            response = requests.request("GET", url, headers=headers, data=payload)
+            # Indexing [0] raises IndexError when the response set no cookies.
+            return response.cookies.values()[0]
+
+        url = "https://admin.piaoquantv.com/manager/ad/own/white/rootShare/save"
+        dd = {
+            "rootShareId": root_share_id,
+            "commit": "算法自动加入白名单--"
+        }
+        payload = json.dumps(dd)
+        # A fresh login is performed on every call; the cookie is not cached.
+        cookie = get_cookie()
+        headers = {
+            'accept': 'application/json',
+            'accept-language': 'en',
+            'content-type': 'application/json;',
+            'cookie': "SESSION=" + cookie,
+            'origin': 'https://admin.piaoquantv.com',
+            'priority': 'u=1, i',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
+        }
+        response = requests.request("POST", url, headers=headers, data=payload)
+        return response.json()['content']
+
+    # 创建公众号分享卡片
+    @classmethod
+    def create_gzh_path(cls, video_id, shared_uid):
+        """
+        Build the mini-program jump path for an official-account share card.
+
+        A fresh UUID is used as both rootShareId and shareId, and is
+        white-listed through ``auto_white`` before the path is returned.
+
+        :param video_id: video id placed in the ``id`` query field
+        :param shared_uid: sharer id placed in the ``su`` query field
+        :return: tuple ``(root_share_id, jump_path)``
+        """
+        share_id = str(uuid.uuid4())
+        inner_page = (
+            f"pages/user-videos?id={video_id}&su={shared_uid}"
+            f"&fromGzh=1&rootShareId={share_id}&shareId={share_id}"
+        )
+        # Register the new share id in the white list before handing it out.
+        cls.auto_white(share_id)
+        encoded_page = urllib.parse.quote(inner_page, safe='')
+        return share_id, f"pages/category?jumpPage={encoded_page}"
+
+    # 从票圈请求视频
+    @classmethod
+    def request_for_info(cls, video_id):
+        """
+        Fetch video info for a single video id from the batch-select endpoint.
+
+        :param video_id: id sent as the only element of "videoIdList"
+        :return: decoded JSON body of the response
+        """
+        endpoint = "https://longvideoapi.piaoquantv.com/longvideoapi/openapi/video/batchSelectVideoInfo"
+        body = json.dumps({"videoIdList": [video_id]})
+        resp = requests.post(
+            endpoint,
+            headers={"Content-Type": "application/json"},
+            data=body,
+        )
+        return resp.json()
+
+    # 清理标题
+    @classmethod
+    def clean_title(cls, strings):
+        """
+        Strip a title and remove characters that are unwanted in it.
+
+        Replacements are applied one after another in the order listed below;
+        "." becomes the full-width "。", everything else is deleted.
+
+        :param strings: raw title text
+        :return: cleaned title
+        """
+        # (old, new) pairs, applied sequentially. Order matters around the
+        # multi-character "&NBSP" entry, so it must not be rearranged.
+        substitutions = (
+            ("\n", ""),
+            ("/", ""),
+            ("\r", ""),
+            ("#", ""),
+            (".", "。"),
+            ("\\", ""),
+            ("&NBSP", ""),
+            (":", ""),
+            ("*", ""),
+            ("?", ""),
+            ('"', ""),
+            ("<", ""),
+            (">", ""),
+            ("|", ""),
+            (" ", ""),
+            ("'", ""),
+        )
+        cleaned = strings.strip()
+        for old, new in substitutions:
+            cleaned = cleaned.replace(old, new)
+        return cleaned
+
+
+class MySQLServer(object):
     """
-    :param result: 计算出来的结果
-    :return: uid, video_id
+    MySql 服务
     """
-    score1 = result['s1_score']
-    if score1 > 0:
-        return result['s1_uid'], result['s1_vid']
-    else:
-        return None, None
+
+    @classmethod
+    def select_download_videos(cls, trace_id):
+        """
+        Look up the crawled video whose out_user_id and video_title both equal
+        ``trace_id``.
+
+        When the first matching row carries video_id == 0, the id is obtained
+        from ``search_id_to_video`` instead, which polls until it is non-zero.
+
+        :param trace_id: unique request id, bound as a query parameter
+        :return: dict with "search_videos" ("success" / "failed"), "trace_id"
+                 and "video_list" (one id on success, empty on failure)
+        """
+        # Bind trace_id as a parameter rather than formatting it into the SQL
+        # text, so quotes inside the value cannot alter the statement.
+        sql = "select video_id from crawler_video where out_user_id = %s and video_title = %s;"
+        connection = pymysql.connect(
+            host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # database host (intranet address)
+            port=3306,  # port
+            user="crawler",  # mysql user name
+            passwd="crawler123456@",  # mysql password
+            db="piaoquan-crawler",  # database name
+            charset="utf8mb4"  # connection character set
+        )
+        try:
+            with connection.cursor() as cursor:
+                cursor.execute(sql, (str(trace_id), str(trace_id)))
+                out_video_list = cursor.fetchall()
+        finally:
+            # Release the connection even when the query raises.
+            connection.close()
+
+        if len(out_video_list) > 0:
+            if out_video_list[0][0] == 0:
+                video_id = cls.search_id_to_video(trace_id)
+            else:
+                video_id = out_video_list[0][0]
+
+            vid_list = [video_id]
+            logging(
+                code="2003",
+                trace_id=trace_id,
+                info="recall_search_list",
+                function="find_videos_in_mysql",
+                data=vid_list
+            )
+            return {
+                "search_videos": "success",
+                "trace_id": trace_id,
+                "video_list": vid_list
+            }
+        else:
+            return {
+                "search_videos": "failed",
+                "trace_id": trace_id,
+                "video_list": []
+            }
+
+    @classmethod
+    def select_pq_videos(cls):
+        """
+        Load every row of ``video_content`` and decode its JSON columns.
+
+        :return: list of dicts with "video_id", "key_words", "search_keys"
+                 and "extra_keys" (the last three parsed with ``json.loads``)
+        """
+        connection = pymysql.connect(
+            host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",  # database host (intranet address)
+            port=3306,  # port
+            user="wx2016_longvideo",  # mysql user name
+            passwd="wx2016_longvideoP@assword1234",  # mysql password
+            db="incentive",  # database name
+            charset="utf8mb4"  # connection character set
+        )
+        sql = "select video_id, key_words, search_keys, extra_keys from video_content"
+        try:
+            with connection.cursor() as cursor:
+                cursor.execute(sql)
+                data = cursor.fetchall()
+        finally:
+            # Release the connection even when the query raises.
+            connection.close()
+        result = [
+            {
+                "video_id": line[0],
+                "key_words": json.loads(line[1]),
+                "search_keys": json.loads(line[2]),
+                "extra_keys": json.loads(line[3]),
+            }
+            for line in data
+        ]
+        return result
+
+    # 敏感词
+    @classmethod
+    def select_sensitive_words(cls):
+        """
+        Load the sensitive-word list from ``wx_sensitive_word``.
+
+        Only rows with ``data_status`` = 0 are selected.
+
+        :return: list of ``keyword`` column values
+        """
+        connection = pymysql.connect(
+            host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",  # database host (intranet address)
+            port=3306,  # port
+            user="wx2016_longvideo",  # mysql user name
+            passwd="wx2016_longvideoP@assword1234",  # mysql password
+            db="longvideo",  # database name
+            charset="utf8mb4"  # connection character set
+        )
+        sql = "select `keyword` from wx_sensitive_word where `data_status` = 0"
+        try:
+            with connection.cursor() as cursor:
+                cursor.execute(sql)
+                data = cursor.fetchall()
+        finally:
+            # Release the connection even when the query raises.
+            connection.close()
+        return [line[0] for line in data]
+
+    @classmethod
+    def search_id_to_video(cls, trace_id):
+        """
+        Poll ``crawler_video`` until the row matching ``trace_id`` has a
+        non-zero video_id, then return that id.
+
+        The query is repeated once per second for as long as the first
+        matching row still holds 0.
+
+        :param trace_id: unique request id, bound as a query parameter
+        :return: video_id of the first matching row
+        :raises IndexError: when no row matches ``trace_id``
+        """
+        # NOTE(review): there is no upper bound on the number of polls; a row
+        # that never gets a video_id keeps the caller waiting forever.
+        sql = "select video_id from crawler_video where out_user_id = %s and video_title = %s;"
+        while True:
+            # A new connection per poll, as before: each attempt reads in its
+            # own session and the previous one is closed instead of leaked.
+            connection = pymysql.connect(
+                host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # database host (intranet address)
+                port=3306,  # port
+                user="crawler",  # mysql user name
+                passwd="crawler123456@",  # mysql password
+                db="piaoquan-crawler",  # database name
+                charset="utf8mb4"  # connection character set
+            )
+            try:
+                with connection.cursor() as cursor:
+                    cursor.execute(sql, (str(trace_id), str(trace_id)))
+                    out_video_list = cursor.fetchall()
+            finally:
+                connection.close()
+            video_id = out_video_list[0][0]
+            if int(video_id) != 0:
+                return video_id
+            # Looping replaces the former self-recursion, which would hit the
+            # interpreter recursion limit on a long wait.
+            time.sleep(1)

+ 0 - 33
applications/functions/date.py

@@ -1,33 +0,0 @@
-"""
-@author: luojunhui
-"""
-from datetime import datetime, timedelta
-
-
-def five_days_before(ori_dt):
-    """
-    Generate date in 3 days
-    :param ori_dt:
-    :return:
-    """
-    now_date = datetime.strptime(ori_dt, "%Y%m%d")
-    seven_before = now_date - timedelta(days=5)
-    return seven_before.strftime("%Y%m%d")
-
-
-def generate_daily_strings(start_date, end_date):
-    """
-    Generate daily date_str
-    :param start_date:
-    :param end_date:
-    :return:
-    """
-    start = datetime.strptime(start_date, "%Y%m%d")
-    end = datetime.strptime(end_date, "%Y%m%d")
-    current = start
-    date_strings = []
-    while current <= end:
-        date_strings.append(current.strftime("%Y%m%d"))
-        current += timedelta(days=1)
-    return date_strings
-

+ 190 - 0
applications/functions/kimi.py

@@ -0,0 +1,190 @@
+"""
+@author: luojunhui
+"""
+import os
+import json
+from openai import OpenAI
+
+from applications.functions.log import logging
+
+
+class KimiServer(object):
+    """
+    Helpers that query the Moonshot ("Kimi") chat-completion API.
+
+    Every public method is a coroutine classmethod. The underlying OpenAI
+    client call is made without ``await``, so a coroutine does not yield while
+    its request is in flight.
+    """
+
+    # NOTE(review): the API key is committed in source — it should be rotated
+    # and loaded from configuration.
+    _API_KEY = 'sk-tz1VaKqksTzk0F8HxlU4YVGwj7oa1g0c0puGNUZrdn9MDtzm'
+    _BASE_URL = "https://api.moonshot.cn/v1"
+    _MODEL = "moonshot-v1-8k"
+
+    @classmethod
+    def _chat(cls, content):
+        """Send one user message to the model and return the reply text."""
+        client = OpenAI(
+            api_key=cls._API_KEY,
+            base_url=cls._BASE_URL
+        )
+        chat_completion = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": content,
+                }
+            ],
+            model=cls._MODEL,
+        )
+        return chat_completion.choices[0].message.content
+
+    @staticmethod
+    def _parse_json_reply(reply):
+        """Drop markdown code fences from a reply and decode it; {} if it is not JSON."""
+        stripped = reply.replace('```json', '').replace('```', '')
+        try:
+            return json.loads(stripped)
+        except ValueError:
+            # json.JSONDecodeError is a ValueError: the model answered with
+            # something other than a JSON document.
+            return {}
+
+    @classmethod
+    async def search_kimi_schedule(cls, params):
+        """
+        Run the Kimi steps of the search stage.
+
+        Caches the title analysis on disk (skipped when the file already
+        exists), rewrites the title, and mines the article content.
+
+        :param params: dict with 'title', 'content' and 'trace_id'; only the
+                       part of the title after the last "@@" is used
+        :return: result of ``kimi_mining`` with 'k_title' (rewritten title)
+                 and 'ori_title' (original title) added
+        """
+        title = params['title'].split("@@")[-1]
+        contents = params['content']
+        trace_id = params['trace_id']
+        # NOTE(review): the title is used verbatim as a file name; a title
+        # containing path separators changes where the file is written.
+        title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
+        if os.path.exists(title_p):
+            logging(
+                code="2001",
+                info="该标题已经被 kimi 处理过,跳过请求 kimi 操作--- {}".format(title),
+                function="search_videos_from_the_web",
+                trace_id=trace_id
+            )
+        else:
+            await cls.ask_kimi_and_save_to_local((title, trace_id, title_p))
+        kimi_title = await cls.kimi_title(title)
+        kimi_info = await cls.kimi_mining(contents)
+        kimi_info['k_title'] = kimi_title
+        kimi_info['ori_title'] = title
+        logging(
+            code="8000",
+            info="kimi_mining",
+            data=kimi_info,
+            trace_id=trace_id
+        )
+        return kimi_info
+
+    @classmethod
+    async def ask_kimi(cls, question):
+        """
+        Ask Kimi to analyse a video title.
+
+        :param question: title text appended to the analysis prompt
+        :return: decoded JSON reply, or {} when the reply is not valid JSON
+        """
+        single_title_prompt = """
+            我会给你一个视频标题,需要你帮我用你所学的知识来帮我分析出以下信息,信息我都写到 json 里面了
+            {
+                "key_words": [],  # 返回三个关键词
+                "search_keys": [], # 标题可能的搜索关键词,返回 3 个
+                "extra_keys": [], # 关心这个视频的用户还会关心哪些关键词, 返回 3 个
+                "theme": 标题的主题, 用一个词概括
+            }
+            只需要返回一个 json,key 和上面的一样,
+            我给你的标题是: 
+            """
+        reply = cls._chat(single_title_prompt + question)
+        return cls._parse_json_reply(reply)
+
+    @classmethod
+    async def ask_kimi_and_save_to_local(cls, info_tuple):
+        """
+        Analyse a title with ``ask_kimi`` and write the result to a JSON file.
+
+        Nothing is requested or written when the target file already exists.
+        An empty title is stored as {} without calling the API.
+
+        :param info_tuple: ``(title, trace_id, save_path)``
+        :return: None
+        """
+        title, trace_id, save_path = info_tuple[0], info_tuple[1], info_tuple[2]
+        if os.path.exists(save_path):
+            logging(
+                code="2001",
+                info="该 video 信息已经挖掘完成---{}".format(title),
+                function="ask_kimi_and_save_to_local",
+                trace_id=trace_id,
+            )
+        else:
+            os.makedirs(os.path.dirname(save_path), exist_ok=True)
+            if not title:
+                result = {}
+            else:
+                result = await cls.ask_kimi(title)
+            logging(
+                code="2001",
+                info="kimi-result",
+                data=result,
+                trace_id=trace_id,
+                function="ask_kimi_and_save_to_local"
+            )
+            with open(save_path, "w", encoding="utf-8") as f:
+                f.write(json.dumps(result, ensure_ascii=False))
+
+    @classmethod
+    async def kimi_title(cls, ori_title):
+        """
+        Ask Kimi to rewrite a title following the mini-program title rules.
+
+        :param ori_title: original title, placed before the prompt
+        :return: first line of the model reply
+        """
+        single_title_prompt = """
+        请将以上标题改写成适合小程序点击和传播的小程序标题,小程序标题的写作规范如下,请学习后进行小程序标题的编写。直接输出最终的小程序标题
+        小程序标题写作规范:
+        1.要点前置:将最重要的信息放在标题的最前面,以快速吸引读者的注意力。例如,“5月一辈子同学,三辈子亲,送给我的老同学,听哭无数人!”中的“5月”和“一辈子同学,三辈子亲”都是重要的信息点。
+        2.激发情绪:使用能够触动人心的语言,激发读者的情感共鸣。如“只剩两人同学聚会,看后感动落泪。”使用“感动落泪”激发读者的同情和怀旧情绪。
+        3.使用数字和特殊符号:数字可以提供具体性,而特殊符号如“🔴”、“😄”、“🔥”等可以吸引视觉注意力,增加点击率。
+        4.悬念和好奇心:创建悬念或提出问题,激发读者的好奇心。例如,“太神奇了!长江水位下降,重庆出现惊奇一幕!”中的“惊奇一幕”就是一个悬念。
+        5.名人效应:如果内容与知名人士相关,提及他们的名字可以增加标题的吸引力。
+        6.社会价值观:触及读者的文化和社会价值观,如家庭、友情、国家荣誉等。
+        7.标点符号的运用:使用感叹号、问号等标点来增强语气和情感表达。
+        8.直接的语言:使用直白、口语化的语言,易于理解,如“狗屁股,笑死我了!”。
+        9.热点人物或事件:提及当前的热点人物或事件,利用热点效应吸引读者。
+        10.字数适中:保持标题在10-20个字之间,既不过长也不过短,确保信息的完整性和吸引力。
+        11.适当的紧迫感:使用“最新”、“首次”、“紧急”等词汇,创造一种紧迫感,促使读者立即行动。
+        12.情感或价值诉求:使用如“感动”、“泪目”、“经典”等词汇,直接与读者的情感或价值观产生共鸣。
+        避免误导:确保标题准确反映内容,避免夸大或误导读者。
+        """
+        reply = cls._chat(ori_title + "\n" + single_title_prompt)
+        # Only the first line of the reply is kept as the new title.
+        return reply.split("\n")[0]
+
+    @classmethod
+    async def kimi_mining(cls, text):
+        """
+        Ask Kimi to extract keywords and a summary title from an article.
+
+        :param text: article text appended to the mining prompt
+        :return: decoded JSON reply, or {} when the reply is not valid JSON
+        """
+        text_prompt = """
+        请从我给你的文章中挖掘出以下信息并且返回如下结果。
+        你返回的结果是一个 json, 格式如下:
+        {
+            "content_keys": [] # 同时提供三个与文章内容高度相关的关键词,这些关键词将用于网络上搜索相关视频内容,
+            "content_title": 一个总结性的标题,该标题应简洁并能够反映文章的主要内容
+        }
+        你需要处理的文本是:
+        """
+        reply = cls._chat(text_prompt + text)
+        return cls._parse_json_reply(reply)

+ 17 - 2
applications/log.py → applications/functions/log.py

@@ -6,8 +6,18 @@ import json
 from aliyun.log import LogClient, PutLogsRequest, LogItem
 
 
-def logging(code, trace_id=None, info=None, port=None, alg=None, function=None, data=None):
+def logging(
+        code,
+        mode="prod",
+        trace_id=None,
+        info=None,
+        port=None,
+        alg=None,
+        function=None,
+        data=None
+):
     """
+    :param mode: 生产模式 or  测试模式
     :param trace_id: 请求唯一 id
     :param data: 信息
     :param code: 日志状态码
@@ -30,6 +40,7 @@ def logging(code, trace_id=None, info=None, port=None, alg=None, function=None,
     log_group = []
     log_item = LogItem()
     contents = [
+        (f"mode", str(mode)),
         (f"code", str(code)),
         (f"alg", str(alg)),
         (f"function", str(function)),
@@ -51,4 +62,8 @@ def logging(code, trace_id=None, info=None, port=None, alg=None, function=None,
         logitems=log_group,
         compress=False,
     )
-    client.put_logs(request)
+    try:
+        client.put_logs(request)
+    except Exception as e:
+        print("日志失败")
+        print(e)

+ 2 - 4
applications/mq.py → applications/functions/mq.py

@@ -22,13 +22,11 @@ class MQ(object):
 
     def send_msg(self, params):
         """
-        发送 mq,并且记录 redis
+        send msg to mq client
         """
-        account = params["ghId"]
-
         try:
             msg = TopicMessage(json.dumps(params))
-            message_key = account + str(uuid4())
+            message_key = str(uuid4())
             msg.set_message_key(message_key)
             re_msg = self.producer.publish_message(msg)
             print(re_msg)

+ 0 - 56
applications/functions/mysql.py

@@ -1,56 +0,0 @@
-"""
-@author: luojunhui
-mysql 方法
-"""
-import json
-
-import pymysql
-
-
-def select(sql):
-    """
-    查询
-    :param sql:
-    :return:
-    """
-    connection = pymysql.connect(
-        host="rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com",  # 数据库IP地址,内网地址
-        port=3306,  # 端口号
-        user="crawler",  # mysql用户名
-        passwd="crawler123456@",  # mysql用户登录密码
-        db="piaoquan-crawler",  # 数据库名
-        charset="utf8mb4"  # 如果数据库里面的文本是utf8编码的,charset指定是utf8
-    )
-    cursor = connection.cursor()
-    cursor.execute(sql)
-    data = cursor.fetchall()
-    return data
-
-
-def select_pq_videos():
-    """
-    查询
-    :return: info_list
-    """
-    connection = pymysql.connect(
-        host="rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com",  # 数据库IP地址,内网地址
-        port=3306,  # 端口号
-        user="wx2016_longvideo",  # mysql用户名
-        passwd="wx2016_longvideoP@assword1234",  # mysql用户登录密码
-        db="incentive",  # 数据库名
-        charset="utf8mb4"  # 如果数据库里面的文本是utf8编码的,charset指定是utf8
-    )
-    sql = "select video_id, key_words, search_keys, extra_keys from video_content"
-    cursor = connection.cursor()
-    cursor.execute(sql)
-    data = cursor.fetchall()
-    result = [
-        {
-            "video_id": line[0],
-            "key_words": json.loads(line[1]),
-            "search_keys": json.loads(line[2]),
-            "extra_keys": json.loads(line[3]),
-        }
-        for line in data
-    ]
-    return result

+ 0 - 35
applications/functions/odps.py

@@ -1,35 +0,0 @@
-"""
-@author: luojunhui
-"""
-
-from odps import ODPS
-
-
-class PyODPS(object):
-    """
-    PyODPS class, get data from odps server
-    """
-
-    def __init__(self):
-        self.endpoint = "http://service.cn.maxcompute.aliyun.com/api"
-        self.access_id = "LTAIWYUujJAm7CbH"
-        self.access_key = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
-        self.project = "loghubods"
-
-        self.od = ODPS(
-            access_id=self.access_id,
-            secret_access_key=self.access_key,
-            endpoint=self.endpoint,
-            project=self.project,
-        )
-
-    def select(self, sql):
-        """
-        :param sql: 查询语句
-        :return: odps_obj{}
-        """
-        result = []
-        with self.od.execute_sql(sql).open_reader() as reader:
-            for record in reader:
-                result.append(record)
-        return result

+ 243 - 0
applications/functions/video_item.py

@@ -0,0 +1,243 @@
+"""
+@author: luojunhui
+"""
+import time
+from applications.functions.mq import MQ
+from applications.functions.log import logging
+from applications.functions.common import Functions
+
+
+class VideoItem(object):
+    """
+    function: 当扫描进一条视频的时候,对该视频的基本信息进行处理,保证发送给 pipeline和 etl 的 video_dict 是正确的
+    __init__: 初始化空json 对象,用来存储视频信息
+    add_video_info: 把视频信息存储到 item 对象中
+    check_item: 检查 item 对象中的各个元素以及处理
+    """
+
+    def __init__(self):
+        self.item = {}
+
+    def add_video_info(self, key, value):
+        """
+        insert or update video info
+        :param key:
+        :param value:
+        """
+        self.item[key] = value
+
+    def check_item(self):
+        """
+        判断item 里面的字段,是否符合要求
+        字段分为 3 类:
+        1. 必须存在数据的字段: ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
+        2. 不存在默认为 0 的字段 :["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
+        3. 需要后出理的字段: video_title, publish_time
+        """
+        if self.item.get("video_title"):
+            self.item["video_title"] = Functions().clean_title(self.item["video_title"])
+        else:
+            return False
+        if self.item.get("publish_time_stamp"):
+            publish_time_str = time.strftime(
+                "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
+            )
+            self.add_video_info("publish_time_str", publish_time_str)
+        else:
+            publish_time_stamp = int(time.time())
+            publish_time_str = time.strftime(
+                "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
+            )
+            self.add_video_info("publish_time_stamp", publish_time_stamp)
+            self.add_video_info("publish_time_str", publish_time_str)
+        self.add_video_info("publish_time", publish_time_str)
+        if not self.item.get("update_time_stamp"):
+            self.add_video_info("update_time_stamp", int(time.time()))
+
+        # 如果不存在,默认值为 0
+        config_keys = [
+            "duration",
+            "play_cnt",
+            "like_cnt",
+            "comment_cnt",
+            "share_cnt",
+            "width",
+            "height",
+        ]
+        for config_key in config_keys:
+            if self.item.get(config_key):
+                continue
+            else:
+                self.add_video_info(config_key, 0)
+
+        # 必须存在的元素,若不存在则会报错
+        must_keys = [
+            "video_id",
+            "user_id",
+            "user_name",
+            "out_video_id",
+            "session",
+            "video_url",
+            "cover_url",
+            "platform",
+            "strategy",
+        ]
+        """
+        video_id, out_video_id 均为站外视频 id
+        usr_id: 站内用户 id
+        out_user_id: 站外用户 id
+        user_name: 站外用户名称
+        """
+        for m_key in must_keys:
+            if self.item.get(m_key):
+                continue
+            else:
+                # print(m_key)
+                return False
+        return True
+
+    def produce_item(self):
+        """
+        item producer
+        :return:
+        """
+        flag = self.check_item()
+        if flag:
+            return self.item
+        else:
+            return False
+
+
+class VideoProducer(object):
+    """
+    处理视频
+    """
+
+    @classmethod
+    def wx_video_producer(cls, video_obj, user, trace_id):
+        """
+            异步处理微信 video_obj
+            公众号和站内账号一一对应
+            :param trace_id:
+            :param user:
+            :param video_obj:
+            :return:
+        """
+        platform = "weixin_search"
+        publish_time_stamp = int(video_obj['pubTime'])
+        item = VideoItem()
+        item.add_video_info("user_id", user["uid"])
+        item.add_video_info("user_name", user["nick_name"])
+        item.add_video_info("video_id", video_obj['hashDocID'])
+        item.add_video_info("video_title", trace_id)
+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
+        item.add_video_info("video_url", video_obj["videoUrl"])
+        item.add_video_info("cover_url", video_obj["image"])
+        item.add_video_info("out_video_id", video_obj['hashDocID'])
+        item.add_video_info("out_user_id", trace_id)
+        item.add_video_info("platform", platform)
+        item.add_video_info("strategy", "search")
+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
+        mq_obj = item.produce_item()
+        return mq_obj
+
+    @classmethod
+    def baidu_video_producer(cls, video_obj, user, trace_id):
+        """
+        处理好看视频的 video_info
+        :param video_obj:
+        :param user:
+        :param trace_id:
+        :return:
+        """
+        platform = "baidu_search"
+        publish_time_stamp = int(video_obj['publish_time'])
+        item = VideoItem()
+        item.add_video_info("user_id", user["uid"])
+        item.add_video_info("user_name", user["nick_name"])
+        item.add_video_info("video_id", video_obj['id'])
+        item.add_video_info("video_title", trace_id)
+        item.add_video_info("publish_time_stamp", publish_time_stamp)
+        item.add_video_info("video_url", video_obj["playurl"])
+        item.add_video_info("cover_url", video_obj["poster"])
+        item.add_video_info("out_video_id", video_obj['id'])
+        item.add_video_info("out_user_id", trace_id)
+        item.add_video_info("like_cnt", video_obj['like'] if video_obj.get('like') else 0)
+        item.add_video_info("play_cnt", video_obj['playcnt'])
+        item.add_video_info("duration", video_obj['duration'])
+        item.add_video_info("platform", platform)
+        item.add_video_info("strategy", "search")
+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
+        mq_obj = item.produce_item()
+        return mq_obj
+
+    @classmethod
+    def xg_video_producer(cls, video_obj, user, trace_id):
+        """
+        西瓜搜索
+        :param video_obj:
+        :param user:
+        :param trace_id:
+        :return:
+        """
+        platform = "xg_search"
+        publish_time_stamp = int(video_obj['publish_time'])
+        item = VideoItem()
+        item.add_video_info("user_id", user["uid"])
+        item.add_video_info("user_name", user["nick_name"])
+        item.add_video_info("video_id", video_obj['video_id'])
+        item.add_video_info("video_title", trace_id)
+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
+        item.add_video_info("video_url", video_obj["video_url"])
+        item.add_video_info("cover_url", video_obj["cover_url"])
+        item.add_video_info("out_video_id", video_obj['video_id'])
+        item.add_video_info("play_cnt", video_obj['play_cnt'])
+        item.add_video_info("duration", video_obj['duration'])
+        item.add_video_info("like_cnt", video_obj['like_cnt'])
+        item.add_video_info("out_user_id", trace_id)
+        item.add_video_info("platform", platform)
+        item.add_video_info("strategy", "search")
+        item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
+        mq_obj = item.produce_item()
+        return mq_obj
+
+
+async def video_mq_sender(video_obj, user, trace_id, platform):
+    """
+    异步处理微信 video_obj
+    公众号和站内账号一一对应
+    :param platform:
+    :param user:
+    :param trace_id:
+    :param video_obj:
+    :return:
+    """
+    ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
+    Video = VideoProducer()
+    if platform == "xg_search":
+        mq_obj = Video.xg_video_producer(
+            video_obj=video_obj,
+            user=user,
+            trace_id=trace_id,
+        )
+    elif platform == "baidu_search":
+        mq_obj = Video.baidu_video_producer(
+            video_obj=video_obj,
+            user=user,
+            trace_id=trace_id,
+        )
+    elif platform == "wx_search":
+        mq_obj = Video.wx_video_producer(
+            video_obj=video_obj,
+            user=user,
+            trace_id=trace_id,
+        )
+    else:
+        mq_obj = {}
+    ETL_MQ.send_msg(params=mq_obj)
+    logging(
+        code="6002",
+        info="发送消息至 ETL",
+        data=mq_obj,
+        trace_id=trace_id
+    )

+ 37 - 12
applications/match_alg/rank.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 """
 from applications.match_alg.recall import recall_videos
+from applications.functions.log import logging
 
 
 def jac_score(d1, d2):
@@ -29,12 +30,12 @@ def jac_score(d1, d2):
     return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2, d2['video_id']
 
 
-async def best_choice(params_obj, request_param, trace_id):
+async def best_choice(params_obj, trace_id, search_videos):
     """
     计算,返回出最合适的 video_id
     :return: video_id
     """
-    pq_list, search_list = await recall_videos(params=request_param, trace_id=trace_id)
+    pq_list, search_list = await recall_videos(trace_id=trace_id, s_videos=search_videos)
 
     def best_video_id(target_list):
         """
@@ -49,20 +50,44 @@ async def best_choice(params_obj, request_param, trace_id):
             except Exception as e:
                 print(e)
         sorted_list = sorted(score_list, key=lambda x: x[1], reverse=True)
-        return sorted_list[0]
+        return sorted_list[0] if sorted_list else (0, 0)
+
     if search_list:
-        best_search_tuple = best_video_id(search_list)
-        if best_search_tuple[1] > 0:
-            return best_search_tuple[0]
-        else:
-            best_pq_tuple = best_video_id(pq_list)
-            if best_pq_tuple[1] > 0:
-                return best_pq_tuple[0]
-            else:
-                return None
+        logging(
+            code="1003",
+            info="Return Best Search Video",
+            data=search_list,
+            trace_id=trace_id
+        )
+        return search_list[0]
+        # return best_video_id(search_list)[0]
+        # best_search_tuple = best_video_id(search_list)
+        # if best_search_tuple[1] > 0:
+        #     logging(
+        #         code="1003",
+        #         info="search_score---{}".format(best_search_tuple[1]),
+        #         trace_id=trace_id
+        #     )
+        #     return best_search_tuple[0]
+        # else:
+        #     best_pq_tuple = best_video_id(pq_list)
+        #     if best_pq_tuple[1] > 0:
+        #         logging(
+        #             code="1003",
+        #             info="pq_score---{}".format(best_pq_tuple[1]),
+        #             trace_id=trace_id
+        #         )
+        #         return best_pq_tuple[0]
+        #     else:
+        #         return None
     else:
         best_pq_tuple = best_video_id(pq_list)
         if best_pq_tuple[1] > 0:
+            logging(
+                code="1003",
+                info="pq_score---{}".format(best_pq_tuple[1]),
+                trace_id=trace_id
+            )
             return best_pq_tuple[0]
         else:
             return None

+ 27 - 384
applications/match_alg/recall.py

@@ -1,399 +1,42 @@
 """
 @author: luojunhui
 """
-import os
-import json
-import time
-import asyncio
-from concurrent.futures import ThreadPoolExecutor
 
-import httpx
-import requests
+from applications.functions.log import logging
+from applications.functions.common import MySQLServer
 
-from applications.log import logging
-from applications.functions.mysql import select, select_pq_videos
-from applications.functions.ask_kimi import ask_kimi
 
-
-gh_id_dict = {
-    "gh_01f8afd03366": {
-        "uid": 69637520,
-        "nick_name": "非亲非故"
-    },
-    "gh_058e41145a0c": {
-        "uid": 69637476,
-        "nick_name": "甜腻梦话"
-    },
-    "gh_084a485e859a": {
-        "uid": 69637472,
-        "nick_name": "梦星月"
-    },
-    "gh_0921c03402cd": {
-        "uid": 69637531,
-        "nick_name": "你的女友"
-    },
-    "gh_0c89e11f8bf3": {
-        "uid": 69637508,
-        "nick_name": "粟米"
-    },
-    "gh_171cec079b2a": {
-        "uid": 69637501,
-        "nick_name": "海上"
-    },
-    "gh_183d80deffb8": {
-        "uid": 69637491,
-        "nick_name": "论趣"
-    },
-    "gh_1ee2e1b39ccf": {
-        "uid": 69637473,
-        "nick_name": "纵有疾风起"
-    },
-    "gh_234ef02cdee5": {
-        "uid": 69637513,
-        "nick_name": "夹逼"
-    },
-    "gh_26a307578776": {
-        "uid": 69637490,
-        "nick_name": "最宝贝的宝贝"
-    },
-    "gh_29074b51f2b7": {
-        "uid": 69637530,
-        "nick_name": "沉舸"
-    },
-    "gh_2b8c6aa035ae": {
-        "uid": 69637470,
-        "nick_name": "懶得取名"
-    },
-    "gh_34318194fd0e": {
-        "uid": 69637517,
-        "nick_name": "徒四壁"
-    },
-    "gh_3845af6945d0": {
-        "uid": 69637545,
-        "nick_name": "秋水娉婷"
-    },
-    "gh_3ac6d7208961": {
-        "uid": 69637497,
-        "nick_name": "小熊的少女梦"
-    },
-    "gh_3c7d38636846": {
-        "uid": 69637519,
-        "nick_name": "油腻腻"
-    },
-    "gh_3df10391639c": {
-        "uid": 69637541,
-        "nick_name": "六郎娇面"
-    },
-    "gh_40a0ad154478": {
-        "uid": 69637516,
-        "nick_name": "禁止"
-    },
-    "gh_424c8eeabced": {
-        "uid": 69637522,
-        "nick_name": "认命"
-    },
-    "gh_4568b5a7e2fe": {
-        "uid": 69637482,
-        "nick_name": "香腮"
-    },
-    "gh_45beb952dc74": {
-        "uid": 69637488,
-        "nick_name": "毋庸"
-    },
-    "gh_484de412b0ef": {
-        "uid": 69637481,
-        "nick_name": "婪"
-    },
-    "gh_4c058673c07e": {
-        "uid": 69637474,
-        "nick_name": "影帝"
-    },
-    "gh_538f78f9d3aa": {
-        "uid": 69637478,
-        "nick_name": "伤痕"
-    },
-    "gh_56a6765df869": {
-        "uid": 69637514,
-        "nick_name": "风月"
-    },
-    "gh_56ca3dae948c": {
-        "uid": 69637538,
-        "nick_name": "留下太多回忆"
-    },
-    "gh_5e543853d8f0": {
-        "uid": 69637543,
-        "nick_name": "不知春秋"
-    },
-    "gh_5ff48e9fb9ef": {
-        "uid": 69637494,
-        "nick_name": "寻她找他"
-    },
-    "gh_671f460c856c": {
-        "uid": 69637523,
-        "nick_name": "绝不改悔"
-    },
-    "gh_6b7c2a257263": {
-        "uid": 69637528,
-        "nick_name": "奶牙"
-    },
-    "gh_6d205db62f04": {
-        "uid": 69637509,
-        "nick_name": "怕羞"
-    },
-    "gh_6d9f36e3a7be": {
-        "uid": 69637498,
-        "nick_name": "望长安"
-    },
-    "gh_73be0287bb94": {
-        "uid": 69637537,
-        "nick_name": "戏剧"
-    },
-    "gh_744cb16f6e16": {
-        "uid": 69637505,
-        "nick_name": "反駁"
-    },
-    "gh_7b4a5f86d68c": {
-        "uid": 69637477,
-        "nick_name": "我很想你"
-    },
-    "gh_7bca1c99aea0": {
-        "uid": 69637511,
-        "nick_name": "从小就很傲"
-    },
-    "gh_7e5818b2dd83": {
-        "uid": 69637532,
-        "nick_name": "二八佳人"
-    },
-    "gh_89ef4798d3ea": {
-        "uid": 69637533,
-        "nick_name": "彼岸花"
-    },
-    "gh_901b0d722749": {
-        "uid": 69637518,
-        "nick_name": "深情不为我"
-    },
-    "gh_9161517e5676": {
-        "uid": 69637495,
-        "nick_name": "折磨"
-    },
-    "gh_93e00e187787": {
-        "uid": 69637504,
-        "nick_name": "理会"
-    },
-    "gh_9877c8541764": {
-        "uid": 69637506,
-        "nick_name": "我沿着悲伤"
-    },
-    "gh_9cf3b7ff486b": {
-        "uid": 69637492,
-        "nick_name": "hoit"
-    },
-    "gh_9e559b3b94ca": {
-        "uid": 69637471,
-        "nick_name": "我与你相遇"
-    },
-    "gh_9f8dc5b0c74e": {
-        "uid": 69637496,
-        "nick_name": "港口"
-    },
-    "gh_a182cfc94dad": {
-        "uid": 69637539,
-        "nick_name": "四海八荒"
-    },
-    "gh_a2901d34f75b": {
-        "uid": 69637535,
-        "nick_name": "听腻了谎话"
-    },
-    "gh_a307072c04b9": {
-        "uid": 69637521,
-        "nick_name": "踏步"
-    },
-    "gh_a6351b447819": {
-        "uid": 69637540,
-        "nick_name": "七猫酒馆"
-    },
-    "gh_ac43e43b253b": {
-        "uid": 69637499,
-        "nick_name": "一厢情愿"
-    },
-    "gh_adca24a8f429": {
-        "uid": 69637483,
-        "nick_name": "对你何止一句喜欢"
-    },
-    "gh_b15de7c99912": {
-        "uid": 69637536,
-        "nick_name": "糖炒板栗"
-    },
-    "gh_b32125c73861": {
-        "uid": 69637493,
-        "nick_name": "发尾"
-    },
-    "gh_b3ffc1ca3a04": {
-        "uid": 69637546,
-        "nick_name": "主宰你心"
-    },
-    "gh_b8baac4296cb": {
-        "uid": 69637489,
-        "nick_name": "生性"
-    },
-    "gh_b9b99173ff8a": {
-        "uid": 69637524,
-        "nick_name": "养一只月亮"
-    },
-    "gh_bd57b6978e06": {
-        "uid": 69637527,
-        "nick_name": "厌遇"
-    },
-    "gh_be8c29139989": {
-        "uid": 69637502,
-        "nick_name": "不负"
-    },
-    "gh_bfe5b705324a": {
-        "uid": 69637529,
-        "nick_name": "乐极"
-    },
-    "gh_bff0bcb0694a": {
-        "uid": 69637534,
-        "nick_name": "简迷离"
-    },
-    "gh_c69776baf2cd": {
-        "uid": 69637512,
-        "nick_name": "骄纵"
-    },
-    "gh_c91b42649690": {
-        "uid": 69637503,
-        "nick_name": "荟萃"
-    },
-    "gh_d2cc901deca7": {
-        "uid": 69637487,
-        "nick_name": "恶意调笑"
-    },
-    "gh_d5f935d0d1f2": {
-        "uid": 69637500,
-        "nick_name": "青少年哪吒"
-    },
-    "gh_da76772d8d15": {
-        "uid": 69637526,
-        "nick_name": "独揽风月"
-    },
-    "gh_de9f9ebc976b": {
-        "uid": 69637475,
-        "nick_name": "剑出鞘恩怨了"
-    },
-    "gh_e0eb490115f5": {
-        "uid": 69637486,
-        "nick_name": "赋别"
-    },
-    "gh_e24da99dc899": {
-        "uid": 69637484,
-        "nick_name": "恋雨夏季"
-    },
-    "gh_e2576b7181c6": {
-        "uid": 69637515,
-        "nick_name": "满天星"
-    },
-    "gh_e75dbdc73d80": {
-        "uid": 69637542,
-        "nick_name": "情战"
-    },
-    "gh_e9d819f9e147": {
-        "uid": 69637525,
-        "nick_name": "与卿"
-    },
-    "gh_efaf7da157f5": {
-        "uid": 69637547,
-        "nick_name": "心野性子浪"
-    },
-    "gh_f4594783f5b8": {
-        "uid": 69637544,
-        "nick_name": "自缚"
-    },
-    "gh_fe6ef3a65a48": {
-        "uid": 69637480,
-        "nick_name": "风间"
-    }
-}
-
-
-def ask_kimi_and_save_to_local(info_tuple):
-    """
-    save file to local
-    :return:
-    """
-    title, trace_id, save_path = info_tuple[0], info_tuple[1], info_tuple[2]
-    if os.path.exists(save_path):
-        logging(
-            code="1002",
-            info="该 video 信息已经挖掘完成---{}".format(title),
-            function="ask_kimi_and_save_to_local",
-            trace_id=trace_id,
-        )
-    else:
-        os.makedirs(os.path.dirname(save_path), exist_ok=True)
-        if not title:
-            result = {}
-        else:
-            result = ask_kimi(title)
-        logging(
-            code="1002",
-            info="kimi-result",
-            data=result,
-            trace_id=trace_id,
-            function="ask_kimi_and_save_to_local"
-        )
-        with open(save_path, "w", encoding="utf-8") as f:
-            f.write(json.dumps(result, ensure_ascii=False))
-
-
-async def recall_videos(params, trace_id):
+async def recall_videos(trace_id, s_videos):
     """
     通过请求的数据来召回视频
+    :param s_videos:
     :param trace_id:
-    :param params: 请求参数
     :return: file_list
     """
-    title = params['title']
+    # title = params['title']
     # content = params['content']
-    ghId = params['ghId']
-    user_id = gh_id_dict[ghId]['uid']
-
-    # 在外面搜索视频
-    # payload = {
-    #     "ghId": ghId,
-    #     "search_keys": [title],
-    #     "trace_id": trace_id
-    # }
-    # # print(payload)
-    # url = "http://61.48.133.26:8111/search_videos"
-    # requests.post(url, json=payload)
-    # # print("请求完成")
-    # await asyncio.sleep(15)
-    # select_sql = "select video_id, video_title from crawler_video where platform='weixin_search' and user_id = '{}' order by update_time DESC limit 10".format(
-    #     user_id)
-    # out_video_list = select(sql=select_sql)
-    # dir_path = os.path.join(os.getcwd(), 'applications', 'static', "out_videos")
-    # os.makedirs(os.path.dirname(dir_path), exist_ok=True)
-    # done_list = os.listdir(dir_path)
-    # process_list = [
-    #     (
-    #         i[1],
-    #         trace_id,
-    #         os.path.join(dir_path, "{}.json".format(i[0]))
-    #     ) for i in out_video_list if not "{}.json".format(i[0]) in done_list
-    # ]
-    # with ThreadPoolExecutor(max_workers=10) as pool:
-    #     pool.map(ask_kimi_and_save_to_local, process_list)
+    # ghId = params['ghId']
+    # user_id = gh_id_dict[ghId]['uid']
 
     # 在两边召回视频
     # pq_videos
-    recall_video_list = select_pq_videos()
-    dirs_1 = os.path.join(os.getcwd(), 'applications', 'static', 'out_videos')
-    file_list = [os.path.join(dirs_1, file) for file in os.listdir(dirs_1) if file.endswith(".json")]
-    search_list = []
-    for file in file_list:
-        with open(file, encoding="utf-8") as f:
-            obj = json.loads(f.read())
-            if obj:
-                obj['video_id'] = file.split("/")[-1].replace('.json', '')
-        search_list.append(obj)
-    return recall_video_list, search_list
+    recall_video_list = MySQLServer().select_pq_videos()
+    # dirs_1 = os.path.join(os.getcwd(), 'applications', 'static', 'out_videos')
+    # file_list = [os.path.join(dirs_1, "{}.json".format(vid)) for vid in s_videos]
+    # search_list = []
+    # for file in file_list:
+    #     with open(file, encoding="utf-8") as f:
+    #         obj = json.loads(f.read())
+    #         if obj:
+    #             obj['video_id'] = file.split("/")[-1].replace('.json', '')
+    #     search_list.append(obj)
+    logging(
+        code="1002",
+        info="召回视频",
+        data={
+            "pq_list": [],
+            "search_list": s_videos
+        },
+        trace_id=trace_id
+    )
+    return recall_video_list, s_videos

+ 77 - 24
applications/routes.py

@@ -1,20 +1,22 @@
 """
 @author: luojunhui
 """
-import time
 import json
+import time
 import uuid
-from quart import Blueprint, jsonify, request, websocket
+from quart import Blueprint, jsonify, request
 
-from applications.log import logging
-from applications.process import ProcessParams
-from applications.mq import MQ
+from applications.functions.log import logging
+from applications.schedule import ProcessParams, search_videos
+from applications.functions.common import MySQLServer
+from applications.functions.kimi import KimiServer
+from applications.schedule.main_schedule import AskForInfo
 
 my_blueprint = Blueprint('kimi', __name__)
 
 
 @my_blueprint.route('/healthcheck')
-async def hello():
+def hello():
     """
     Hello World Test
     :return:
@@ -27,13 +29,73 @@ async def hello():
     return jsonify({'message': 'Hello, World!'})
 
 
-@my_blueprint.route('/title_to_video', methods=['POST'])
+@my_blueprint.route('/title_to_search', methods=['POST'])
+async def search_videos_from_the_web():
+    """
+    从web 搜索视频并且存储到票圈的视频库中
+    :return:
+    """
+    params = await request.get_json()
+    K = KimiServer()
+    gh_id = params['ghId']
+    trace_id = "search-{}-{}".format(str(uuid.uuid4()), str(int(time.time())))
+    params['trace_id'] = trace_id
+    logging(
+        code="2000",
+        info="搜索视频内容接口请求成功",
+        port="title_to_search",
+        function="search_videos_from_the_web",
+        trace_id=trace_id
+    )
+    # try:
+    kimi_info = await K.search_kimi_schedule(params=params)
+    await search_videos(
+        kimi_info=kimi_info,
+        trace_id=trace_id,
+        gh_id=gh_id
+    )
+    print(json.dumps(kimi_info, ensure_ascii=False, indent=4))
+    res = {
+        "trace_id": trace_id,
+        "code": 0,
+        "kimi_title": kimi_info['k_title']
+    }
+    # except Exception as e:
+    #     res = {
+    #         "trace_id": trace_id,
+    #         "code": 1,
+    #         "message": str(e)
+    #     }
+    return jsonify(res)
+
+
+@my_blueprint.route('/out_videos', methods=['POST'])
+async def find_in_mysql():
+    """
+    搜索是否存在外站视频 video_list, 如果存在,则返回成功
+    :return:
+    """
+    data = await request.get_json()
+    trace_id = data['traceId']
+    logging(
+        code="2000",
+        info="请求接口成功",
+        port="title_to_video",
+        trace_id=trace_id,
+        function="find_in_mysql"
+    )
+    res = MySQLServer().select_download_videos(trace_id=trace_id)
+    return jsonify(res)
+
+
+@my_blueprint.route('/find_video', methods=['POST'])
 async def post_data():
     """
     请求接口代码
     :return:
     """
-    trace_id = str(uuid.uuid4()) + "-" + str(int(time.time()))
+    data = await request.get_json()
+    trace_id = data['traceId']
     logging(
         code="1001",
         info="请求接口成功",
@@ -41,27 +103,18 @@ async def post_data():
         trace_id=trace_id
     )
     p = ProcessParams(t_id=trace_id)
-    data = await request.get_json()
     processed_data = await p.deal(data)
     return jsonify(processed_data)
 
 
-@my_blueprint.route('/search_videos', methods=['POST'])
-async def search_data():
+@my_blueprint.route('/title_to_video', methods=['POST'])
+async def delay_response():
     """
-    通过搜索词去搜索获取视频信息
+    main
     :return:
     """
-    mq = MQ(topic_name="search_spider_prod")
-    trace_id = "search-{}-{}".format(str(uuid.uuid4()), str(int(time.time())))
-    logging(
-        code="1001",
-        info="请求接口成功",
-        port="search_videos",
-        trace_id=trace_id
-    )
+    # 从请求体中解析 JSON 数据
     data = await request.get_json()
-    mq.send_msg(params=data)
-    return jsonify({
-        "code": 0
-    })
+    A = AskForInfo(data)
+    res = await A.schedule()
+    return jsonify(res)

+ 5 - 0
applications/schedule/__init__.py

@@ -0,0 +1,5 @@
+"""
+@author: luojunhui
+"""
+from .process_schedule import ProcessParams
+from .search_schedule import search_videos

+ 100 - 0
applications/schedule/main_schedule.py

@@ -0,0 +1,100 @@
+"""
+@author: luojunhui
+"""
+# encoding: utf-8
+"""
+@author: luojunhui
+"""
+import time
+import asyncio
+import requests
+
+
+class AskForInfo:
+    """
+    Ask user to enter their info
+    """
+    def __init__(self, params):
+        self.params = params
+        self.base_url = "http://localhost:8111"
+
+    async def search_request(self):
+        """
+        请求下载外部视频
+        :return:
+        """
+        url = "{}/title_to_search".format(self.base_url)
+        body = {
+            "title": self.params["title"],
+            "content": self.params['content'],
+            "ghId": self.params["ghId"]
+        }
+        res = requests.post(url, json=body, timeout=120)
+        return res.json()
+
+    async def check_out_videos(self, trace_id):
+        """
+        :return:
+        """
+        url = "{}/out_videos".format(self.base_url)
+        body = {
+            "traceId": trace_id
+        }
+        res = requests.post(url, json=body, timeout=120)
+        return res.json()
+
+    async def ask_for_info(self, res_obj, kt):
+        """
+        :param kt:
+        :param res_obj:
+        :return:
+        """
+        traceId = res_obj["trace_id"]
+        video_list = res_obj["video_list"]
+        url = "{}/find_video".format(self.base_url)
+        body = {
+            "accountName": self.params['accountName'],
+            "content": self.params['content'],
+            "cover": self.params['cover'],
+            "ghId": self.params['ghId'],
+            "title": self.params['title'],
+            "traceId": traceId,
+            "kimi_title": kt,
+            "videoList": video_list
+        }
+        res = requests.post(url, json=body, timeout=120)
+        return res.json()
+
+    async def schedule(self):
+        """
+
+        :return:
+        """
+        a = time.time()
+        res = await self.search_request()
+        b = time.time()
+        print("search_time")
+        print(b - a)
+        if res['code'] == 0:
+            trace_id = res["trace_id"]
+            kimi_title = res['kimi_title']
+            c = time.time()
+            res_obj = await self.check_out_videos(trace_id=trace_id)
+            d = time.time()
+            print("recall time")
+            print(d - c)
+            time.sleep(2)
+            final_obj = await self.ask_for_info(res_obj=res_obj, kt=kimi_title)
+            e = time.time()
+            print("Rank and Return Time")
+            print(e - d - 2)
+            return final_obj
+        elif res['code'] == 1:
+            return {
+                "Error": res['message']
+            }
+        else:
+            return {
+                "Error": "Unknown Error"
+            }
+

+ 14 - 52
applications/process.py → applications/schedule/process_schedule.py

@@ -2,13 +2,12 @@
 @author: luojunhui
 对请求进行操作
 """
-
+import json
 import os
 
-from applications.log import logging
-from applications.functions.ask_kimi import ask_kimi
 from applications.match_alg import best_choice
-from applications.functions.common import *
+from applications.functions.common import Functions
+from applications.functions.log import logging
 
 
 class ProcessParams(object):
@@ -37,54 +36,16 @@ class ProcessParams(object):
         )
         return data
 
-    def ask_kimi_and_save_to_local(self, title):
-        """
-        save file to local
-        :param title:
-        :return:
-        """
-        save_path = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
-        if os.path.exists(save_path):
-            logging(
-                code="1002",
-                info="该 video 信息已经挖掘完成---{}".format(title),
-                function="ask_kimi_and_save_to_local",
-                trace_id=self.trace_id,
-            )
-            return
-        else:
-            os.makedirs(os.path.dirname(save_path), exist_ok=True)
-            if not title:
-                result = {}
-            else:
-                result = ask_kimi(title)
-            logging(
-                code="1002",
-                info="kimi-result",
-                data=result,
-                trace_id=self.trace_id,
-                function="ask_kimi_and_save_to_local"
-            )
-            with open(save_path, "w", encoding="utf-8") as f:
-                f.write(json.dumps(result, ensure_ascii=False))
-
     async def deal(self, data):
         """执行代码"""
         params = self.get_params(data)
-        title = params['title']
+        title = params['title'].split("@@")[-1]
+        kimi_title = params['kimi_title']
         # account_name = params['accountName']
         # ghId = params['ghId']
+        video_list = params['videoList']
 
         title_p = os.path.join(os.getcwd(), 'applications', 'static', "titles", "{}.json".format(title))
-        if os.path.exists(title_p):
-            logging(
-                code="1002",
-                info="该标题已经被 kimi 处理过,跳过请求 kimi 操作--- {}".format(title),
-                function="process",
-                trace_id=self.trace_id
-            )
-        else:
-            self.ask_kimi_and_save_to_local(title)
 
         with open(title_p, encoding="utf-8") as f:
             params_obj = json.loads(f.read())
@@ -92,7 +53,7 @@ class ProcessParams(object):
         best_video_id = await best_choice(
             params_obj=params_obj,
             trace_id=self.trace_id,
-            request_param=params
+            search_videos=video_list
         )
         logging(
             code="1002",
@@ -102,17 +63,18 @@ class ProcessParams(object):
         )
 
         if best_video_id:
-            print(best_video_id)
-            response = request_for_info(best_video_id)
+            print("best video id", best_video_id)
+            response = Functions().request_for_info(best_video_id)
             productionCover = response['data'][0]['shareImgPath']
-            productionName = response["data"][0]['title']
+            # productionName = response["data"][0]['title']
+            productionName = kimi_title
             videoUrl = response['data'][0]['videoPath']
             user_id = response['data'][0]['user']['uid']
             programAvatar = "/static/logo.png"
-            programId = "wx69c36def517d687a"
-            programName = "票圈最惊奇"
+            programId = "wx0b7d95eb293b783b"
+            programName = "天天美好祝福好生活"
             source = "Web"
-            root_share_id, productionPath = create_gzh_path(video_id=best_video_id, shared_uid=user_id)
+            root_share_id, productionPath = Functions().create_gzh_path(video_id=best_video_id, shared_uid=user_id)
             logging(
                 code="1002",
                 info="root_share_id --{}, productionPath -- {}".format(root_share_id, productionPath),

+ 234 - 0
applications/schedule/search_schedule.py

@@ -0,0 +1,234 @@
+"""
+@author: luojunhui
+调用接口在微信内搜索视频
+"""
+
+from applications.search import *
+from applications.static.config import gh_id_dict, ab_test_config
+from applications.functions.log import logging
+from applications.functions.video_item import video_mq_sender
+
+
+class SearchABTest(object):
+    """
+    Search-strategy A/B experiment variants.
+
+    Every variant is a classmethod that reads its query material from the
+    class attributes below.
+
+    NOTE(review): the state lives on the class, not on the instance, so
+    creating a new SearchABTest overwrites the values seen through every
+    other instance. Confirm this is safe for the way callers schedule work.
+    """
+
+    ori_title = None
+    article_summary = None
+    article_keys = None
+    gh_id = None
+    trace_id = None
+
+    def __init__(self, info, gh_id):
+        SearchABTest.set_class_properties(info, gh_id)
+
+    @classmethod
+    def set_class_properties(cls, info, gh_id):
+        """
+        Store the experiment inputs on the class.
+        :param info: dict with the keys "ori_title", "content_title",
+                     "content_keys" and "trace_id" (basic info mined by kimi)
+        :param gh_id: official-account id
+        :return:
+        """
+        cls.ori_title = info["ori_title"]
+        cls.article_summary = info["content_title"]
+        cls.article_keys = info["content_keys"]
+        cls.trace_id = info["trace_id"]
+        cls.gh_id = gh_id
+
+    @classmethod
+    def _cascade_search(cls, query):
+        """
+        Try WeChat, then Haokan (baidu), then Xigua with the same query and
+        return the first hit of the first platform that yields results.
+        :param query: search string
+        :return: {"platform": ..., "result": first video} or None when all
+                 three platforms come back empty
+        """
+        wx_result = wx_search(keys=query)
+        if wx_result:
+            return {"platform": "wx_search", "result": wx_result[0]}
+        logging(
+            code="7001",
+            info="通过微信搜索失败---{}".format(query),
+            trace_id=cls.trace_id,
+        )
+
+        # WeChat found nothing: fall back to Haokan video search.
+        baidu_result = hksp_search(key=query)
+        if baidu_result:
+            return {"platform": "baidu_search", "result": baidu_result[0]}
+        logging(
+            code="7001",
+            info="通过baidu搜索失败---{}".format(query),
+            trace_id=cls.trace_id,
+        )
+
+        # Haokan found nothing either: fall back to Xigua search.
+        xigua_result = xigua_search(keyword=query)
+        if xigua_result:
+            return {"platform": "xg_search", "result": xigua_result[0]}
+        logging(
+            code="7001",
+            info="通过西瓜搜索失败---{}".format(query),
+            trace_id=cls.trace_id,
+        )
+        return None
+
+    @classmethod
+    def ab_0(cls):
+        """
+        Default search logic: cascade search with the original title.
+        :return: see _cascade_search
+        """
+        return cls._cascade_search(cls.ori_title)
+
+    @classmethod
+    def ab_1(cls):
+        """
+        Cascade search with the article summary.
+        :return: see _cascade_search
+        """
+        return cls._cascade_search(cls.article_summary)
+
+    @classmethod
+    def ab_2(cls):
+        """
+        Cascade search with the comma-joined article keywords.
+        :return: see _cascade_search
+        """
+        return cls._cascade_search(",".join(cls.article_keys))
+
+    @classmethod
+    def ab_3(cls):
+        """
+        article_summary + baidu
+        :return: dict whose "result" is the first hit, or [] when nothing
+                 was found
+        """
+        result = hksp_search(key=cls.article_summary)
+        return {"platform": "baidu_search", "result": result[0] if result else []}
+
+    @classmethod
+    def ab_4(cls):
+        """
+        article_summary + weixin
+        :return: dict whose "result" is the first hit, or [] when nothing
+                 was found
+        """
+        result = wx_search(keys=cls.article_summary)
+        return {"platform": "wx_search", "result": result[0] if result else []}
+
+    @classmethod
+    def ab_5(cls):
+        """
+        article_keys + weixin
+        :return: dict whose "result" is the first hit, or [] when nothing
+                 was found
+        """
+        result = wx_search(keys=",".join(cls.article_keys))
+        return {"platform": "wx_search", "result": result[0] if result else []}
+
+    @classmethod
+    def ab_6(cls):
+        """
+        article_keys + baidu
+        :return: dict whose "result" is the first hit, or [] when nothing
+                 was found
+        """
+        result = hksp_search(key=",".join(cls.article_keys))
+        return {"platform": "baidu_search", "result": result[0] if result else []}
+
+
+async def search_videos(kimi_info, trace_id, gh_id):
+    """
+    Search for a video with the account's experiment strategy and send the
+    hit to ETL.
+    :param kimi_info: dict mined by kimi; "trace_id" is written into it here
+    :param gh_id: account id, selects the experiment strategy
+    :param trace_id: request trace id used for logging
+    :return: None
+    """
+    kimi_info["trace_id"] = trace_id
+    SearchAB = SearchABTest(info=kimi_info, gh_id=gh_id)
+    # Only strategies 0-2 are enabled; ab_3..ab_6 exist on the class but are
+    # deliberately not dispatched here.
+    strategies = {
+        0: SearchAB.ab_0,
+        1: SearchAB.ab_1,
+        2: SearchAB.ab_2,
+    }
+    # Accounts that are missing from the config (or configured with 0) use
+    # the default strategy 0. Any other unknown id yields no result.
+    test_id = ab_test_config.get(gh_id) or 0
+    strategy = strategies.get(test_id)
+    recall_obj = strategy() if strategy else {}
+
+    recall_video = recall_obj["result"] if recall_obj else None
+    if not recall_video:
+        # Covers both "no strategy result" and "strategy returned an empty hit".
+        logging(code="7003", info="视频搜索失败", trace_id=trace_id)
+        return
+
+    platform = recall_obj["platform"]
+    logging(
+        code="7002",
+        info="视频搜索成功, 搜索平台为--{}".format(platform),
+        trace_id=trace_id,
+        data=recall_video,
+    )
+    # NOTE(review): gh_id_dict.get returns None for unknown accounts, so
+    # `user` may be None here; confirm video_mq_sender tolerates that.
+    await video_mq_sender(
+        video_obj=recall_video,
+        user=gh_id_dict.get(gh_id),
+        trace_id=trace_id,
+        platform=platform,
+    )

+ 6 - 0
applications/search/__init__.py

@@ -0,0 +1,6 @@
+"""
+@author: luojunhui
+"""
+from .hksp_search import hksp_search
+from .weixin_search import wx_search
+from .xigua_search import xigua_search

+ 111 - 0
applications/search/hksp_search.py

@@ -0,0 +1,111 @@
+"""
+@author: luojunhui
+好看视频搜索爬虫
+"""
+import json
+import time
+import base64
+import hashlib
+import requests
+import urllib.parse
+from uuid import uuid4
+from fake_useragent import FakeUserAgent
+
+from applications.functions.common import MySQLServer
+
+
+def get_video_detail(video_id):
+    """
+    Fetch the metadata of one Haokan video.
+    :param video_id: Haokan video id, sent as the `vid` query parameter
+    :return: the `data.apiData.curVideoMeta` object of the JSON response
+    """
+    query = {
+        'vid': video_id,
+        '_format': 'json'
+    }
+    # A fresh random BIDUPSID cookie is generated for every request.
+    cookie_seed = base64.b64encode(str(uuid4()).encode()).decode()
+    request_headers = {
+        'Accept': '*/*',
+        'cookie': "BIDUPSID={}".format(cookie_seed),
+        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Content-Type': 'application/x-www-form-urlencoded',
+        'Referer': 'https://haokan.baidu.com',
+        'User-Agent': FakeUserAgent().chrome,
+    }
+    payload = requests.get(
+        "https://haokan.baidu.com/v",
+        headers=request_headers,
+        params=query
+    ).json()
+    meta = payload['data']['apiData']['curVideoMeta']
+    return meta
+
+
+def hksp_search(key):
+    """
+    Haokan video search crawler.
+    :param key: search keyword
+    :return: list of video metadata objects (as returned by
+             get_video_detail) for hits that contain no sensitive word in
+             the title and are at most 300 seconds long; [] when the
+             response has no usable result list
+    """
+    sensitive_words = MySQLServer().select_sensitive_words()
+
+    def sensitive_flag(s_words, ori_title):
+        """
+        :param ori_title: title to check
+        :param s_words: iterable of sensitive words
+        :return: True when the title contains none of the words
+        """
+        return not any(word in ori_title for word in s_words)
+
+    def duration_seconds(duration_str):
+        """
+        Convert "mm:ss" or "hh:mm:ss" into seconds.
+        Raises ValueError for any other layout so the item is skipped.
+        """
+        parts = duration_str.split(":")
+        if len(parts) not in (2, 3):
+            raise ValueError("unexpected duration {}".format(duration_str))
+        seconds = 0
+        for part in parts:
+            seconds = seconds * 60 + int(part)
+        return seconds
+
+    timestamp_seconds = time.time()
+    timestamp_milliseconds = int(timestamp_seconds * 1000)
+    url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
+    # The sign is the md5 of "pn_query_rn_timestamp_version" with the query
+    # percent-encoded; the values must match the request parameters below.
+    strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1)
+    sign = hashlib.md5(strings.encode()).hexdigest()
+    params = {
+        'pn': 1,
+        'rn': 10,
+        'type': 'video',
+        'query': key,
+        'sign': sign,
+        'version': 1,
+        'timestamp': timestamp_milliseconds
+    }
+    # Request headers, with a fresh random BIDUPSID cookie per call.
+    base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
+    headers = {
+        'authority': 'haokan.baidu.com',
+        'accept': '*/*',
+        'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
+        'cookie': "BIDUPSID={}".format(base_64_string),
+        'user-agent': FakeUserAgent().chrome,
+        'x-requested-with': 'xmlhttprequest',
+    }
+    # Send the GET request.
+    response = requests.get(url, headers=headers, params=params).json()
+    try:
+        data_list = response['data']['list']
+        videos = []
+        for data in data_list:
+            try:
+                video_id = data['vid']
+                title = data['title']
+                # Previously only the first two fields were read, so an
+                # "h:mm:ss" duration was mistaken for a short clip.
+                duration = duration_seconds(data['duration'])
+                if sensitive_flag(sensitive_words, title) and duration <= 300:
+                    videos.append(get_video_detail(video_id))
+            except Exception as e:
+                # Best effort: one malformed or unreachable item must not
+                # discard the remaining hits.
+                print(e)
+        return videos
+    except (KeyError, TypeError):
+        # Response without the expected data.list structure.
+        return []

+ 58 - 0
applications/search/weixin_search.py

@@ -0,0 +1,58 @@
+"""
+@author: luojunhui
+"""
+import json
+import requests
+
+from applications.functions.common import MySQLServer
+
+
+def wx_search(keys):
+    """
+    WeChat video search through the crawler service.
+    :param keys: search keyword string
+    :return: list of video-info dicts whose title contains no sensitive
+             word and whose duration is at most 300 seconds
+    """
+
+    sensitive_words = MySQLServer().select_sensitive_words()
+
+    def sensitive_flag(s_words, ori_title):
+        """
+        :param ori_title: title to check
+        :param s_words: iterable of sensitive words
+        :return: True when the title contains none of the words
+        """
+        return not any(word in ori_title for word in s_words)
+
+    def duration_seconds(duration_str):
+        """
+        Convert "mm:ss" or "hh:mm:ss" into seconds.
+        Raises ValueError for any other layout so the item is skipped.
+        """
+        parts = duration_str.split(":")
+        if len(parts) not in (2, 3):
+            raise ValueError("unexpected duration {}".format(duration_str))
+        seconds = 0
+        for part in parts:
+            seconds = seconds * 60 + int(part.strip())
+        return seconds
+
+    url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
+    payload = json.dumps({
+        "keyword": keys,
+        "cursor": "0",
+        "content_type": "video"
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+    response = requests.request("POST", url, headers=headers, data=payload).json()
+    if response['msg'] == '未知错误':
+        return []
+
+    videos = []
+    if response['data']:
+        video_list = response['data']['data']
+        for video in video_list:
+            try:
+                video_info = video['items'][0]
+                title = video_info['title']
+                # The minutes field used to be added to the seconds without
+                # the * 60 factor, so e.g. "10:00" passed the 300 s limit.
+                dr = duration_seconds(video_info['duration'])
+            except (KeyError, IndexError, TypeError, ValueError, AttributeError):
+                # Malformed entry: skip it and keep the remaining hits.
+                continue
+            if sensitive_flag(sensitive_words, title) and dr <= 300:
+                videos.append(video_info)
+    return videos

+ 235 - 0
applications/search/xigua_search.py

@@ -0,0 +1,235 @@
+"""
+@author: luojunhui
+西瓜视频搜索爬虫
+"""
+import re
+import json
+import base64
+
+import requests
+import urllib.parse
+
+from lxml import etree
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import unpad
+from fake_useragent import FakeUserAgent
+
+from applications.functions.common import MySQLServer
+
+
+class XiGuaFunctions(object):
+    """
+    Helpers for fetching and decoding a single Xigua (ixigua.com) video page.
+
+    All methods are classmethods; the class holds no state.
+    """
+
+    @classmethod
+    def tunnel_proxies(cls):
+        """
+        Build a `proxies` mapping for the kuaidaili tunnel proxy.
+        :return: dict with "http" and "https" keys that both point at the
+                 same authenticated tunnel URL
+        """
+        # NOTE(review): proxy credentials are hard-coded in source; consider
+        # moving them to configuration. This method is not called by any
+        # other code visible in this module.
+        tunnel = "q796.kdltps.com:15818"
+        username = "t17772369458618"
+        password = "5zqcjkmy"
+        proxies = {
+            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
+            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
+        }
+        return proxies
+
+    @classmethod
+    def byte_dance_cookie(cls, item_id):
+        """
+        Obtain a cookie value for Xigua video requests.
+        :param item_id: video item id, only used to build the referer header
+        :return: the first cookie value set by the ttwid register response
+        """
+        sess = requests.Session()
+        sess.headers.update({
+            'user-agent': FakeUserAgent().chrome,
+            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
+        })
+
+        # Warm up the session, then register to receive the cookies.
+        sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc')
+        data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
+        r = sess.post('https://ttwid.bytedance.com/ttwid/union/register/', data=data)
+        # The redirect URL is fetched with a plain request (not the session)
+        # and its response is discarded.
+        if r.json()['redirect_url']:
+            requests.get(
+                url=r.json()['redirect_url']
+            )
+        return r.cookies.values()[0]
+
+    @classmethod
+    def aes_decrypt(cls, data, key):
+        """
+        Decrypt a Xigua video URL (AES-CBC).
+        :param data: base64-encoded ciphertext string
+        :param key: key string; its first 16 encoded bytes are also the IV
+        :return: decrypted text, or None when any step fails
+        """
+        password = key.encode()
+        iv = password[:16]
+        try:
+            ct = base64.b64decode(data.encode())
+            cipher = AES.new(password, AES.MODE_CBC, iv)
+            pt = unpad(cipher.decrypt(ct), AES.block_size)
+            # The plaintext is itself base64-encoded.
+            return base64.b64decode(pt).decode()
+        except Exception as e:
+            print("Incorrect decryption {}".format(e))
+            return None
+
+    @classmethod
+    def extract_video_url(cls, text):
+        """
+        Extract the video URL and basic statistics from a video page.
+        :param text: HTML of the video page
+        :return: dict with video_url, cover_url, video_id, video_title,
+                 like_cnt, play_cnt, publish_time and duration
+        """
+        HTML = etree.HTML(text)
+        str_2 = HTML.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
+        # Keep only the outermost {...} of the script body.
+        json_2 = str_2[str_2.find('{'):str_2.rfind('}') + 1]
+        Irregulars = ['null', 'undefined', '=false', '=true', 'false', 'true']
+        # Textually patch tokens before parsing: "=false"/"=true" become
+        # "=False"/"=True", every other listed token becomes the number 12.
+        # NOTE(review): str.replace runs over the whole text, so matching
+        # substrings inside string values are rewritten as well.
+        for I in Irregulars:
+            if I in ['=false', '=true']:
+                json_2 = json_2.replace(I, '=' + I[1:].capitalize())
+            else:
+                json_2 = json_2.replace(I, '12')
+        dict_2 = json.loads(json_2)["anyVideo"]["gidInformation"]["packerData"]["video"]
+        duration = dict_2["video_duration"]
+        play_cnt = dict_2['video_watch_count']
+        publish_time = int(dict_2['video_publish_time'])
+        like_cnt = dict_2['video_like_count']
+        video_title = dict_2['title']
+        video_id = dict_2['vid']
+        video_res = dict_2['videoResource']
+        cover_url = dict_2['poster_url'].replace("\\u002F", "/")
+        # 12 is the placeholder written by the patching loop above, so this
+        # branch is taken when "dash" was one of the replaced tokens.
+        if video_res['dash'] == 12:
+            obj = video_res['normal']
+            ptk = obj['ptk']
+            video_list = obj['video_list']
+            keys = list(video_list.keys())
+            # Use the last entry of video_list.
+            main_url = video_list[keys[-1]]['main_url']
+            real_video_url = cls.aes_decrypt(data=main_url, key=ptk)
+        else:
+            obj = video_res['dash']
+            ptk = obj["ptk"]
+            video_url = obj['dynamic_video']['main_url']
+            real_video_url = cls.aes_decrypt(data=video_url, key=ptk)
+        return {
+            "video_url": real_video_url,
+            "cover_url": cover_url,
+            "video_id": video_id,
+            "video_title": video_title,
+            "like_cnt": like_cnt,
+            "play_cnt": play_cnt,
+            "publish_time": publish_time,
+            "duration": duration
+        }
+
+    @classmethod
+    def extract_info_by_re(cls, text):
+        """
+        Extract the video info and replace its title with the one found in
+        the page's <title> tag (via regular expression).
+        :param text: HTML of the video page
+        :return: the dict from extract_video_url with "video_title"
+                 overwritten ("" when no <title> tag matches)
+        """
+        result = cls.extract_video_url(text)
+        # Title: the part of <title> before the first " - ".
+        title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
+        if title_match:
+            title_content = title_match.group(1)
+            title_content = title_content.split(" - ")[0]
+            try:
+                # Re-decode text that was read as latin1; keep it unchanged
+                # when that fails.
+                title_content = bytes(title_content, "latin1").decode()
+            except:
+                title_content = title_content
+        else:
+            title_content = ""
+        result['video_title'] = title_content
+        return result
+
+    @classmethod
+    def get_video_info(cls, item_id):
+        """
+        Download a video page and extract its info.
+        :param item_id: video item id, appended to https://www.ixigua.com/
+        :return: dict produced by extract_info_by_re
+        """
+        url = "https://www.ixigua.com/{}".format(item_id)
+        headers = {
+            "accept-encoding": "gzip, deflate",
+            "accept-language": "zh-CN,zh-Hans;q=0.9",
+            "cookie": "ttwid={}".format(cls.byte_dance_cookie(item_id)),
+            "user-agent": FakeUserAgent().random,
+            "referer": "https://www.ixigua.com/{}/".format(item_id),
+        }
+        response = requests.get(
+            url=url,
+            headers=headers
+        )
+        video_info = cls.extract_info_by_re(response.text)
+        return video_info
+
+
+def xigua_search(keyword):
+    """
+    Xigua video search.
+    :param keyword: search keyword (percent-encoded before use)
+    :return: single-element list with the info of the first acceptable
+             video (no sensitive word in the title, at most 300 seconds),
+             or [] when none is found
+    """
+    blocked_words = MySQLServer().select_sensitive_words()
+
+    def sensitive_flag(s_words, ori_title):
+        """
+        :param ori_title: title to check
+        :param s_words: iterable of sensitive words
+        :return: True when the title contains none of the words
+        """
+        return not any(word in ori_title for word in s_words)
+
+    keyword = urllib.parse.quote(keyword)
+    search_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
+        keyword
+    )
+    request_headers = {
+        "authority": "www.ixigua.com",
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
+        "cache-control": "max-age=0",
+        "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
+    }
+    page = requests.get(url=search_url, headers=request_headers)
+    tree = etree.HTML(page.text)
+    anchors = tree.xpath(
+        '//div[@class="HorizontalFeedCard searchPageV2__card"]/div[1]/a'
+    )
+    for anchor in anchors:
+        try:
+            href = anchor.xpath("@href")[0]
+            duration_text = str(anchor.xpath("./span/text()")[0])
+            pieces = duration_text.split(":")
+            if len(pieces) == 2:
+                duration = int(pieces[0]) * 60 + int(pieces[1])
+            elif len(pieces) == 3:
+                # hh:mm:ss is always longer than the limit.
+                duration = 100000
+            else:
+                # Unknown layout: treat as too long.
+                duration = 10000
+            raw_title = anchor.xpath("@title")[0]
+            real_title = bytes(str(raw_title), "latin1").decode()
+            if not (sensitive_flag(blocked_words, real_title) and duration <= 300):
+                continue
+            try:
+                # href starts with "/", the rest is the item id.
+                res = XiGuaFunctions().get_video_info(href[1:])
+                if res:
+                    return [res]
+            except Exception as e:
+                print(e)
+        except Exception as e:
+            print(e)
+    return []

+ 365 - 0
applications/static/config.py

@@ -0,0 +1,365 @@
+"""
+@author: luojunhui
+"""
+
+# Account table keyed by gh_id. Each entry carries the account's "uid" and
+# "nick_name"; search_videos passes the matching entry on as `user` when it
+# sends a video to the message queue.
+gh_id_dict = {
+    "gh_01f8afd03366": {
+        "uid": 69637520,
+        "nick_name": "非亲非故"
+    },
+    "gh_058e41145a0c": {
+        "uid": 69637476,
+        "nick_name": "甜腻梦话"
+    },
+    "gh_084a485e859a": {
+        "uid": 69637472,
+        "nick_name": "梦星月"
+    },
+    "gh_0921c03402cd": {
+        "uid": 69637531,
+        "nick_name": "你的女友"
+    },
+    "gh_0c89e11f8bf3": {
+        "uid": 69637508,
+        "nick_name": "粟米"
+    },
+    "gh_171cec079b2a": {
+        "uid": 69637501,
+        "nick_name": "海上"
+    },
+    "gh_183d80deffb8": {
+        "uid": 69637491,
+        "nick_name": "论趣"
+    },
+    "gh_1ee2e1b39ccf": {
+        "uid": 69637473,
+        "nick_name": "纵有疾风起"
+    },
+    "gh_234ef02cdee5": {
+        "uid": 69637513,
+        "nick_name": "夹逼"
+    },
+    "gh_26a307578776": {
+        "uid": 69637490,
+        "nick_name": "最宝贝的宝贝"
+    },
+    "gh_29074b51f2b7": {
+        "uid": 69637530,
+        "nick_name": "沉舸"
+    },
+    "gh_2b8c6aa035ae": {
+        "uid": 69637470,
+        "nick_name": "懶得取名"
+    },
+    "gh_34318194fd0e": {
+        "uid": 69637517,
+        "nick_name": "徒四壁"
+    },
+    "gh_3845af6945d0": {
+        "uid": 69637545,
+        "nick_name": "秋水娉婷"
+    },
+    "gh_3ac6d7208961": {
+        "uid": 69637497,
+        "nick_name": "小熊的少女梦"
+    },
+    "gh_3c7d38636846": {
+        "uid": 69637519,
+        "nick_name": "油腻腻"
+    },
+    "gh_3df10391639c": {
+        "uid": 69637541,
+        "nick_name": "六郎娇面"
+    },
+    "gh_40a0ad154478": {
+        "uid": 69637516,
+        "nick_name": "禁止"
+    },
+    "gh_424c8eeabced": {
+        "uid": 69637522,
+        "nick_name": "认命"
+    },
+    "gh_4568b5a7e2fe": {
+        "uid": 69637482,
+        "nick_name": "香腮"
+    },
+    "gh_45beb952dc74": {
+        "uid": 69637488,
+        "nick_name": "毋庸"
+    },
+    "gh_484de412b0ef": {
+        "uid": 69637481,
+        "nick_name": "婪"
+    },
+    "gh_4c058673c07e": {
+        "uid": 69637474,
+        "nick_name": "影帝"
+    },
+    "gh_538f78f9d3aa": {
+        "uid": 69637478,
+        "nick_name": "伤痕"
+    },
+    "gh_56a6765df869": {
+        "uid": 69637514,
+        "nick_name": "风月"
+    },
+    "gh_56ca3dae948c": {
+        "uid": 69637538,
+        "nick_name": "留下太多回忆"
+    },
+    "gh_5e543853d8f0": {
+        "uid": 69637543,
+        "nick_name": "不知春秋"
+    },
+    "gh_5ff48e9fb9ef": {
+        "uid": 69637494,
+        "nick_name": "寻她找他"
+    },
+    "gh_671f460c856c": {
+        "uid": 69637523,
+        "nick_name": "绝不改悔"
+    },
+    "gh_6b7c2a257263": {
+        "uid": 69637528,
+        "nick_name": "奶牙"
+    },
+    "gh_6d205db62f04": {
+        "uid": 69637509,
+        "nick_name": "怕羞"
+    },
+    "gh_6d9f36e3a7be": {
+        "uid": 69637498,
+        "nick_name": "望长安"
+    },
+    "gh_73be0287bb94": {
+        "uid": 69637537,
+        "nick_name": "戏剧"
+    },
+    "gh_744cb16f6e16": {
+        "uid": 69637505,
+        "nick_name": "反駁"
+    },
+    "gh_7b4a5f86d68c": {
+        "uid": 69637477,
+        "nick_name": "我很想你"
+    },
+    "gh_7bca1c99aea0": {
+        "uid": 69637511,
+        "nick_name": "从小就很傲"
+    },
+    "gh_7e5818b2dd83": {
+        "uid": 69637532,
+        "nick_name": "二八佳人"
+    },
+    "gh_89ef4798d3ea": {
+        "uid": 69637533,
+        "nick_name": "彼岸花"
+    },
+    "gh_901b0d722749": {
+        "uid": 69637518,
+        "nick_name": "深情不为我"
+    },
+    "gh_9161517e5676": {
+        "uid": 69637495,
+        "nick_name": "折磨"
+    },
+    "gh_93e00e187787": {
+        "uid": 69637504,
+        "nick_name": "理会"
+    },
+    "gh_9877c8541764": {
+        "uid": 69637506,
+        "nick_name": "我沿着悲伤"
+    },
+    "gh_9cf3b7ff486b": {
+        "uid": 69637492,
+        "nick_name": "hoit"
+    },
+    "gh_9e559b3b94ca": {
+        "uid": 69637471,
+        "nick_name": "我与你相遇"
+    },
+    "gh_9f8dc5b0c74e": {
+        "uid": 69637496,
+        "nick_name": "港口"
+    },
+    "gh_a182cfc94dad": {
+        "uid": 69637539,
+        "nick_name": "四海八荒"
+    },
+    "gh_a2901d34f75b": {
+        "uid": 69637535,
+        "nick_name": "听腻了谎话"
+    },
+    "gh_a307072c04b9": {
+        "uid": 69637521,
+        "nick_name": "踏步"
+    },
+    "gh_a6351b447819": {
+        "uid": 69637540,
+        "nick_name": "七猫酒馆"
+    },
+    "gh_ac43e43b253b": {
+        "uid": 69637499,
+        "nick_name": "一厢情愿"
+    },
+    "gh_adca24a8f429": {
+        "uid": 69637483,
+        "nick_name": "对你何止一句喜欢"
+    },
+    "gh_b15de7c99912": {
+        "uid": 69637536,
+        "nick_name": "糖炒板栗"
+    },
+    "gh_b32125c73861": {
+        "uid": 69637493,
+        "nick_name": "发尾"
+    },
+    "gh_b3ffc1ca3a04": {
+        "uid": 69637546,
+        "nick_name": "主宰你心"
+    },
+    "gh_b8baac4296cb": {
+        "uid": 69637489,
+        "nick_name": "生性"
+    },
+    "gh_b9b99173ff8a": {
+        "uid": 69637524,
+        "nick_name": "养一只月亮"
+    },
+    "gh_bd57b6978e06": {
+        "uid": 69637527,
+        "nick_name": "厌遇"
+    },
+    "gh_be8c29139989": {
+        "uid": 69637502,
+        "nick_name": "不负"
+    },
+    "gh_bfe5b705324a": {
+        "uid": 69637529,
+        "nick_name": "乐极"
+    },
+    "gh_bff0bcb0694a": {
+        "uid": 69637534,
+        "nick_name": "简迷离"
+    },
+    "gh_c69776baf2cd": {
+        "uid": 69637512,
+        "nick_name": "骄纵"
+    },
+    "gh_c91b42649690": {
+        "uid": 69637503,
+        "nick_name": "荟萃"
+    },
+    "gh_d2cc901deca7": {
+        "uid": 69637487,
+        "nick_name": "恶意调笑"
+    },
+    "gh_d5f935d0d1f2": {
+        "uid": 69637500,
+        "nick_name": "青少年哪吒"
+    },
+    "gh_da76772d8d15": {
+        "uid": 69637526,
+        "nick_name": "独揽风月"
+    },
+    "gh_de9f9ebc976b": {
+        "uid": 69637475,
+        "nick_name": "剑出鞘恩怨了"
+    },
+    "gh_e0eb490115f5": {
+        "uid": 69637486,
+        "nick_name": "赋别"
+    },
+    "gh_e24da99dc899": {
+        "uid": 69637484,
+        "nick_name": "恋雨夏季"
+    },
+    "gh_e2576b7181c6": {
+        "uid": 69637515,
+        "nick_name": "满天星"
+    },
+    "gh_e75dbdc73d80": {
+        "uid": 69637542,
+        "nick_name": "情战"
+    },
+    "gh_e9d819f9e147": {
+        "uid": 69637525,
+        "nick_name": "与卿"
+    },
+    "gh_efaf7da157f5": {
+        "uid": 69637547,
+        "nick_name": "心野性子浪"
+    },
+    "gh_f4594783f5b8": {
+        "uid": 69637544,
+        "nick_name": "自缚"
+    },
+    "gh_fe6ef3a65a48": {
+        "uid": 69637480,
+        "nick_name": "风间"
+    }
+}
+
+# Static list of sensitive words.
+# NOTE(review): the search crawlers visible in this change load their word
+# list from MySQLServer().select_sensitive_words() instead of this list;
+# confirm whether this constant is still used anywhere.
+sensitive_words = [
+    "台湾",
+    "南海",
+    "强奸",
+    "寂寞难耐",
+    "欲求不满",
+    "不雅视频",
+    "人妻",
+    "侵犯",
+    "正部级",
+    "外长",
+    "邓小平",
+    "林彪",
+    "李先念",
+    "毛主席",
+    "毛泽东",
+    "江青",
+    "朱镕基",
+    "胡耀邦",
+    "政治局",
+    "省委书记",
+    "国防部长",
+    "外交部长"
+]
+
+# Experiment configuration.
+# First group of accounts taking part in the search experiment.
+buy_accounts = [
+    "gh_084a485e859a",
+    "gh_e24da99dc899",
+    "gh_e0eb490115f5",
+    "gh_183d80deffb8",
+    "gh_5ff48e9fb9ef",
+    "gh_9f8dc5b0c74e",
+    "gh_6d9f36e3a7be"
+]
+
+# Second group of accounts taking part in the search experiment.
+# NOTE(review): the meaning of the name "dyy" is not visible here.
+dyy = [
+    "gh_9877c8541764",
+    "gh_6d205db62f04",
+    "gh_c69776baf2cd",
+    "gh_7e5818b2dd83",
+    "gh_89ef4798d3ea",
+    "gh_a2901d34f75b",
+    "gh_b15de7c99912"
+]
+
+# gh_id -> experiment id. search_videos maps 0/1/2 to SearchABTest.ab_0 /
+# ab_1 / ab_2; accounts that are not listed use ab_0. The keys are the ids
+# of buy_accounts followed by those of dyy.
+ab_test_config = {
+    "gh_084a485e859a": 0,
+    "gh_e24da99dc899": 1,
+    "gh_e0eb490115f5": 2,
+    "gh_183d80deffb8": 0,
+    "gh_5ff48e9fb9ef": 1,
+    "gh_9f8dc5b0c74e": 2,
+    "gh_6d9f36e3a7be": 0,
+    "gh_9877c8541764": 1,
+    "gh_6d205db62f04": 2,
+    "gh_c69776baf2cd": 0,
+    "gh_7e5818b2dd83": 1,
+    "gh_89ef4798d3ea": 2,
+    "gh_a2901d34f75b": 1,
+    "gh_b15de7c99912": 2
+}

+ 71 - 0
dev/hksp_test_0515.py

@@ -0,0 +1,71 @@
+"""
+@author: luojunhui
+"""
+import json
+import pandas as pd
+from applications.search import hksp_search
+
+
+def ab_test(title):
+    """
+    Search hksp for title and pick one video per ranking rule (absent/empty counters count as 0).
+    :param title: query text passed to hksp_search
+    :return: [first hit, most plays, most likes, most comments, best like/play ratio], or [] if no hits
+    """
+    def count(video, field):
+        return int(video.get(field) or 0)
+
+    recall_list = hksp_search(title)
+    if not recall_list:
+        return []
+    best_0 = recall_list[0]
+    print(best_0['title'], best_0.get('playcnt'), best_0.get('like') or 0, best_0.get('comment') or 0)
+    ab_1 = max(recall_list, key=lambda x: count(x, 'playcnt'))
+    ab_2 = max(recall_list, key=lambda x: count(x, 'like'))
+    ab_3 = max(recall_list, key=lambda x: count(x, 'comment'))
+    # A video with zero plays would raise ZeroDivisionError; rank it with ratio 0 instead.
+    ab_4 = max(recall_list, key=lambda x: count(x, 'like') / count(x, 'playcnt') if count(x, 'playcnt') else 0)
+    return [best_0, ab_1, ab_2, ab_3, ab_4]
+
+
+title_list = [
+    "菲再闯仁爱礁,中国海警船掀了炮衣,好话说尽,1号令立即实施",
+    "乌克兰遭受严重损失!俄罗斯在西方国家大使馆展示摧毁的设备",
+    "中方出重拳了,一句话把以色列挂在火上烤,就看它能否接住",
+    "终于合作!中东国家大团结,以色列要完蛋",
+    "【2】毛远新从监狱刑满释放后,对女儿十分愧疚,回忆往事时他偷偷落泪",
+    "【2】韩国政局要变天,中国一年前的警示应验了,尹锡悦终究自食其果",
+    "金正恩乘专列过江,为何故意绕开中国?原因有两个,值得我们警惕",
+    "中方出手!给巴勒斯坦打去电话,“战狼”已至中东!",
+    "中国不欠犹太人的,华春莹用双语发文,西方该感恩没资历道德绑架"
+]
+
+# result.json maps each article title to an object with "c_title" and "keys" entries.
+with open("result.json", encoding="utf-8") as f:
+    title_dict = json.load(f)
+
+# One spreadsheet row per (article title, ab_test candidate) pair.
+ooo = []
+for line in title_list:
+    print(line)
+    c_title = title_dict[line]['c_title']
+    keys = title_dict[line]["keys"]
+    c_keys = "# ".join(keys)
+    # Search with the article title first; if that finds nothing fall back to
+    # the summary title (c_title), then to the first keyword. keys[:1] is
+    # empty when there are no keywords, so that last step is skipped instead
+    # of raising IndexError as keys[0] did.
+    for attempt, query in enumerate([line, c_title] + list(keys[:1])):
+        result = ab_test(query)
+        if result:
+            break
+    for index, item in enumerate(result):
+        temp = [line, c_title, c_keys, "ab_{}".format(index), item['title'], item.get('playcnt', None), item.get('like', None), item.get('comment', None), item['playurl']]
+        # Rows are echoed only when a fallback query was needed (attempt > 0).
+        if attempt:
+            print(temp)
+        ooo.append(temp)
+
+# Column order must match the layout of each temp row built above.
+df = pd.DataFrame(ooo, columns=['article_title', 'kimi_content_summary', 'kimi_content_keys', 'ab_test', 'out_title', 'views', 'like', 'comment', 'videoUrl'])
+df.to_excel("baidu_test.xlsx", index=False)

+ 74 - 0
dev/notes

@@ -0,0 +1,74 @@
+gh_2b8c6aa035ae	魔法美学馆
+gh_9e559b3b94ca	票圈大事件
+gh_084a485e859a	生活情感叁读
+gh_1ee2e1b39ccf	票圈最新消息
+gh_4c058673c07e	探马再探再报
+gh_de9f9ebc976b	赵师傅厨房秘笈
+gh_058e41145a0c	小琪故事馆
+gh_7b4a5f86d68c	八卦不断线
+gh_538f78f9d3aa	张阿姨爱美食
+gh_fe6ef3a65a48	心灵智慧馆
+gh_484de412b0ef	充电宝宝
+gh_4568b5a7e2fe	王小八娱乐
+gh_adca24a8f429	兔子爱蹬鹰
+gh_e24da99dc899	缘来养心厅
+gh_e0eb490115f5	心灵情感驿站
+gh_d2cc901deca7	票圈极速版
+gh_45beb952dc74	票圈乐活
+gh_b8baac4296cb	票圈原创视频精选
+gh_26a307578776	票圈美文速递
+gh_183d80deffb8	生活良读
+gh_9cf3b7ff486b	票圈热门
+gh_b32125c73861	票圈奇闻
+gh_5ff48e9fb9ef	祝福养心厅
+gh_9161517e5676	宝娃趣味游戏
+gh_9f8dc5b0c74e	音药金曲厅
+gh_3ac6d7208961	异次元玩家
+gh_6d9f36e3a7be	音药养心馆
+gh_ac43e43b253b	小阳看天下
+gh_d5f935d0d1f2	半仙社评
+gh_171cec079b2a	观察家王小姐
+gh_be8c29139989	心灵书局
+gh_c91b42649690	心理调色盘
+gh_93e00e187787	小惠爱厨房
+gh_744cb16f6e16	美味在人间
+gh_9877c8541764	退休老年圈
+gh_0c89e11f8bf3	幸福启示
+gh_6d205db62f04	指尖奇文
+gh_7bca1c99aea0	慧行社
+gh_c69776baf2cd	老友欢聚地
+gh_234ef02cdee5	姜子丫
+gh_56a6765df869	婉央女子
+gh_e2576b7181c6	六八评价
+gh_40a0ad154478	所见畅谈
+gh_34318194fd0e	老新说事
+gh_901b0d722749	壹姐八卦
+gh_3c7d38636846	圈内侃八卦
+gh_01f8afd03366	奇闻有约
+gh_a307072c04b9	生活智慧正能量
+gh_424c8eeabced	爱姨生活妙招
+gh_671f460c856c	日日有妙招
+gh_b9b99173ff8a	实在妙招
+gh_e9d819f9e147	热血军中事
+gh_da76772d8d15	娱乐在前
+gh_bd57b6978e06	八点说故事
+gh_6b7c2a257263	幸福晚年知音
+gh_bfe5b705324a	奇趣百味生活
+gh_29074b51f2b7	老来生活家
+gh_0921c03402cd	俏生活秘籍
+gh_7e5818b2dd83	便捷生活好方法
+gh_89ef4798d3ea	生活百态观
+gh_bff0bcb0694a	喜乐生活派
+gh_a2901d34f75b	畅聊奇闻
+gh_b15de7c99912	人生百事观
+gh_73be0287bb94	军莫愁
+gh_56ca3dae948c	老友闲谈
+gh_a182cfc94dad	冀中轶事
+gh_a6351b447819	冀中精彩生活
+gh_3df10391639c	冀中生活谈
+gh_e75dbdc73d80	票圈正能量
+gh_5e543853d8f0	票圈精彩
+gh_f4594783f5b8	俏丽音乐相册
+gh_3845af6945d0	新品女装特价
+gh_b3ffc1ca3a04	票圈内容精选
+gh_efaf7da157f5	票圈热议

+ 17 - 0
dev/read_in.py

@@ -0,0 +1,17 @@
+"""
+@author: luojunhui
+"""
+import json
+
+file_path = 'ttt.txt'
+
+# Each non-blank line is a JSON object whose 'data' field is itself a JSON
+# string; print 1 for every record whose productionPath contains "20764105".
+with open(file_path, encoding="utf-8") as f:
+    for line in f:
+        if not line.strip():
+            continue  # a blank line would make json.loads raise
+        # No line[:-1]: it cut the last character off a final line lacking "\n".
+        data = json.loads(json.loads(line)['data'])
+        if data['productionPath'] and "20764105" in data['productionPath']:
+            print(1)

Diff do ficheiro suprimidas por serem muito extensas
+ 9 - 0
dev/test.py


+ 15 - 0
dev/test_search.py

@@ -0,0 +1,15 @@
+"""
+@author: luojunhui
+"""
+from applications.search import *
+
+keys = "湖人大战勇士"
+# Manual smoke test: run the same query through each search function and print the raw result.
+wx_result = wx_search(keys)
+print(wx_result)
+# Same query through xigua_search.
+xg_result = xigua_search(keys)
+print(xg_result)
+# Same query through hksp_search (result is named "baidu" - presumably a Baidu video search; confirm).
+baidu_result = hksp_search(keys)
+print(baidu_result)

Diff do ficheiro suprimidas por serem muito extensas
+ 13 - 0
dev/title_to_search.py


+ 1 - 1
hypercorn_config.toml

@@ -1,7 +1,7 @@
 reload = true
 bind = "0.0.0.0:8111"
 workers = 2
-keep_alive_timeout = 60  # 保持连接的最大秒数,根据需要调整
+keep_alive_timeout = 120  # 保持连接的最大秒数,根据需要调整
 graceful_timeout = 30    # 重启或停止之前等待当前工作完成的时间
 loglevel = "debug"  # 日志级别
 accesslog = "access.log"  # 访问日志文件

+ 2 - 0
requirements.txt

@@ -102,3 +102,5 @@ wsproto==1.2.0
 WTForms==3.1.2
 yarl==1.9.4
 zipp==3.16.2
+
+lxml~=5.2.1

+ 0 - 44
test.py

@@ -1,44 +0,0 @@
-"""
-@author: luojunhui
-"""
-import json
-import time
-import requests
-import argparse
-from concurrent.futures import ThreadPoolExecutor
-
-
-def request_data(url):
-    # index = _url.split("#")[0]
-    # url = _url.split("#")[1]
-    body = {
-        "accountName": "魔法美学馆",
-        "content": "8月20日,最高人民法院举行新闻发布会,发布新修订的《最高人民法院关于审理民间借贷案件适用法律若干问题的规定》(以下简称《规定》)并回答记者提问。",
-        "title": "🔴日本收到俄罗斯令人惊慌的消息😱",
-        "search_keys": ["日本核污水排海"],
-        "ghId": "gh_efaf7da157f5"
-    }
-    t = time.time()
-    res = requests.post(url, json=body)
-    e = time.time()
-    # print(index)
-    print(e - t)
-    print(json.dumps(res.json(), ensure_ascii=False, indent=4))
-    # print(res.json())
-
-
-if __name__ == "__main__":
-    # parser = argparse.ArgumentParser()  # 新建参数解释器对象
-    # parser.add_argument("--thread")
-    # args = parser.parse_args()
-    # thread = int(args.thread)
-    dt = ["http://61.48.133.26:8111/title_to_video"]
-    # total_s = time.time()
-    request_data(dt[0])
-    # with ThreadPoolExecutor(max_workers=thread) as pool:
-    #     pool.map(request_data, dt)
-    # total_e = time.time()
-    # print(total_e - total_s)
-import uuid
-import urllib.parse
-

Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff