Prechádzať zdrojové kódy

MySQL 服务
异步化 v0.2开发上线

罗俊辉 11 mesiacov pred
rodič
commit
05674e269f

+ 12 - 0
applications/functions/common.py

@@ -127,6 +127,18 @@ class Functions(object):
             .replace("'", "")
         )
 
+    @classmethod
+    def sensitive_flag(cls, s_words, ori_title):
+        """
+        Check a title against the sensitive-word list.
+
+        :param s_words: iterable of sensitive words to look for
+        :param ori_title: title text that is scanned with ``in``
+        :return: False if any word occurs in ori_title, otherwise True
+        """
+        hit = any(word in ori_title for word in s_words)
+        return not hit
+
 
 class MySQLServer(object):
     """

+ 1 - 1
applications/routes.py

@@ -11,7 +11,7 @@ from applications.functions.log import logging
 from applications.schedule import recall_videos, search_videos
 from applications.functions.kimi import KimiServer
 
-my_blueprint = Blueprint('kimi', __name__)
+my_blueprint = Blueprint('LongArticles', __name__)
 
 
 def Routes(mysql_client):

+ 78 - 150
applications/schedule/search_schedule.py

@@ -4,10 +4,11 @@
 """
 
 from applications.search import *
-from applications.static.config import gh_id_dict, ab_test_config
+from applications.static.config import gh_id_dict
 from applications.functions.log import logging
 from applications.functions.video_item import VideoProducer
 from applications.functions.async_etl import AsyncETL
+from applications.functions.common import MySQLServer
 
 
 class SearchABTest(object):
@@ -39,176 +40,103 @@ class SearchABTest(object):
         cls.gh_id = gh_id
 
     @classmethod
-    def dd(cls):
+    async def base_line(cls):
         """
-        兜底
+        兜底策略
+        """
+        return await SearchMethod().search_v0(
+            text=cls.article_keys[0],
+            trace_id=cls.trace_id
+        )
+
+    @classmethod
+    async def ab_0(cls):
+        """
+        默认原标题搜索
         :return:
         """
-        wx_result_ = wx_search(keys=cls.article_keys[0])
-        if wx_result_:
-            logging(
-                code="7011",
-                info="微信兜底搜索成功",
-                trace_id=cls.trace_id,
-            )
-            return {"platform": "wx_search", "result": wx_result_[0]}
+        search_result = await SearchMethod().search_v0(
+            text=cls.ori_title,
+            trace_id=cls.trace_id
+        )
+        if search_result:
+            return search_result
         else:
-            baidu_result_ = hksp_search(key=cls.article_keys[0])
-            if baidu_result_:
-                logging(
-                    code="7011",
-                    info="百度兜底搜索成功",
-                    trace_id=cls.trace_id,
-                )
-                return {"platform": "baidu_search", "result": baidu_result_[0]}
-            else:
-                return None
+            return await cls.base_line()
 
     @classmethod
-    def ab_0(cls):
+    async def ab_1(cls):
         """
-        默认搜索逻辑
+        使用 content_summary搜索
         :return:
         """
-        wx_result = wx_search(keys=cls.ori_title)
-        if wx_result:
-            return {"platform": "wx_search", "result": wx_result[0]}
+        search_result = await SearchMethod().search_v0(
+            text=cls.article_summary,
+            trace_id=cls.trace_id
+        )
+        if search_result:
+            return search_result
         else:
-            logging(
-                code="7001",
-                info="通过微信搜索失败---{}".format(cls.ori_title),
-                trace_id=cls.trace_id,
-            )
-            # 微信搜不到的话,采用好看视频搜索
-            baidu_result = hksp_search(key=cls.ori_title)
-            if baidu_result:
-                return {"platform": "baidu_search", "result": baidu_result[0]}
-            else:
-                # 若好看视频未搜到,则采用西瓜搜索
-                logging(
-                    code="7001",
-                    info="通过baidu搜索失败---{}".format(cls.ori_title),
-                    trace_id=cls.trace_id,
-                )
-                xigua_result = xigua_search(keyword=cls.ori_title)
-                if xigua_result:
-                    return {"platform": "xg_search", "result": xigua_result[0]}
-                else:
-                    logging(
-                        code="7001",
-                        info="通过西瓜搜索失败---{}, 启用兜底方式".format(cls.ori_title),
-                        trace_id=cls.trace_id,
-                    )
-                    return cls.dd()
+            return await cls.base_line()
 
     @classmethod
-    def ab_1(cls):
+    async def ab_2(cls):
         """
+        使用文本关键词搜索
         :return:
         """
-        wx_result = wx_search(keys=cls.article_summary)
-        if wx_result:
-            return {"platform": "wx_search", "result": wx_result[0]}
+        search_result = await SearchMethod().search_v0(
+            text=",".join(cls.article_keys),
+            trace_id=cls.trace_id
+        )
+        if search_result:
+            return search_result
         else:
-            logging(
-                code="7001",
-                info="通过微信搜索失败---{}".format(cls.article_summary),
-                trace_id=cls.trace_id,
-            )
-            # 微信搜不到的话,采用好看视频搜索
-            baidu_result = hksp_search(key=cls.article_summary)
-            if baidu_result:
-                return {"platform": "baidu_search", "result": baidu_result[0]}
-            else:
-                # 若好看视频未搜到,则采用西瓜搜索
-                logging(
-                    code="7001",
-                    info="通过baidu搜索失败---{}".format(cls.article_summary),
-                    trace_id=cls.trace_id,
-                )
-                xigua_result = xigua_search(keyword=cls.article_summary)
-                if xigua_result:
-                    return {"platform": "xg_search", "result": xigua_result[0]}
-                else:
-                    logging(
-                        code="7001",
-                        info="通过西瓜搜索失败---{},启用兜底方式".format(cls.article_summary),
-                        trace_id=cls.trace_id,
-                    )
-                    return cls.dd()
+            return await cls.base_line()
+
+
+class SearchMethod(object):
+    """
+    搜索召回模式
+    """
+    s_words = MySQLServer().select_sensitive_words()
 
     @classmethod
-    def ab_2(cls):
+    async def search_v0(cls, text, trace_id):
         """
-        ori_title + wx
+        搜索顺序-wx --> baidu --> xigua
         :return:
         """
-        wx_result = wx_search(keys=",".join(cls.article_keys))
+        wx_result = wx_search(keys=text, sensitive_words=cls.s_words)
         if wx_result:
             return {"platform": "wx_search", "result": wx_result[0]}
         else:
             logging(
                 code="7001",
-                info="通过微信搜索失败---{}".format(",".join(cls.article_keys)),
-                trace_id=cls.trace_id,
+                info="通过微信搜索失败---{}".format(text),
+                trace_id=trace_id,
             )
             # 微信搜不到的话,采用好看视频搜索
-            baidu_result = hksp_search(key=",".join(cls.article_keys))
+            baidu_result = hksp_search(key=text, sensitive_words=cls.s_words)
             if baidu_result:
                 return {"platform": "baidu_search", "result": baidu_result[0]}
             else:
                 # 若好看视频未搜到,则采用西瓜搜索
                 logging(
                     code="7001",
-                    info="通过baidu搜索失败---{}".format(",".join(cls.article_keys)),
-                    trace_id=cls.trace_id,
+                    info="通过baidu搜索失败---{}".format(text),
+                    trace_id=trace_id,
                 )
-                xigua_result = xigua_search(keyword=",".join(cls.article_keys))
+                xigua_result = xigua_search(keyword=text, sensitive_words=cls.s_words)
                 if xigua_result:
                     return {"platform": "xg_search", "result": xigua_result[0]}
                 else:
                     logging(
                         code="7001",
-                        info="通过西瓜搜索失败---{},启用兜底".format(",".join(cls.article_keys)),
-                        trace_id=cls.trace_id,
+                        info="通过西瓜搜索失败---{}, 启用兜底方式".format(text),
+                        trace_id=trace_id,
                     )
-                    return cls.dd()
-
-    @classmethod
-    def ab_3(cls):
-        """
-        article_summary + baidu
-        :return:
-        """
-        result = hksp_search(key=cls.article_summary)
-        return {"platform": "baidu_search", "result": result[0] if result else []}
-
-    @classmethod
-    def ab_4(cls):
-        """
-        article_summary + weixin
-        :return:
-        """
-        result = wx_search(keys=cls.article_summary)
-        return {"platform": "wx_search", "result": result[0] if result else []}
-
-    @classmethod
-    def ab_5(cls):
-        """
-        article_keys + weixin
-        :return:
-        """
-        result = wx_search(keys=",".join(cls.article_keys))
-        return {"platform": "wx_search", "result": result[0] if result else []}
-
-    @classmethod
-    def ab_6(cls):
-        """
-        article_keys + baidu
-        :return:
-        """
-        result = hksp_search(key=",".join(cls.article_keys))
-        return {"platform": "baidu_search", "result": result[0] if result else []}
+                    return None
 
 
 async def video_sender(video_obj, user, trace_id, platform):
@@ -265,26 +193,26 @@ async def search_videos(kimi_info, trace_id, gh_id, mysql_client):
     """
     kimi_info["trace_id"] = trace_id
     SearchAB = SearchABTest(info=kimi_info, gh_id=gh_id)
-    if ab_test_config.get(gh_id):
-        test_id = ab_test_config[gh_id]
-        if test_id == 0:
-            recall_obj = SearchAB.ab_0()
-        elif test_id == 1:
-            recall_obj = SearchAB.ab_1()
-        elif test_id == 2:
-            recall_obj = SearchAB.ab_2()
-        # elif test_id == 3:
-        #     recall_obj = SearchAB.ab_3()
-        # elif test_id == 4:
-        #     recall_obj = SearchAB.ab_4()
-        # elif test_id == 5:
-        #     recall_obj = SearchAB.ab_5()
-        # elif test_id == 6:
-        #     recall_obj = SearchAB.ab_6()
-        else:
-            recall_obj = {}
-    else:
-        recall_obj = SearchAB.ab_0()
+    # if ab_test_config.get(gh_id):
+    #     test_id = ab_test_config[gh_id]
+    #     if test_id == 0:
+    #         recall_obj = SearchAB.ab_0()
+    #     elif test_id == 1:
+    #         recall_obj = SearchAB.ab_1()
+    #     elif test_id == 2:
+    #         recall_obj = SearchAB.ab_2()
+    #     # elif test_id == 3:
+    #     #     recall_obj = SearchAB.ab_3()
+    #     # elif test_id == 4:
+    #     #     recall_obj = SearchAB.ab_4()
+    #     # elif test_id == 5:
+    #     #     recall_obj = SearchAB.ab_5()
+    #     # elif test_id == 6:
+    #     #     recall_obj = SearchAB.ab_6()
+    #     else:
+    #         recall_obj = {}
+    # else:
+    recall_obj = await SearchAB.ab_1()
     if recall_obj:
         platform = recall_obj["platform"]
         recall_video = recall_obj["result"]

+ 3 - 16
applications/search/hksp_search.py

@@ -11,7 +11,7 @@ import urllib.parse
 from uuid import uuid4
 from fake_useragent import FakeUserAgent
 
-from applications.functions.common import MySQLServer
+from applications.functions.common import Functions
 
 
 def get_video_detail(video_id):
@@ -46,23 +46,10 @@ def get_video_detail(video_id):
     return response['data']['apiData']['curVideoMeta']
 
 
-def hksp_search(key):
+def hksp_search(key, sensitive_words):
     """
     好看视频搜索爬虫
     """
-    sensitive_words = MySQLServer().select_sensitive_words()
-
-    def sensitive_flag(s_words, ori_title):
-        """
-        :param ori_title:
-        :param s_words:
-        :return:
-        """
-        for word in s_words:
-            if word in ori_title:
-                return False
-        return True
-
     timestamp_seconds = time.time()
     timestamp_milliseconds = int(timestamp_seconds * 1000)
     url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
@@ -98,7 +85,7 @@ def hksp_search(key):
                 video_id = data['vid']
                 title = data['title']
                 duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1])
-                if sensitive_flag(sensitive_words, title) and int(duration) <= 300:
+                if Functions().sensitive_flag(sensitive_words, title) and int(duration) <= 300:
                     res = get_video_detail(video_id)
                     L.append(res)
                 else:

+ 4 - 17
applications/search/weixin_search.py

@@ -4,29 +4,16 @@
 import json
 import requests
 
-from applications.functions.common import MySQLServer
+from applications.functions.common import Functions
 
 
-def wx_search(keys):
+def wx_search(keys, sensitive_words):
     """
     WeChat search
+    :param sensitive_words:
     :param keys:
     :return:
     """
-
-    sensitive_words = MySQLServer().select_sensitive_words()
-
-    def sensitive_flag(s_words, ori_title):
-        """
-        :param ori_title:
-        :param s_words:
-        :return:
-        """
-        for word in s_words:
-            if word in ori_title:
-                return False
-        return True
-
     url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
     payload = json.dumps({
         "keyword": keys,
@@ -49,7 +36,7 @@ def wx_search(keys):
                     title = video_info['title']
                     duration_str = video_info['duration']
                     dr = int(duration_str.split(":")[0].strip()) + int(duration_str.split(":")[1].strip())
-                    if sensitive_flag(sensitive_words, title) and dr <= 300:
+                    if Functions().sensitive_flag(sensitive_words, title) and dr <= 300:
                         L.append(video_info)
                     else:
                         continue

+ 3 - 16
applications/search/xigua_search.py

@@ -14,7 +14,7 @@ from Crypto.Cipher import AES
 from Crypto.Util.Padding import unpad
 from fake_useragent import FakeUserAgent
 
-from applications.functions.common import MySQLServer
+from applications.functions.common import Functions
 
 
 class XiGuaFunctions(object):
@@ -170,23 +170,10 @@ class XiGuaFunctions(object):
         return video_info
 
 
-def xigua_search(keyword):
+def xigua_search(keyword, sensitive_words):
     """
     搜索
     """
-    sensitive_words = MySQLServer().select_sensitive_words()
-
-    def sensitive_flag(s_words, ori_title):
-        """
-        :param ori_title:
-        :param s_words:
-        :return:
-        """
-        for word in s_words:
-            if word in ori_title:
-                return False
-        return True
-
     keyword = urllib.parse.quote(keyword)
     base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
         keyword
@@ -219,7 +206,7 @@ def xigua_search(keyword):
                     duration = 10000
                 title = item.xpath("@title")[0]
                 real_title = bytes(str(title), "latin1").decode()
-                if sensitive_flag(sensitive_words, real_title) and duration <= 300:
+                if Functions().sensitive_flag(sensitive_words, real_title) and duration <= 300:
                     try:
                         res = XiGuaFunctions().get_video_info(url[1:])
                         if res:

+ 9 - 9
applications/static/config.py

@@ -348,18 +348,18 @@ dyy = [
 ]
 
 ab_test_config = {
-    "gh_084a485e859a": 0,
+    "gh_084a485e859a": 1,
     "gh_e24da99dc899": 1,
-    "gh_e0eb490115f5": 2,
-    "gh_183d80deffb8": 0,
+    "gh_e0eb490115f5": 1,
+    "gh_183d80deffb8": 1,
     "gh_5ff48e9fb9ef": 1,
-    "gh_9f8dc5b0c74e": 2,
-    "gh_6d9f36e3a7be": 0,
+    "gh_9f8dc5b0c74e": 1,
+    "gh_6d9f36e3a7be": 1,
     "gh_9877c8541764": 1,
-    "gh_6d205db62f04": 2,
-    "gh_c69776baf2cd": 0,
+    "gh_6d205db62f04": 1,
+    "gh_c69776baf2cd": 1,
     "gh_7e5818b2dd83": 1,
-    "gh_89ef4798d3ea": 2,
+    "gh_89ef4798d3ea": 1,
     "gh_a2901d34f75b": 1,
-    "gh_b15de7c99912": 2
+    "gh_b15de7c99912": 1
 }

+ 37 - 10
dev/read_in.py

@@ -2,16 +2,43 @@
 @author: luojunhui
 """
 import json
+from tqdm import tqdm
+import pandas as pd
+
+"""
+    m 生活情感叁读 0
+    m 缘来养心厅 1
+    m 心灵情感驿站 2
+    m 生活良读 0
+    m 祝福养心厅 1
+    m 音药金曲厅 2
+    d 便捷生活好方法 1
+    d 畅聊奇闻 1
+"""
+
+
+def function1(account):
+    """Map an account name to its search-strategy id (0/1/2), else None."""
+    groups = (
+        (0, ("生活情感叁读", "生活良读")),
+        (1, ("缘来养心厅", "祝福养心厅", "便捷生活好方法", "畅聊奇闻")),
+        (2, ("心灵情感驿站", "音药金曲厅")),
+    )
+    hits = (sid for sid, names in groups if account in names)
+    return next(hits, None)
+
+
+def function2(account):
+    """Return the account-type label for a known account name, else None."""
+    bought = ("生活情感叁读", "生活良读", "缘来养心厅", "祝福养心厅", "心灵情感驿站", "音药金曲厅")
+    managed = ("便捷生活好方法", "畅聊奇闻")
+    return "买号" if account in bought else ("代运营" if account in managed else None)
 
-file_path = 'ttt.txt'
 
-with open(file_path, encoding="utf-8") as f:
-    data_lines = f.readlines()
+file_path = "result-analysis.xlsx"
 
-dy_c = 0
-buy_c = 0
-for line in data_lines:
-    data = json.loads(json.loads(line[:-1])['data'])
-    if data['productionPath']:
-        if "20764105" in data['productionPath']:
-            print(1)
+data_frame = pd.read_excel(file_path)
+print(data_frame.columns)
+data_frame['账号类型'] = data_frame['账号来源'].apply(function2)
+data_frame['搜索策略'] = data_frame['账号来源'].apply(function1)
+data_frame.to_excel("result.xlsx")

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 14 - 0
dev/test_v2.py


Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 14 - 0
dev/title_to_search.py


+ 50 - 0
dev/ttt.py

@@ -0,0 +1,50 @@
+"""
+@author: luojunhui
+"""
+# Print "m"/"d", account name and ab-test id for every known gh_id in "notes".
+ab_test_config = {  # gh_id -> ab-test id
+    "gh_084a485e859a": 0,
+    "gh_e24da99dc899": 1,
+    "gh_e0eb490115f5": 2,
+    "gh_183d80deffb8": 0,
+    "gh_5ff48e9fb9ef": 1,
+    "gh_9f8dc5b0c74e": 2,
+    "gh_6d9f36e3a7be": 0,
+    "gh_9877c8541764": 1,
+    "gh_6d205db62f04": 2,
+    "gh_c69776baf2cd": 0,
+    "gh_7e5818b2dd83": 1,
+    "gh_89ef4798d3ea": 2,
+    "gh_a2901d34f75b": 1,
+    "gh_b15de7c99912": 2
+}
+
+with open("notes", encoding="utf-8") as f:
+    data = f.readlines()
+buy_accounts = [  # gh_ids reported with the "m" tag
+    "gh_084a485e859a",
+    "gh_e24da99dc899",
+    "gh_e0eb490115f5",
+    "gh_183d80deffb8",
+    "gh_5ff48e9fb9ef",
+    "gh_9f8dc5b0c74e",
+    "gh_6d9f36e3a7be"
+]
+dyy = [  # gh_ids reported with the "d" tag
+    "gh_9877c8541764",
+    "gh_6d205db62f04",
+    "gh_c69776baf2cd",
+    "gh_7e5818b2dd83",
+    "gh_89ef4798d3ea",
+    "gh_a2901d34f75b",
+    "gh_b15de7c99912"
+]
+for line in data:  # expected line shape: "<gh_id>\t<account_name>"
+    tt = line.split("\t")
+    if len(tt) < 2:  # blank or tab-less line: tt[1] would raise IndexError
+        continue
+    gh_id, account_name = tt[0], tt[1].replace("\n", "")
+    if gh_id in buy_accounts:
+        print("m", account_name, ab_test_config[gh_id])
+    elif gh_id in dyy:
+        print("d", account_name, ab_test_config[gh_id])

+ 1 - 1
hypercorn_config.toml

@@ -1,5 +1,5 @@
 reload = true
-bind = "0.0.0.0:8111"
+bind = "0.0.0.0:9111"
 workers = 4
 keep_alive_timeout = 120  # 保持连接的最大秒数,根据需要调整
 graceful_timeout = 30    # 重启或停止之前等待当前工作完成的时间

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov