vor 11 Monaten · 4f1e08f108
--- a/applications/__init__.py
+++ b/applications/__init__.py
@@ -3,6 +3,7 @@
 
				 """
			
 
				 from .aidit_api import AIDTApi
			
 
				 from .denet_mysql import DeNetMysql
			
 
				+from .ad_mysql import AdMySQL
			
 
				 from .pq_mysql import PQMySQL
			
 
				 from .functions import Functions
			
 
				 from .data_works import ODPSApi
			
--- a/applications/ad_mysql.py
+++ b/applications/ad_mysql.py
@@ -0,0 +1,42 @@
 
				+"""
			
 
				+@author: luojunhui
			
 
				+"""
			
 
				+import pymysql
			
 
				+
			
 
				+
			
 
				+class AdMySQL(object):
				+    """
				+    Query helper for the ad platform (``adplatform``) MySQL database.
				+
				+    One pymysql connection is opened when this class body is executed
				+    (i.e. at import time) and is shared by every call via ``cls.connection``.
				+    """
				+    # NOTE(review): host, user and password are hard-coded in source control;
				+    # they should be moved to configuration / environment and rotated.
				+    connection = pymysql.connect(
				+        host='rm-bp12k5fuh5zyx31d28o.mysql.rds.aliyuncs.com',
				+        port=3306,
				+        user='wx2023_ad',
				+        password='wx2023_adP@assword1234',
				+        db='adplatform',
				+        charset='utf8mb4'
				+    )
				+
				+    @classmethod
				+    def select(cls, sql, params=None):
				+        """
				+        Run a query and return every row.
				+
				+        :param sql: SQL text; use ``%s`` placeholders for values
				+        :param params: optional values bound to the placeholders by the driver;
				+            when omitted the SQL text is sent unchanged
				+        :return: all rows, as returned by ``cursor.fetchall()``
				+        """
				+        # The shared connection lives as long as the process, so revive it
				+        # if the server dropped it while it sat idle.
				+        cls.connection.ping(reconnect=True)
				+        # The context manager closes the cursor even when execute() raises.
				+        with cls.connection.cursor() as cursor:
				+            cursor.execute(sql, params)
				+            return cursor.fetchall()
			
 
				+
			
 
				+
			
--- a/applications/functions.py
+++ b/applications/functions.py
@@ -51,3 +51,72 @@ class Functions(object):
 
				         cursor.execute(sql)
			
 
				         article_link = cursor.fetchone()
			
 
				         return article_link
			
 
				+
			
 
				+    @classmethod
				+    def matchLinkByIdTuple(cls, channel_id_tuple):
				+        """
				+        Look up article details for a batch of ``changwen_article`` ids.
				+
				+        :param channel_id_tuple: iterable of ``changwen_article.id`` values
				+        :return: dict mapping each id found to
				+            ``{"gh_key": "<account_id>_<item_index>", "url": link, "title": title}``;
				+            ids without a matching row are absent from the dict
				+        """
				+        id_list = list(channel_id_tuple)
				+        # "in ()" is invalid SQL, so an empty batch needs no round trip.
				+        if not id_list:
				+            return {}
				+        # One %s per id: the driver escapes every value, and a one-element
				+        # batch no longer renders as the invalid "(5,)" that tuple repr gave.
				+        placeholders = ", ".join(["%s"] * len(id_list))
				+        sql = (
				+            "select id, account_id, link, item_index, title "
				+            f"from changwen_article where id in ({placeholders});"
				+        )
				+        # NOTE(review): host, user and password are hard-coded in source control;
				+        # they should be moved to configuration / environment and rotated.
				+        connection = pymysql.connect(
				+            host='rm-bp12k5fuh5zyx31d28o.mysql.rds.aliyuncs.com',
				+            port=3306,
				+            user='wx2023_ad',
				+            password='wx2023_adP@assword1234',
				+            db='adplatform',
				+            charset='utf8mb4'
				+        )
				+        try:
				+            with connection.cursor() as cursor:
				+                cursor.execute(sql, id_list)
				+                rows = cursor.fetchall()
				+        finally:
				+            # The connection is opened per call, so release it on every path.
				+            connection.close()
				+        return {
				+            article_id: {
				+                "gh_key": "{}_{}".format(account_id, item_index),
				+                "url": link,
				+                "title": title
				+            }
				+            for article_id, account_id, link, item_index, title in rows
				+        }
			
 
				+
			
 
				+    @classmethod
				+    def TitleSimilarity(cls, title_list, target_title, threshold=0.8):
				+        """
				+        Tell whether ``target_title`` is similar to any title in ``title_list``.
				+
				+        Two titles count as similar when the number of distinct characters
				+        they share, divided by the smaller of their two distinct-character
				+        counts, is at least ``threshold``.
				+
				+        :param title_list: iterable of titles to compare against
				+        :param target_title: the title being checked
				+        :param threshold: minimum overlap ratio; 0.8 is the value that was
				+            previously hard-coded
				+        :return: True if at least one title is similar, otherwise False
				+        """
				+        # An empty (or missing) target has nothing to compare against.
				+        if not target_title:
				+            return False
				+        # Built once: the target does not change across the loop.
				+        target_chars = set(target_title)
				+
				+        def is_similar(title):
				+            """Return True when ``title`` overlaps the target enough."""
				+            if not title:
				+                return False
				+            title_chars = set(title)
				+            shared = len(target_chars & title_chars)
				+            # Both sets are non-empty here, so the divisor is at least 1.
				+            smaller = min(len(target_chars), len(title_chars))
				+            return shared / smaller >= threshold
				+
				+        return any(is_similar(title) for title in title_list)
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/longArticleJob.py
+++ b/longArticleJob.py
@@ -0,0 +1,29 @@
 
				+"""
			
 
				+@author: luojunhui
			
 
				+"""
			
 
				+from tasks import *
			
 
				+
			
 
				+
			
 
				+class Job(object):
				+    """
				+    Entry points for the long-article jobs.
				+    """
				+    @classmethod
				+    def initColdPool(cls):
				+        """
				+        Initialize the cold-start pool (meant to be run once a day).
				+
				+        :return: None
				+        """
				+        ColdStartPool().deal()
				+
				+    @classmethod
				+    def sendToLevel3(cls):
				+        """
				+        Publish the crawl plan to the level-3 pool according to the ratio.
				+
				+        :return: None
				+        """
				+        # NOTE(review): sendToColdPool() is called with no arguments; the
				+        # __main__ block removed from tasks/task2.py passed plan_id, plan_name
				+        # and plan_tag, so confirm those parameters have defaults.
				+        ColdStartTask().sendToColdPool()
			
 
				+
			
 
				+
			
--- a/stratrgy/strategy.py
+++ b/stratrgy/strategy.py
@@ -19,17 +19,16 @@ class ArticlePoolStrategy(object):
 
				     def getData(cls, article_list):
			
 
				         """
			
 
				         :param article_list: 每天召回的文章list
			
 
				-        :return: {
			
 
				-            "Level1": [],
			
 
				-            "Level2": [],
			
 
				-            "Level3": []
			
 
				-        }
			
 
				         """
			
 
				         detail_list = []
			
 
				         print("查询文章url......")
			
 
				+        id_tuple = [i['id'] for i in article_list]
			
 
				+        detail_dict = cls.Fun.matchLinkByIdTuple(channel_id_tuple=tuple(id_tuple))
			
 
				         for i in tqdm(article_list):
			
 
				-            detail = cls.Fun.matchLinkById(i['article_id'])
			
 
				-            i['gh_id'], i['url'], i['index'] = detail
			
 
				+            content_id = i['id']
			
 
				+            i['gh_key'] = detail_dict[content_id]['gh_key']
			
 
				+            i['url'] = detail_dict[content_id]['url']
			
 
				+            i['title'] = detail_dict[content_id]['title']
			
 
				             detail_list.append(i)
			
 
				         print("查询完成, 开始排序")
			
 
				         return detail_list
			
@@ -42,8 +41,8 @@ class ArticlePoolStrategy(object):
 
				         """
			
 
				         L = []
			
 
				         for line in detail_list:
			
 
				-            key = "{}_{}".format(line['gh_id'], line['index'])
			
 
				-            article_read = line['increase_read_count']
			
 
				+            key = line['gh_key']
			
 
				+            article_read = line['read_count']
			
 
				             if accountBaseInfo.get(key):
			
 
				                 avg_read = accountBaseInfo[key]['readAvg']
			
 
				                 # 计算比率
			
@@ -53,33 +52,47 @@ class ArticlePoolStrategy(object):
 
				                     "avg_read": avg_read,
			
 
				                     "article_read": article_read,
			
 
				                     "level_rate": level_rate,
			
 
				-                    "url": line['url']
			
 
				+                    "url": line['url'],
			
 
				+                    "title": line['title']
			
 
				                 }
			
 
				                 L.append(obj)
			
 
				         L = sorted(L, key=lambda x: x["level_rate"], reverse=True)
			
 
				-        for index, i in enumerate(L):
			
 
				-            print(index,"\t", i['key'], "\t", i['level_rate'])
			
 
				         result = {
			
 
				             "Level1": [],
			
 
				             "Level2": []
			
 
				         }
			
 
				-        c = 0
			
 
				+        c1 = 0
			
 
				+        c2 = 0
			
 
				         for line in L:
			
 
				-            print(json.dumps(line, ensure_ascii=False, indent=4))
			
 
				-            if line['level_rate'] > 0.2:
			
 
				-                c += 1
			
 
				-                gh_key = line['key']
			
 
				-                if pool_level_detail.get(gh_key):
			
 
				-                    now_level = pool_level_detail[gh_key]
			
 
				-                    if now_level == "3":
			
 
				-                        result['Level2'].append(line['url'])
			
 
				-                    elif now_level == "2":
			
 
				-                        result['Level1'].append(line['url'])
			
 
				-                    else:
			
 
				-                        continue
			
 
				-                else:
			
 
				-                    result['Level2'].append(line['url'])
			
 
				-        print(c)
			
 
				+            gh_key = line['key']
			
 
				+            if pool_level_detail.get(gh_key):
			
 
				+                now_level = pool_level_detail[gh_key]
			
 
				+            else:
			
 
				+                now_level = "3"
			
 
				+            match now_level:
			
 
				+                case "2":
			
 
				+                    # 往1层升
			
 
				+                    if line['level_rate'] > 0.1 and line['avg_read'] >= 1000:
			
 
				+                        now_title_list = [i['title'] for i in result['Level1']]
			
 
				+                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
			
 
				+                            continue
			
 
				+                        else:
			
 
				+                            result['Level1'].append(line)
			
 
				+                case "3":
			
 
				+                    if line['level_rate'] > 0.1 and line['avg_read'] >= 100:
			
 
				+                        now_title_list = [i['title'] for i in result['Level2']]
			
 
				+                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
			
 
				+                            continue
			
 
				+                        else:
			
 
				+                            result['Level2'].append(line)
			
 
				+                case "1":
			
 
				+                    continue
			
 
				+                    # if line['level_rate'] > 1.0:
			
 
				+                    #     now_title_list = [i['title'] for i in result['Level1']]
			
 
				+                    #     if cls.Fun.TitleSimilarity(now_title_list, line['title']):
			
 
				+                    #         continue
			
 
				+                    #     else:
			
 
				+                    #         result['Level1'].append(line)
			
 
				         return result
			
 
				 
			
 
				 
			
--- a/tasks/__init__.py
+++ b/tasks/__init__.py
@@ -0,0 +1,8 @@
 
				+"""
			
 
				+@author: luojunhui
			
 
				+"""
			
 
				+from .task1 import ColdStartPool
			
 
				+from .task2 import ColdStartTask
			
 
				+from .task3 import SendToMultiLevels
			
 
				+from .task4 import update_articles
			
 
				+from .task5 import AccountArticleProducer
			
--- a/tasks/task1.py
+++ b/tasks/task1.py
@@ -1,8 +1,6 @@
 
				 """
			
 
				 @author: luojunhui
			
 
				 """
			
 
				-import json
			
 
				-
			
 
				 from tqdm import tqdm
			
 
				 
			
 
				 from applications import AIDTApi, DeNetMysql, PQMySQL
			
@@ -87,8 +85,4 @@ class ColdStartPool(object):
 
				         """
			
 
				         plan_id_list = cls.DeMysql.getUnEmptyPlan()
			
 
				         for plan_id in tqdm(plan_id_list):
			
 
				-            cls.updateToPool(plan_id)
			
 
				-
			
 
				-
			
 
				-CST = ColdStartPool()
			
 
				-CST.deal()
			
 
				+            cls.updateToPool(plan_id)
			
--- a/tasks/task2.py
+++ b/tasks/task2.py
@@ -146,12 +146,3 @@ class ColdStartTask(object):
 
				             plan_tag=plan_tag,
			
 
				             url_list=[i['url'] for i in target_article_list]
			
 
				         )
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    CST = ColdStartTask()
			
 
				-    CST.sendToColdPool(
			
 
				-        plan_id=None,
			
 
				-        plan_name="冷启池子--0730--Monday--分品类抓取--6个品类",
			
 
				-        plan_tag="autoArticlePoolLevel3",
			
 
				-    )
			
--- a/tasks/task3.py
+++ b/tasks/task3.py
@@ -5,7 +5,7 @@ import datetime
 
				 
			
 
				 from tqdm import tqdm
			
 
				 
			
 
				-from applications import AIDTApi, DeNetMysql, PQMySQL, Functions, ODPSApi
			
 
				+from applications import AIDTApi, DeNetMysql, PQMySQL, Functions, AdMySQL
			
 
				 from config import poolTagMap
			
 
				 from stratrgy import ArticlePoolStrategy
			
 
				 
			
@@ -18,7 +18,7 @@ class SendToMultiLevels(object):
 
				     DeMysql = DeNetMysql()
			
 
				     PqMysql = PQMySQL()
			
 
				     Fun = Functions()
			
 
				-    OA = ODPSApi()
			
 
				+    Ad = AdMySQL()
			
 
				 
			
 
				     @classmethod
			
 
				     def getYesterdayData(cls):
			
@@ -26,19 +26,19 @@ class SendToMultiLevels(object):
 
				         获取前一天数据表现
			
 
				         :return:
			
 
				         """
			
 
				-        odps_sql = "select * from loghubods.changwen_article_datastat where dt = '20240729';"
			
 
				-        result = cls.OA.select(sql=odps_sql)
			
 
				+        sql = f"""
			
 
				+        select article_id, read_count from changwen_article_datastat
			
 
				+        where article_id in (
			
 
				+            select id from changwen_article
			
 
				+            where publish_timestamp >= 1721664000000
			
 
				+        ) and read_count > 100;
			
 
				+        """
			
 
				+        result = cls.Ad.select(sql=sql)
			
 
				         response_list = [
			
 
				             {
			
 
				-                "article_id": record["article_id"],
			
 
				-                "increase_read_count": record["increase_read_count"],
			
 
				-                "read_count": record["read_count"],
			
 
				-                "increase_income": record["increase_income"],
			
 
				-                "income": record["income"],
			
 
				-                "increase_share_count": record["increase_share_count"],
			
 
				-                "share_count": record["share_count"],
			
 
				-                "update_timestamp": record["update_timestamp"]
			
 
				-            } for record in result if record['increase_read_count'] >= 1000
			
 
				+                "id": line[0],
			
 
				+                "read_count": line[1]
			
 
				+            } for line in result
			
 
				         ]
			
 
				         return response_list
			
 
				 
			
@@ -54,23 +54,25 @@ class SendToMultiLevels(object):
 
				         return result
			
 
				 
			
 
				     @classmethod
			
 
				-    def sendToEachCrawlerPlan(cls, key, url_list):
			
 
				+    def sendToEachCrawlerPlan(cls, key, result_list):
			
 
				         """
			
 
				 
			
 
				+        :param result_list:
			
 
				         :param key:
			
 
				-        :param url_list:
			
 
				         :return:
			
 
				         """
			
 
				-        print(key)
			
 
				-        print(len(url_list))
			
 
				-        print(url_list)
			
 
				+        # print(key)
			
 
				+        # print(len(result_list))
			
 
				+        # for index, i in enumerate(result_list):
			
 
				+        #     print(index, "\t",  i['level_rate'], "\t", i['title'], "\t", i['avg_read'], "\t", i['article_read'], "\t", i['key'])
			
 
				+        # print(url_list)
			
 
				         # daily自动创建新抓取计划
			
 
				-        # cls.AidApi.updateArticleIntoCrawlerPlan(
			
 
				-        #     plan_id=None,
			
 
				-        #     plan_name="{}--{}".format(datetime.datetime.today().__str__().split(" ")[0], key),
			
 
				-        #     plan_tag=poolTagMap[key],
			
 
				-        #     url_list=url_list
			
 
				-        # )
			
 
				+        cls.AidApi.updateArticleIntoCrawlerPlan(
			
 
				+            plan_id=None,
			
 
				+            plan_name="流量池晋级--{}--{}".format(datetime.datetime.today().__str__().split(" ")[0], key),
			
 
				+            plan_tag=poolTagMap[key],
			
 
				+            url_list=[i['url'] for i in result_list]
			
 
				+        )
			
 
				 
			
 
				     @classmethod
			
 
				     def sendToDifferentPools(cls, pool_info):
			
@@ -89,9 +91,14 @@ class SendToMultiLevels(object):
 
				         """
			
 
				         yesterday_data = cls.getYesterdayData()
			
 
				         level_url_list_map = cls.splitToDifferentPools(yesterday_data)
			
 
				+        # for line in level_url_list_map:
			
 
				+        #     print(line)
			
 
				         cls.sendToDifferentPools(pool_info=level_url_list_map)
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    ST = SendToMultiLevels()
			
 
				-    ST.deal()
			
 
				+    S = SendToMultiLevels()
			
 
				+    S.deal()
			
 
				+    # yesterday_data = S.getYesterdayData()
			
 
				+    # for line in tqdm(yesterday_data):
			
 
				+    #     print(line)