罗俊辉 8 месяцев назад
Родитель
Сommit
4f1e08f108
9 измененных файлов с 224 добавлено и 70 удалено
  1. 1 0
      applications/__init__.py
  2. 42 0
      applications/ad_mysql.py
  3. 69 0
      applications/functions.py
  4. 29 0
      longArticleJob.py
  5. 41 28
      stratrgy/strategy.py
  6. 8 0
      tasks/__init__.py
  7. 1 7
      tasks/task1.py
  8. 0 9
      tasks/task2.py
  9. 33 26
      tasks/task3.py

+ 1 - 0
applications/__init__.py

@@ -3,6 +3,7 @@
 """
 from .aidit_api import AIDTApi
 from .denet_mysql import DeNetMysql
+from .ad_mysql import AdMySQL
 from .pq_mysql import PQMySQL
 from .functions import Functions
 from .data_works import ODPSApi

+ 42 - 0
applications/ad_mysql.py

@@ -0,0 +1,42 @@
+"""
+@author: luojunhui
+"""
+import pymysql
+
+
+class AdMySQL(object):
+    """
+    Query helper for the `adplatform` MySQL database.
+
+    A single pymysql connection is opened when the class body is executed
+    (i.e. when the module is imported) and is shared by every call made
+    through the class.
+    """
+    # NOTE(review): host and credentials are hard-coded in source control;
+    # they should be rotated and loaded from configuration instead.
+    connection = pymysql.connect(
+        host='rm-bp12k5fuh5zyx31d28o.mysql.rds.aliyuncs.com',
+        port=3306,
+        user='wx2023_ad',
+        password='wx2023_adP@assword1234',
+        db='adplatform',
+        charset='utf8mb4'
+    )
+
+    # Disabled write helper (it was already commented out):
+    # @classmethod
+    # def update(cls, sql, params):
+    #     """
+    #     Update
+    #     :return:
+    #     """
+    #     cursor = cls.connection.cursor()
+    #     cursor.execute(sql, params)
+    #     cls.connection.commit()
+
+    @classmethod
+    def select(cls, sql):
+        """
+        Run a query and return every row.
+        :param sql: complete SQL statement; it is executed as-is, without parameters
+        :return: all rows, as returned by cursor.fetchall()
+        """
+        # Close the cursor even when execute() raises; it used to be left open.
+        with cls.connection.cursor() as cursor:
+            cursor.execute(sql)
+            return cursor.fetchall()
+
+

+ 69 - 0
applications/functions.py

@@ -51,3 +51,72 @@ class Functions(object):
         cursor.execute(sql)
         article_link = cursor.fetchone()
         return article_link
+
+    @classmethod
+    def matchLinkByIdTuple(cls, channel_id_tuple):
+        """
+        Look up changwen_article rows for a batch of article ids.
+        :param channel_id_tuple: iterable of changwen_article.id values
+        :return: {id: {"gh_key": "<account_id>_<item_index>", "url": link, "title": title}}
+        """
+        id_list = list(channel_id_tuple)
+        if not id_list:
+            # "in ()" is not valid SQL, and nothing could match anyway
+            return {}
+        # NOTE(review): credentials are hard-coded; move them to configuration
+        connection = pymysql.connect(
+            host='rm-bp12k5fuh5zyx31d28o.mysql.rds.aliyuncs.com',
+            port=3306,
+            user='wx2023_ad',
+            password='wx2023_adP@assword1234',
+            db='adplatform',
+            charset='utf8mb4'
+        )
+        # One %s per id, bound by the driver: interpolating str(tuple) yields
+        # "(1,)" for a single id, which MySQL rejects.
+        marks = ", ".join(["%s"] * len(id_list))
+        sql = f"select id, account_id, link, item_index, title from changwen_article where id in ({marks});"
+        try:
+            with connection.cursor() as cursor:
+                cursor.execute(sql, id_list)
+                rows = cursor.fetchall()
+        finally:
+            # a new connection is opened on every call, so always release it
+            connection.close()
+        return {
+            row[0]: {
+                "gh_key": "{}_{}".format(row[1], row[3]),
+                "url": row[2],
+                "title": row[4]
+            }
+            for row in rows
+        }
+
+    @classmethod
+    def TitleSimilarity(cls, title_list, target_title):
+        """
+        Tell whether target_title is similar to any title in title_list.
+        :param title_list: titles to compare against
+        :param target_title: candidate title
+        :return: True as soon as one similar title is found, otherwise False
+        """
+
+        def title_sim_v2(title_a, title_b, thredhold=0.8):
+            """
+            Character-set overlap test between two titles.
+            :param title_a: first title
+            :param title_b: second title
+            :param thredhold: minimum ratio of shared characters, measured
+                against the smaller of the two character sets
+            :return: True when the ratio reaches thredhold
+            """
+            if len(title_a) < 1 or len(title_b) < 1:
+                return False
+            chars_a, chars_b = set(title_a), set(title_b)
+            # max(..., 1) keeps the divisor non-zero
+            smaller = max(min(len(chars_a), len(chars_b)), 1)
+            return len(chars_a & chars_b) / smaller >= thredhold
+
+        return any(title_sim_v2(target_title, title) for title in title_list)
+
+
+
+

+ 29 - 0
longArticleJob.py

@@ -0,0 +1,29 @@
+"""
+@author: luojunhui
+"""
+from tasks import *
+
+
+class Job(object):
+    """
+    Entry points for the long-article jobs.
+    """
+    @classmethod
+    def initColdPool(cls):
+        """
+        Initialise the cold-start pool (intended to run daily).
+        :return: None
+        """
+        ColdStartPool().deal()
+
+    @classmethod
+    def sendToLevel3(cls):
+        """
+        Publish the crawl plan to the third level according to the ratio.
+        :return: None
+        """
+        # NOTE(review): the __main__ block removed from tasks/task2.py passed
+        # plan_id / plan_name / plan_tag; confirm sendToColdPool() can be
+        # called without arguments.
+        ColdStartTask().sendToColdPool()
+
+

+ 41 - 28
stratrgy/strategy.py

@@ -19,17 +19,16 @@ class ArticlePoolStrategy(object):
     def getData(cls, article_list):
         """
         :param article_list: 每天召回的文章list
-        :return: {
-            "Level1": [],
-            "Level2": [],
-            "Level3": []
-        }
         """
         detail_list = []
         print("查询文章url......")
+        id_tuple = [i['id'] for i in article_list]
+        detail_dict = cls.Fun.matchLinkByIdTuple(channel_id_tuple=tuple(id_tuple))
         for i in tqdm(article_list):
-            detail = cls.Fun.matchLinkById(i['article_id'])
-            i['gh_id'], i['url'], i['index'] = detail
+            content_id = i['id']
+            i['gh_key'] = detail_dict[content_id]['gh_key']
+            i['url'] = detail_dict[content_id]['url']
+            i['title'] = detail_dict[content_id]['title']
             detail_list.append(i)
         print("查询完成, 开始排序")
         return detail_list
@@ -42,8 +41,8 @@ class ArticlePoolStrategy(object):
         """
         L = []
         for line in detail_list:
-            key = "{}_{}".format(line['gh_id'], line['index'])
-            article_read = line['increase_read_count']
+            key = line['gh_key']
+            article_read = line['read_count']
             if accountBaseInfo.get(key):
                 avg_read = accountBaseInfo[key]['readAvg']
                 # 计算比率
@@ -53,33 +52,47 @@ class ArticlePoolStrategy(object):
                     "avg_read": avg_read,
                     "article_read": article_read,
                     "level_rate": level_rate,
-                    "url": line['url']
+                    "url": line['url'],
+                    "title": line['title']
                 }
                 L.append(obj)
         L = sorted(L, key=lambda x: x["level_rate"], reverse=True)
-        for index, i in enumerate(L):
-            print(index,"\t", i['key'], "\t", i['level_rate'])
         result = {
             "Level1": [],
             "Level2": []
         }
-        c = 0
+        c1 = 0
+        c2 = 0
         for line in L:
-            print(json.dumps(line, ensure_ascii=False, indent=4))
-            if line['level_rate'] > 0.2:
-                c += 1
-                gh_key = line['key']
-                if pool_level_detail.get(gh_key):
-                    now_level = pool_level_detail[gh_key]
-                    if now_level == "3":
-                        result['Level2'].append(line['url'])
-                    elif now_level == "2":
-                        result['Level1'].append(line['url'])
-                    else:
-                        continue
-                else:
-                    result['Level2'].append(line['url'])
-        print(c)
+            gh_key = line['key']
+            if pool_level_detail.get(gh_key):
+                now_level = pool_level_detail[gh_key]
+            else:
+                now_level = "3"
+            match now_level:
+                case "2":
+                    # 往1层升
+                    if line['level_rate'] > 0.1 and line['avg_read'] >= 1000:
+                        now_title_list = [i['title'] for i in result['Level1']]
+                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
+                            continue
+                        else:
+                            result['Level1'].append(line)
+                case "3":
+                    if line['level_rate'] > 0.1 and line['avg_read'] >= 100:
+                        now_title_list = [i['title'] for i in result['Level2']]
+                        if cls.Fun.TitleSimilarity(now_title_list, line['title']):
+                            continue
+                        else:
+                            result['Level2'].append(line)
+                case "1":
+                    continue
+                    # if line['level_rate'] > 1.0:
+                    #     now_title_list = [i['title'] for i in result['Level1']]
+                    #     if cls.Fun.TitleSimilarity(now_title_list, line['title']):
+                    #         continue
+                    #     else:
+                    #         result['Level1'].append(line)
         return result
 
 

+ 8 - 0
tasks/__init__.py

@@ -0,0 +1,8 @@
+"""
+@author: luojunhui
+"""
+from .task1 import ColdStartPool
+from .task2 import ColdStartTask
+from .task3 import SendToMultiLevels
+from .task4 import update_articles
+from .task5 import AccountArticleProducer

+ 1 - 7
tasks/task1.py

@@ -1,8 +1,6 @@
 """
 @author: luojunhui
 """
-import json
-
 from tqdm import tqdm
 
 from applications import AIDTApi, DeNetMysql, PQMySQL
@@ -87,8 +85,4 @@ class ColdStartPool(object):
         """
         plan_id_list = cls.DeMysql.getUnEmptyPlan()
         for plan_id in tqdm(plan_id_list):
-            cls.updateToPool(plan_id)
-
-
-CST = ColdStartPool()
-CST.deal()
+            cls.updateToPool(plan_id)

+ 0 - 9
tasks/task2.py

@@ -146,12 +146,3 @@ class ColdStartTask(object):
             plan_tag=plan_tag,
             url_list=[i['url'] for i in target_article_list]
         )
-
-
-if __name__ == '__main__':
-    CST = ColdStartTask()
-    CST.sendToColdPool(
-        plan_id=None,
-        plan_name="冷启池子--0730--Monday--分品类抓取--6个品类",
-        plan_tag="autoArticlePoolLevel3",
-    )

+ 33 - 26
tasks/task3.py

@@ -5,7 +5,7 @@ import datetime
 
 from tqdm import tqdm
 
-from applications import AIDTApi, DeNetMysql, PQMySQL, Functions, ODPSApi
+from applications import AIDTApi, DeNetMysql, PQMySQL, Functions, AdMySQL
 from config import poolTagMap
 from stratrgy import ArticlePoolStrategy
 
@@ -18,7 +18,7 @@ class SendToMultiLevels(object):
     DeMysql = DeNetMysql()
     PqMysql = PQMySQL()
     Fun = Functions()
-    OA = ODPSApi()
+    Ad = AdMySQL()
 
     @classmethod
     def getYesterdayData(cls):
@@ -26,19 +26,19 @@ class SendToMultiLevels(object):
         获取前一天数据表现
         :return:
         """
-        odps_sql = "select * from loghubods.changwen_article_datastat where dt = '20240729';"
-        result = cls.OA.select(sql=odps_sql)
+        sql = f"""
+        select article_id, read_count from changwen_article_datastat
+        where article_id in (
+            select id from changwen_article
+            where publish_timestamp >= 1721664000000
+        ) and read_count > 100;
+        """
+        result = cls.Ad.select(sql=sql)
         response_list = [
             {
-                "article_id": record["article_id"],
-                "increase_read_count": record["increase_read_count"],
-                "read_count": record["read_count"],
-                "increase_income": record["increase_income"],
-                "income": record["income"],
-                "increase_share_count": record["increase_share_count"],
-                "share_count": record["share_count"],
-                "update_timestamp": record["update_timestamp"]
-            } for record in result if record['increase_read_count'] >= 1000
+                "id": line[0],
+                "read_count": line[1]
+            } for line in result
         ]
         return response_list
 
@@ -54,23 +54,25 @@ class SendToMultiLevels(object):
         return result
 
     @classmethod
-    def sendToEachCrawlerPlan(cls, key, url_list):
+    def sendToEachCrawlerPlan(cls, key, result_list):
         """
 
+        :param result_list:
         :param key:
-        :param url_list:
         :return:
         """
-        print(key)
-        print(len(url_list))
-        print(url_list)
+        # print(key)
+        # print(len(result_list))
+        # for index, i in enumerate(result_list):
+        #     print(index, "\t",  i['level_rate'], "\t", i['title'], "\t", i['avg_read'], "\t", i['article_read'], "\t", i['key'])
+        # print(url_list)
         # daily自动创建新抓取计划
-        # cls.AidApi.updateArticleIntoCrawlerPlan(
-        #     plan_id=None,
-        #     plan_name="{}--{}".format(datetime.datetime.today().__str__().split(" ")[0], key),
-        #     plan_tag=poolTagMap[key],
-        #     url_list=url_list
-        # )
+        cls.AidApi.updateArticleIntoCrawlerPlan(
+            plan_id=None,
+            plan_name="流量池晋级--{}--{}".format(datetime.datetime.today().__str__().split(" ")[0], key),
+            plan_tag=poolTagMap[key],
+            url_list=[i['url'] for i in result_list]
+        )
 
     @classmethod
     def sendToDifferentPools(cls, pool_info):
@@ -89,9 +91,14 @@ class SendToMultiLevels(object):
         """
         yesterday_data = cls.getYesterdayData()
         level_url_list_map = cls.splitToDifferentPools(yesterday_data)
+        # for line in level_url_list_map:
+        #     print(line)
         cls.sendToDifferentPools(pool_info=level_url_list_map)
 
 
 if __name__ == '__main__':
-    ST = SendToMultiLevels()
-    ST.deal()
+    S = SendToMultiLevels()
+    S.deal()
+    # yesterday_data = S.getYesterdayData()
+    # for line in tqdm(yesterday_data):
+    #     print(line)