
Article promotion task: developing

罗俊辉, 8 months ago
commit 778a61ff26
1 changed file with 111 additions and 57 deletions

stratrgy/upLevel.py  +111 -57

@@ -10,6 +10,75 @@ from applications import longArticlesMySQL
 lam = longArticlesMySQL()


+def read_rate_debias(row):
+    """
+    阅读均值倍数通过头条消偏
+    :param row:
+    :return:
+    """
+    if row["位置"] != 1:
+        return row["阅读量"] / (
+                max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
+        )
+    else:
+        return row["阅读均值倍数"]
+
+
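For illustration, a minimal sketch of the debiasing on a non-headline row; the sample values below are invented:

row = {
    "位置": 2,             # article position within the day's publication
    "阅读量": 3000,         # reads of this article
    "阅读均值": 2000,       # account's average reads at this position
    "头条阅读量": 12000,    # reads of that day's headline article
    "头条阅读均值": 8000,   # account's average headline reads
    "阅读均值倍数": 1.5,    # precomputed multiplier, returned as-is for position 1
}
# headline factor: max(1.0, 12000 / 8000) = 1.5
# debiased multiplier: 3000 / (1.5 * 2000) = 1.0
print(read_rate_debias(row))  # 1.0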
+def filter_same_title(df):
+    """
+    Filter a dataframe by dropping near-duplicate titles
+    :param df:
+    :return:
+    """
+
+    def title_sim_v2_by_title_list(title_target, title_list, threshold=0.75):
+        """
+        :param title_target:
+        :param title_list:
+        :param threshold:
+        :return:
+        """
+
+        def title_sim_v2(title_a, title_b, threshold=0.75):
+            """
+            Title similarity: character overlap relative to the shorter title
+            :param title_a:
+            :param title_b:
+            :param threshold:
+            :return:
+            """
+            if len(title_a) < 1 or len(title_b) < 1:
+                return False
+            set_a = set(title_a)
+            set_b = set(title_b)
+            set_cross = set_a & set_b
+            min_len = max(min(len(set_a), len(set_b)), 1)
+            rate = len(set_cross) / min_len
+            return rate >= threshold
+
+        for title in title_list:
+            sim_score = title_sim_v2(title_target, title, threshold=threshold)
+            if sim_score:
+                return title
+        return False
+
+    visited_titles = []
+    data = []
+    for x in df.to_dict(orient='records'):
+        title = x['标题']
+        if title_sim_v2_by_title_list(title, visited_titles):
+            continue
+        visited_titles.append(title)
+        data.append(x)
+    return DataFrame(data)
+
+
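A quick sketch of the de-duplication in action, assuming the module imports DataFrame from pandas; the sample titles and URLs are invented:

from pandas import DataFrame

df = DataFrame([
    {"标题": "夏天养生的五个小技巧", "链接": "https://example.com/a"},
    {"标题": "夏天养生的五个小妙招", "链接": "https://example.com/b"},  # shares 8 of 10 characters with the first
    {"标题": "冬季取暖注意事项", "链接": "https://example.com/c"},
])
print(len(filter_same_title(df)))  # 2 -- the second title exceeds the 0.75 overlap threshold and is dropped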
 class articleLevelUp(object):
     """
     Article promotion
@@ -167,24 +236,9 @@ class articleLevelUp(object):
         }
     }

-    @classmethod
-    def readRateDebias(cls, row):
+    def get_base_data(self):
         """
-        Debias the read-average multiplier using the headline article
-        :param row:
-        :return:
-        """
-        if row["位置"] != 1:
-            return row["阅读量"] / (
-                    max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
-            )
-        else:
-            return row["阅读均值倍数"]
-
-    @classmethod
-    def getBaseData(cls):
-        """
-
+        Fetch article data from the database
         :return:
         """
         sql = f"""
@@ -194,24 +248,23 @@ class articleLevelUp(object):
             datastat_sort_strategy;
         """
         response = lam.select(sql)
-        df = DataFrame(response, columns=cls.columns)
+        df = DataFrame(response, columns=self.columns)
         df = df.sort_values(by=["阅读均值倍数"], ascending=[False]).reset_index(drop=True)
         df = df[df["粉丝量"] > 10000].reset_index(drop=True)
         return df

-    @classmethod
-    def analysisDF(cls, indexList):
+    def analysis_data(self, index_list):
         """
         """
         分析 dataframe 中数据占比
         分析 dataframe 中数据占比
         :return:
         :return:
         """
         """
-        DF = cls.getBaseData()
-        DF = DF[(DF["位置"].isin(indexList))]
-        print(len(DF))
-        avg_read_times = DF['阅读均值倍数'].sort_values(ascending=False)
-        read_rate = DF['阅读率'].sort_values(ascending=False)
-        mini_open_rate = DF['小程序打开率'].sort_values(ascending=False)
-        t_plus_0_fission = DF['T+0裂变率'].sort_values(ascending=False)
+        df = self.get_base_data()
+        # filter to articles at the specified positions
+        df = df[(df["位置"].isin(index_list))]
+        avg_read_times = df['阅读均值倍数'].sort_values(ascending=False)
+        read_rate = df['阅读率'].sort_values(ascending=False)
+        mini_open_rate = df['小程序打开率'].sort_values(ascending=False)
+        t_plus_0_fission = df['T+0裂变率'].sort_values(ascending=False)
         detail = {
             "阅读均值倍数": {
                 "mean": avg_read_times.mean(),
@@ -260,37 +313,38 @@ class articleLevelUp(object):
         }
         print(json.dumps(detail, ensure_ascii=False, indent=4))

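The body of the detail dict is truncated by the hunk, but each metric block pairs a mean with percentile cut-offs (the 80% / 90% / 95% keys referenced by statMapThreeToEight below). A hedged sketch of one way such a block could be computed with pandas; summarize is a hypothetical helper, not part of the commit:

import pandas as pd

def summarize(series: pd.Series) -> dict:
    # mean plus the percentile thresholds used by the promotion rules
    return {
        "mean": series.mean(),
        "80%": series.quantile(0.8),
        "90%": series.quantile(0.9),
        "95%": series.quantile(0.95),
    }

# detail["阅读均值倍数"] would then be summarize(df["阅读均值倍数"])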
-    @classmethod
-    def upLevel38To2(cls):
+    def find_good_articles(self, df, pool_level, index_list, read_count, read_avg_times,
+                           account_nickname, pos, way, plan_key=None, tag=None, debug=False):
         """
         """
+        获取已经发布文章中,认为是质量好的文章
+        :param read_avg_times: 阅读均值倍数
+        :param read_count: 阅读量
+        :param df: 查询出来的 df
+        :param pool_level: 流量池层级
+        :param index_list: 文章位置
         :return:
         :return:
         """
         """
-        dataThreeToEight = cls.getBaseData()
-        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([3, 4, 5, 6, 7, 8])]
-        filter_data = dataThreeToEight[
-            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['95%'])
-            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['80%'])
+        good_articles = df[
+            (df['阅读量'] >= read_count) &
+            (df['阅读均值倍数'] >= read_avg_times) &
+            (df['位置'].isin(index_list))
             ]
-        return filter_data
-
-    @classmethod
-    def upLevel2To1(cls):
-        """
-        :return:
-        """
-        dataThreeToEight = cls.getBaseData()
-        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([2])]
-        filter_data = dataThreeToEight[
-            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['90%'])
-            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['90%'])
-            ]
-        return filter_data
-
-
-U = articleLevelUp()
-U.analysisDF(indexList=[1])
-f_d = U.upLevel2To1()
-for line in list(zip(f_d['标题'], f_d['链接'])):
-    print(line[0])
-    print(line[1])
-    print("\n")
+        distinct_good_articles = filter_same_title(good_articles)
+        sorted_distinct_good_articles = distinct_good_articles.sort_values(by=['发布日期'],
+                                                                           ascending=[False]).reset_index(drop=True)
+        print(
+            "流量池 level: {} 中,去重后一共 {} 篇优质文章".format(pool_level, len(sorted_distinct_good_articles.index)))
+        url_list = []
+        title_list = []
+        for x in sorted_distinct_good_articles.to_dict(orient='records'):
+            url_list.append(x['链接'])
+            title_list.append(x['标题'])
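+        # NOTE: add_url_list_to_account is assumed to be imported elsewhere in
+        # this module; its import does not appear in this hunk.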
+        add_url_list_to_account(
+            account_nickname,
+            url_list,
+            title_list,
+            pos,
+            way,
+            plan_key=plan_key,
+            tag=tag,
+            debug=debug,
+        )
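
For context, a sketch of how the new method might be driven end to end. Every argument value here is a placeholder, and the keyword parameters follow the signature fix above rather than the original commit:

up = articleLevelUp()
base_df = up.get_base_data()
up.find_good_articles(
    df=base_df,
    pool_level=3,
    index_list=[3, 4, 5, 6, 7, 8],
    read_count=1000,          # placeholder read-count threshold
    read_avg_times=1.3,       # placeholder multiplier threshold
    account_nickname="example_account",
    pos=1,
    way="strategy",
    plan_key="demo_plan",
    tag="upLevel",
    debug=True,
)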