Kaynağa Gözat

Revert "文章晋级任务 developing"

This reverts commit 778a61ff26818440e6a9bdd584763edd38b1d26e.
罗俊辉 7 ay önce
ebeveyn
işleme
4a2c054cb1
1 değiştirilmiş dosya ile 57 ekleme ve 111 silme
  1. 57 111
      stratrgy/upLevel.py

+ 57 - 111
stratrgy/upLevel.py

@@ -10,75 +10,6 @@ from applications import longArticlesMySQL
 lam = longArticlesMySQL()
 
 
-def read_rate_debias(row):
-    """
-    阅读均值倍数通过头条消偏
-    :param row:
-    :return:
-    """
-    if row["位置"] != 1:
-        return row["阅读量"] / (
-                max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
-        )
-    else:
-        return row["阅读均值倍数"]
-
-
-def filter_same_title(df):
-    """
-    通过标题过滤 dataframe
-    :param df:
-    :return:
-    """
-
-    def title_sim_v2_by_title_list(title_target, title_list, threshold=0.75):
-        """
-        :param title_target:
-        :param title_list:
-        :param threshold:
-        :return:
-        """
-
-        def title_sim_v2(title_a, title_b, threshold=0.75):
-            """
-            标题相似度
-            :param title_a:
-            :param title_b:
-            :param threshold:
-            :return:
-            """
-            if len(title_a) < 1 or len(title_b) < 1:
-                return False
-            set_a = set(title_a)
-            set_b = set(title_b)
-            set_cross = set_a & set_b
-            set_union = set_a | set_b
-            if not set_union:
-                return False
-            min_len = max(min(len(set_a), len(set_b)), 1)
-            rate = len(set_cross) / min_len
-            if rate >= threshold:
-                return True
-            else:
-                return False
-
-        for title in title_list:
-            sim_score = title_sim_v2(title_target, title, threshold=threshold)
-            if sim_score:
-                return title
-        return False
-
-    visited_titles = []
-    data = []
-    for x in df.to_dict(orient='records'):
-        title = x['标题']
-        if title_sim_v2_by_title_list(title, visited_titles):
-            continue
-        visited_titles.append(title)
-        data.append(x)
-    return DataFrame(data)
-
-
 class articleLevelUp(object):
     """
     文章晋级
@@ -236,9 +167,24 @@ class articleLevelUp(object):
         }
     }
 
-    def get_base_data(self):
+    @classmethod
+    def readRateDebias(cls, row):
         """
-        从数据文章数据
+        阅读均值倍数通过头条消偏
+        :param row:
+        :return:
+        """
+        if row["位置"] != 1:
+            return row["阅读量"] / (
+                    max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
+            )
+        else:
+            return row["阅读均值倍数"]
+
+    @classmethod
+    def getBaseData(cls):
+        """
+
         :return:
         """
         sql = f"""
@@ -248,23 +194,24 @@ class articleLevelUp(object):
             datastat_sort_strategy;
         """
         response = lam.select(sql)
-        df = DataFrame(response, columns=self.columns)
+        df = DataFrame(response, columns=cls.columns)
         df = df.sort_values(by=["阅读均值倍数"], ascending=[False]).reset_index(drop=True)
         df = df[df["粉丝量"] > 10000].reset_index(drop=True)
         return df
 
-    def analysis_data(self, index_list):
+    @classmethod
+    def analysisDF(cls, indexList):
         """
         分析 dataframe 中数据占比
         :return:
         """
-        df = self.get_base_data()
-        # 筛选指定位置的文章
-        df = df[(df["位置"].isin(index_list))]
-        avg_read_times = df['阅读均值倍数'].sort_values(ascending=False)
-        read_rate = df['阅读率'].sort_values(ascending=False)
-        mini_open_rate = df['小程序打开率'].sort_values(ascending=False)
-        t_plus_0_fission = df['T+0裂变率'].sort_values(ascending=False)
+        DF = cls.getBaseData()
+        DF = DF[(DF["位置"].isin(indexList))]
+        print(len(DF))
+        avg_read_times = DF['阅读均值倍数'].sort_values(ascending=False)
+        read_rate = DF['阅读率'].sort_values(ascending=False)
+        mini_open_rate = DF['小程序打开率'].sort_values(ascending=False)
+        t_plus_0_fission = DF['T+0裂变率'].sort_values(ascending=False)
         detail = {
             "阅读均值倍数": {
                 "mean": avg_read_times.mean(),
@@ -313,38 +260,37 @@ class articleLevelUp(object):
         }
         print(json.dumps(detail, ensure_ascii=False, indent=4))
 
-    def find_good_articles(self, df, pool_level, index_list, read_count, read_avg_times):
+    @classmethod
+    def upLevel38To2(cls):
         """
-        获取已经发布文章中,认为是质量好的文章
-        :param read_avg_times: 阅读均值倍数
-        :param read_count: 阅读量
-        :param df: 查询出来的 df
-        :param pool_level: 流量池层级
-        :param index_list: 文章位置
         :return:
         """
-        good_articles = df[
-            (df['阅读量'] >= read_count) &
-            (df['阅读均值倍数'] >= read_avg_times) &
-            (df['位置'].isin(index_list))
+        dataThreeToEight = cls.getBaseData()
+        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([3, 4, 5, 6, 7, 8])]
+        filter_data = dataThreeToEight[
+            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['95%'])
+            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['80%'])
             ]
-        distinct_good_articles = filter_same_title(good_articles)
-        sorted_distinct_good_articles = distinct_good_articles.sort_values(by=['发布日期'],
-                                                                           ascending=[False]).reset_index(drop=True)
-        print(
-            "流量池 level: {} 中,去重后一共 {} 篇优质文章".format(pool_level, len(sorted_distinct_good_articles.index)))
-        url_list = []
-        title_list = []
-        for x in sorted_distinct_good_articles.to_dict(orient='records'):
-            url_list.append(x['链接'])
-            title_list.append(x['标题'])
-        add_url_list_to_account(
-            account_nickname,
-            url_list,
-            title_list,
-            pos,
-            way,
-            plan_key=plan_key,
-            tag=tag,
-            debug=debug,
-        )
+        return filter_data
+
+    @classmethod
+    def upLevel2To1(cls):
+        """
+        :return:
+        """
+        dataThreeToEight = cls.getBaseData()
+        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([2])]
+        filter_data = dataThreeToEight[
+            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['90%'])
+            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['90%'])
+            ]
+        return filter_data
+
+
+U = articleLevelUp()
+U.analysisDF(indexList=[1])
+f_d = U.upLevel2To1()
+for line in list(zip(f_d['标题'], f_d['链接'])):
+    print(line[0])
+    print(line[1])
+    print("\n")