Kaynağa Gözat

Merge branch '2025-05-07-del-nouse-code' of luojunhui/LongArticlesJob into master

luojunhui 5 ay önce
ebeveyn
işleme
d1e11ec099

+ 0 - 67
coldStartTasks/crawler/weixinAccountCrawler.py

@@ -1,67 +0,0 @@
-"""
-@author: luojunhui
-输入文章连接,输出账号信息,并且把账号存储到数据库中
-"""
-import datetime
-
-from tqdm import tqdm
-from applications import WeixinSpider, longArticlesMySQL
-
-
-class weixinAccountCrawler(object):
-    """
-    weixinAccountCrawler
-    """
-
-    def __init__(self, target_url_list):
-        self.db_client = longArticlesMySQL()
-        self.spider = WeixinSpider()
-        self.url_list = target_url_list
-
-    def get_account_detail(self, url):
-        """
-        通过文章链接获取账号信息
-        :param url:
-        :return:
-        """
-        account_detail = self.spider.get_account_by_url(content_url=url)
-        account_obj = account_detail['data']['data']
-        account_name = account_obj['account_name']
-        gh_id = account_obj['wx_gh']
-        self.insert_account_into_database(account_name, gh_id)
-
-    def insert_account_into_database(self, account_name, gh_id, category=None):
-        """
-        :param category:
-        :param account_name:
-        :param gh_id:
-        :return:
-        """
-        if not category:
-            category = "daily-account-mining"
-        insert_sql = f"""
-            INSERT INTO long_articles_accounts
-            (gh_id, account_source, account_name, account_category, init_date)
-            values 
-            (%s, %s, %s, %s, %s)
-        """
-        self.db_client.update(
-            sql=insert_sql,
-            params=(gh_id, "weixin", account_name, category, datetime.date.today().__str__())
-        )
-
-    def deal(self):
-        """
-        entrance of this code
-        :return:
-        """
-        for url in tqdm(self.url_list):
-            self.get_account_detail(url)
-
-
-if __name__ == '__main__':
-    url_list = [
-        'https://mp.weixin.qq.com/s/Q9Je-eNKcHNjh8S-NqQLgg'
-    ]
-    wac = weixinAccountCrawler(url_list)
-    wac.deal()

+ 0 - 3
flow_pool/__init__.py

@@ -1,3 +0,0 @@
-"""
-@author: luojunhui
-"""

+ 0 - 296
flow_pool/upLevel.py

@@ -1,296 +0,0 @@
-"""
-@author: luojunhui
-"""
-import json
-
-from pandas import DataFrame
-from datetime import datetime
-from applications import longArticlesMySQL
-
-lam = longArticlesMySQL()
-
-
-class articleLevelUp(object):
-    """
-    文章晋级
-    """
-    columns = [
-        "位置",
-        "粉丝量",
-        "阅读量",
-        "平均阅读量",
-        "头条阅读量",
-        "头条平均阅读量",
-        "阅读均值倍数",
-        "阅读率",
-        "小程序打开率",
-        "T+0裂变率",
-        "标题",
-        "链接"
-    ]
-    statMapThreeToEight = {
-        "阅读均值倍数": {
-            "mean": 1.1388723507368606,
-            "max": 62.50000000000001,
-            "min": 0.0,
-            "median": 0.8890469416785206,
-            "75%": 1.2617516081147946,
-            "80%": 1.37797320398902,
-            "90%": 1.8733429945338946,
-            "95%": 2.6455874825730517,
-            "99%": 6.252251764489181
-        },
-        "阅读率": {
-            "mean": 0.0006051220910642054,
-            "max": 0.06252537555826228,
-            "min": 0.0,
-            "median": 0.0002241206067691894,
-            "75%": 0.0005117154674215644,
-            "80%": 0.0006449975188817015,
-            "90%": 0.001255232384471895,
-            "95%": 0.002233845658277497,
-            "99%": 0.00633843067255787
-        },
-        "小程序打开率": {
-            "mean": 0.062085135696479415,
-            "max": 1.0,
-            "min": 0.0,
-            "median": 0.045454545454545456,
-            "75%": 0.08695652173913043,
-            "80%": 0.1,
-            "90%": 0.14285714285714285,
-            "95%": 0.18518518518518517,
-            "99%": 0.310463054187192
-        },
-        "T+0裂变率": {
-            "mean": 0.35277482885383377,
-            "max": 181.0,
-            "min": 0.0,
-            "median": 0.0,
-            "75%": 0.0,
-            "80%": 0.09090909090909091,
-            "90%": 0.6666666666666666,
-            "95%": 1.5,
-            "99%": 6.0
-        }
-    }
-    statMapTwoToOne = {
-        "阅读均值倍数": {
-            "mean": 1.0242728432910957,
-            "max": 4.921632060507756,
-            "min": 0.04236315118498048,
-            "median": 0.9604958720021857,
-            "75%": 1.237352622811623,
-            "80%": 1.3131587863024974,
-            "90%": 1.5778563945144477,
-            "95%": 1.8312064951656155,
-            "99%": 2.5125234834603165
-        },
-        "阅读率": {
-            "mean": 0.0073535037464145655,
-            "max": 0.05265662356955502,
-            "min": 0.00020895172629276676,
-            "median": 0.005941952332154309,
-            "75%": 0.009324205525316574,
-            "80%": 0.010420614811741105,
-            "90%": 0.013728137204835086,
-            "95%": 0.01704242661483454,
-            "99%": 0.02622215995438508
-        },
-        "小程序打开率": {
-            "mean": 0.14893695109764848,
-            "max": 2.5,
-            "min": 0.0,
-            "median": 0.1360318513603185,
-            "75%": 0.1875,
-            "80%": 0.20230028849345147,
-            "90%": 0.25449906489537877,
-            "95%": 0.3051369784478383,
-            "99%": 0.4016107123469446
-        },
-        "T+0裂变率": {
-            "mean": 0.6465295965706923,
-            "max": 12.804878048780488,
-            "min": 0.0,
-            "median": 0.48770491803278687,
-            "75%": 0.8011363636363636,
-            "80%": 0.9144722345551121,
-            "90%": 1.317362236032163,
-            "95%": 1.792137476827772,
-            "99%": 3.277849462365585
-        }
-    }
-    firstLevelMap = {
-        "阅读均值倍数": {
-            "mean": 1.0469541000103093,
-            "max": 25.719380724649426,
-            "min": 0.037429819089207735,
-            "median": 0.9521466355025219,
-            "75%": 1.2800839124458492,
-            "80%": 1.370275508982941,
-            "90%": 1.674800845262867,
-            "95%": 1.995613204168999,
-            "99%": 2.9869225601165135
-        },
-        "阅读率": {
-            "mean": 0.016311355353310464,
-            "max": 0.7427434456928839,
-            "min": 0.0006011082360982278,
-            "median": 0.01255841121495327,
-            "75%": 0.020080845617803843,
-            "80%": 0.022950649260452458,
-            "90%": 0.03136776141996209,
-            "95%": 0.0398727631704118,
-            "99%": 0.05986584275411923
-        },
-        "小程序打开率": {
-            "mean": 0.20655535828501095,
-            "max": 0.8,
-            "min": 0.0,
-            "median": 0.19921326215228996,
-            "75%": 0.25838983436476154,
-            "80%": 0.27586206896551724,
-            "90%": 0.32290043225754594,
-            "95%": 0.3709317026683608,
-            "99%": 0.4685840031614304
-        },
-        "T+0裂变率": {
-            "mean": 0.6660929834568661,
-            "max": 46.0,
-            "min": 0.0,
-            "median": 0.5434782608695652,
-            "75%": 0.7940509083886685,
-            "80%": 0.8776439089692103,
-            "90%": 1.159075752014066,
-            "95%": 1.62348848368522,
-            "99%": 2.785400696864109
-        }
-    }
-
-    @classmethod
-    def readRateDebias(cls, row):
-        """
-        阅读均值倍数通过头条消偏
-        :param row:
-        :return:
-        """
-        if row["位置"] != 1:
-            return row["阅读量"] / (
-                    max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
-            )
-        else:
-            return row["阅读均值倍数"]
-
-    @classmethod
-    def getBaseData(cls):
-        """
-
-        :return:
-        """
-        sql = f"""
-        SELECT
-            position, fans, view_count, avg_view_count, first_view_count, first_avg_view_count, read_rate, read_fans_rate, first_read_rate, fission0_first_rate, title, link
-        FROM 
-            datastat_sort_strategy;
-        """
-        response = lam.select(sql)
-        df = DataFrame(response, columns=cls.columns)
-        df = df.sort_values(by=["阅读均值倍数"], ascending=[False]).reset_index(drop=True)
-        df = df[df["粉丝量"] > 10000].reset_index(drop=True)
-        return df
-
-    @classmethod
-    def analysisDF(cls, indexList):
-        """
-        分析 dataframe 中数据占比
-        :return:
-        """
-        DF = cls.getBaseData()
-        DF = DF[(DF["位置"].isin(indexList))]
-        print(len(DF))
-        avg_read_times = DF['阅读均值倍数'].sort_values(ascending=False)
-        read_rate = DF['阅读率'].sort_values(ascending=False)
-        mini_open_rate = DF['小程序打开率'].sort_values(ascending=False)
-        t_plus_0_fission = DF['T+0裂变率'].sort_values(ascending=False)
-        detail = {
-            "阅读均值倍数": {
-                "mean": avg_read_times.mean(),
-                "max": avg_read_times.max(),
-                "min": avg_read_times.min(),
-                "median": avg_read_times.median(),
-                "75%": avg_read_times.quantile(0.75),
-                "80%": avg_read_times.quantile(0.8),
-                "90%": avg_read_times.quantile(0.9),
-                "95%": avg_read_times.quantile(0.95),
-                "99%": avg_read_times.quantile(0.99)
-            },
-            "阅读率": {
-                "mean": read_rate.mean(),
-                "max": read_rate.max(),
-                "min": read_rate.min(),
-                "median": read_rate.median(),
-                "75%": read_rate.quantile(0.75),
-                "80%": read_rate.quantile(0.8),
-                "90%": read_rate.quantile(0.9),
-                "95%": read_rate.quantile(0.95),
-                "99%": read_rate.quantile(0.99)
-            },
-            "小程序打开率": {
-                "mean": mini_open_rate.mean(),
-                "max": mini_open_rate.max(),
-                "min": mini_open_rate.min(),
-                "median": mini_open_rate.median(),
-                "75%": mini_open_rate.quantile(0.75),
-                "80%": mini_open_rate.quantile(0.8),
-                "90%": mini_open_rate.quantile(0.9),
-                "95%": mini_open_rate.quantile(0.95),
-                "99%": mini_open_rate.quantile(0.99)
-            },
-            "T+0裂变率": {
-                "mean": t_plus_0_fission.mean(),
-                "max": t_plus_0_fission.max(),
-                "min": t_plus_0_fission.min(),
-                "median": t_plus_0_fission.median(),
-                "75%": t_plus_0_fission.quantile(0.75),
-                "80%": t_plus_0_fission.quantile(0.8),
-                "90%": t_plus_0_fission.quantile(0.9),
-                "95%": t_plus_0_fission.quantile(0.95),
-                "99%": t_plus_0_fission.quantile(0.99)
-            }
-        }
-        print(json.dumps(detail, ensure_ascii=False, indent=4))
-
-    @classmethod
-    def upLevel38To2(cls):
-        """
-        :return:
-        """
-        dataThreeToEight = cls.getBaseData()
-        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([3, 4, 5, 6, 7, 8])]
-        filter_data = dataThreeToEight[
-            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['95%'])
-            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['80%'])
-            ]
-        return filter_data
-
-    @classmethod
-    def upLevel2To1(cls):
-        """
-        :return:
-        """
-        dataThreeToEight = cls.getBaseData()
-        dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([2])]
-        filter_data = dataThreeToEight[
-            (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['90%'])
-            & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['90%'])
-            ]
-        return filter_data
-
-
-U = articleLevelUp()
-U.analysisDF(indexList=[1])
-f_d = U.upLevel2To1()
-for line in list(zip(f_d['标题'], f_d['链接'])):
-    print(line[0])
-    print(line[1])
-    print("\n")

+ 1 - 1
run_article_title_exit_v1.py

@@ -1,7 +1,7 @@
 """
 @author: luojunhui
 """
-from flow_pool.exit_article_with_title import main
+from tasks.flow_pool_tasks.exit_article_with_title import main
 
 
 if __name__ == '__main__':

+ 0 - 4
flow_pool/exit_article_with_title.py → tasks/flow_pool_tasks/exit_article_with_title.py

@@ -222,7 +222,3 @@ def main():
         mention=False
     )
 
-
-if __name__ == '__main__':
-    main()
-

+ 0 - 0
not_used_tasks/article_association_task.py → tasks/not_used_tasks/article_association_task.py


+ 0 - 0
not_used_tasks/getTencentReadDetails.py → tasks/not_used_tasks/getTencentReadDetails.py


+ 0 - 0
not_used_tasks/migrateRootSourceId.py → tasks/not_used_tasks/migrateRootSourceId.py


+ 0 - 0
not_used_tasks/updateAccountAvgDaily.py → tasks/not_used_tasks/updateAccountAvgDaily.py


+ 0 - 0
not_used_tasks/updateMinigramInfoDaily.py → tasks/not_used_tasks/updateMinigramInfoDaily.py