Bladeren bron

Merge branch '2024-10-23-luojunhui-avg-info-update-daily' of luojunhui/LongArticlesJob into master

luojunhui 1 jaar geleden
bovenliggende
commit
3aa47fbe31
3 gewijzigde bestanden met toevoegingen van 166 en 47 verwijderingen
  1. 1 0
      applications/longArticlesMysql.py
  2. 139 47
      cal_account_read_rate_avg_daily.py
  3. 26 0
      sh/run_update_account_read_rate_avg.sh

+ 1 - 0
applications/longArticlesMysql.py

@@ -53,4 +53,5 @@ class longArticlesMySQL(object):
         except Exception as e:
             print("Insert Many Defeat--{}".format(e))
             cls.connection.rollback()
+            raise e
 

+ 139 - 47
not_used_tasks/cal_account_read_rate_avg_daily.py → cal_account_read_rate_avg_daily.py

@@ -2,15 +2,28 @@
 @author: luojunhui
 cal each account && position reading rate
 """
+import json
 from tqdm import tqdm
-from datetime import datetime, timezone, timedelta
 from pandas import DataFrame
+from argparse import ArgumentParser
+from datetime import datetime, timezone, timedelta
 
-from applications import DeNetMysql, PQMySQL, longArticlesMySQL
+from applications import DeNetMysql, PQMySQL, longArticlesMySQL, bot
 
 STATISTICS_PERIOD = 31 * 24 * 60 * 60
 
 
+def float_to_percentage(value, decimals=3) -> str:
+    """
+    Convert a decimal fraction to a percentage string, e.g. 0.05 -> "5.0%".
+    :param value: fraction to convert; it is multiplied by 100
+    :param decimals: number of digits the percentage is rounded to (default 3)
+    :return: the rounded percentage followed by a "%" sign
+    """
+    percentage_value = round(value * 100, decimals)
+    return "{}%".format(percentage_value)
+
+
 def filter_outlier_data(group, key='show_view_count'):
     """
 
@@ -24,6 +37,7 @@ def filter_outlier_data(group, key='show_view_count'):
     filtered_group = group[(group[key] > mean - 2 * std) & (group[key] < mean + 2 * std)]
     # 过滤均值倍数大于5的数据
     new_mean = filtered_group[key].mean()
+    # print("阅读均值", new_mean)
     filtered_group = filtered_group[filtered_group[key] < new_mean * 5]
     return filtered_group
 
@@ -46,7 +60,7 @@ def str_to_timestamp(date_string) -> int:
 
     # 使用timestamp()方法将datetime对象转换为时间戳
     timestamp = date_obj.timestamp()
-    return timestamp
+    return int(timestamp)
 
 
 def get_account_fans_by_dt(db_client) -> dict:
@@ -183,7 +197,12 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> tuple:
         & (df["updateTime"] <= max_time)
         & (df['ItemIndex'] == index)
         ]
+    # print("位置", index)
     finalDF = filter_outlier_data(filterDataFrame)
+    # finalDF = finalDF.sort_values(by=['updateTime'], ascending=False)
+    # if index == 1:
+    #     for i in finalDF.values.tolist():
+    #         print(datetime.fromtimestamp(i[2]).strftime('%Y-%m-%d'), i)
     return (
         finalDF['readRate'].mean(),
         finalDF['updateTime'].max(),
@@ -192,6 +211,41 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> tuple:
     )
 
 
+def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
+    """
+    Compare the current average read rate of one account position with the latest earlier stored value.
+    :param avg_rate: average read rate computed for the current day
+    :param db_client: database client; only its select(sql) method is used here
+    :param gh_id: account id (interpolated directly into the SQL text)
+    :param index: article position of the account (interpolated directly into the SQL text)
+    :param dt: date string; "-" is removed and the rest is cast to int, e.g. "2024-10-23" -> 20241023
+    :return: {} if the relative change is within +/-5%, a report dict otherwise; None if no earlier row exists
+    """
+    dt = int(dt.replace("-", ""))
+    select_sql = f"""
+        SELECT account_name, read_rate_avg
+        FROM long_articles_read_rate
+        WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
+        ORDER BY dt_version DESC limit 1;
+    """
+    result = db_client.select(select_sql)
+    if result:  # NOTE(review): when no row is found the function falls through and returns None, not a dict
+        account_name = result[0][0]
+        previous_read_rate_avg = result[0][1]
+        relative_value = (avg_rate - previous_read_rate_avg) / previous_read_rate_avg  # NOTE(review): raises ZeroDivisionError if the stored value is 0
+        if -0.05 <= relative_value <= 0.05:
+            return {}
+        else:
+            response = {
+                "账号名称": account_name,
+                "位置": index,
+                "当天阅读率均值": float_to_percentage(avg_rate),
+                "前一天阅读率均值": float_to_percentage(previous_read_rate_avg),
+                "相对变化率": float_to_percentage(relative_value)
+            }
+            return response
+
+
 def update_single_day(dt, account_list, article_df, lam):
     """
     更新单天数据
@@ -202,37 +256,64 @@ def update_single_day(dt, account_list, article_df, lam):
     :return:
     """
     index_list = [1, 2, 3, 4, 5, 6, 7, 8]
+    error_list = []
+    insert_error_list = []
     for account in tqdm(account_list):
         for index in index_list:
-            avg_rate, max_time, min_time, a_count = cal_avg_account_read_rate(article_df, account['gh_id'], index, dt)
-            # print(account['account_name'], "\t", index, "\t", avg_rate, "\t", max_time, "\t", min_time, "\t", a_count,
-            #       "\t", account['gh_id'])
-            try:
-                if avg_rate == 0:
-                    continue
-                insert_sql = f"""
-                    INSERT INTO long_articles_read_rate_dev
-                    (account_name, gh_id, position, read_rate_avg, remark, articles_count, earliest_publish_time, latest_publish_time, dt_version, is_delete)
-                    values
-                    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-                """
-                lam.update(
-                    sql=insert_sql,
-                    params=(
-                        account['account_name'],
-                        account['gh_id'],
-                        index,
-                        avg_rate,
-                        "从 {} 开始往前计算 31  天".format(dt),
-                        a_count,
-                        timestamp_to_str(min_time),
-                        timestamp_to_str(max_time),
-                        dt.replace("-", ""),
-                        0
+            avg_rate, max_time, min_time, articles_count = cal_avg_account_read_rate(article_df, account['gh_id'], index, dt)
+            if articles_count > 0:
+                if index in {1, 2}:
+                    error_obj = check_each_position(
+                        db_client=lam,
+                        gh_id=account['gh_id'],
+                        index=index,
+                        dt=dt,
+                        avg_rate=avg_rate
+                    )
+                    if error_obj:
+                        error_list.append(error_obj)
+                # continue
+                try:
+                    if avg_rate == 0:
+                        continue
+                    insert_sql = f"""
+                        INSERT INTO long_articles_read_rate
+                        (account_name, gh_id, position, read_rate_avg, remark, articles_count, earliest_publish_time, latest_publish_time, dt_version, is_delete)
+                        values
+                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                    """
+                    lam.update(
+                        sql=insert_sql,
+                        params=(
+                            account['account_name'],
+                            account['gh_id'],
+                            index,
+                            avg_rate,
+                            "从 {} 开始往前计算 31  天".format(dt),
+                            articles_count,
+                            timestamp_to_str(min_time),
+                            timestamp_to_str(max_time),
+                            dt.replace("-", ""),
+                            0
+                        )
                     )
-                )
-            except Exception as e:
-                print(e)
+                except Exception as e:
+                    insert_error_list.append(e)
+
+    if insert_error_list:
+        bot(
+            title="更新阅读率均值,存在sql 插入失败",
+            detail=insert_error_list
+        )
+
+    if error_list:
+        bot(
+            title="更新阅读率均值,头次出现异常值通知",
+            detail={
+                "时间": dt,
+                "异常列表": error_list
+            }
+        )
 
 
 def main() -> None:
@@ -240,26 +321,37 @@ def main() -> None:
     main function
     :return:
     """
+    parser = ArgumentParser()
+    parser.add_argument("--run-date",
+                        help="Run only once for date in format of %Y-%m-%d. \
+                                    If no specified, run as daily jobs.")
+    args = parser.parse_args()
+    if args.run_date:
+        dt = args.run_date
+    else:
+        dt = datetime.today().strftime('%Y-%m-%d')
+
     lam = longArticlesMySQL()
     de = DeNetMysql()
-    # dt = '2024-10-22'
     account_list = get_publishing_accounts(db_client=de)
-    # 获取这些账号所有的文章
     df = cal_account_read_rate(tuple([i['gh_id'] for i in account_list]))
-    start_dt = start_date = datetime(2024, 8, 1)
-    end_date = datetime(2024, 10, 22)
-    # 计算日期差
-    delta = end_date - start_date
-    # 生成日期字符串列表
-    date_strings = []
-    for i in range(delta.days + 1):
-        date_strings.append((start_date + timedelta(days=i)).strftime('%Y-%m-%d'))
-
-    # 打印结果
-    date_str = '2024-09-11'
-    date_strings = [date_str,]
-    for date_str in tqdm(date_strings):
-        update_single_day(date_str, account_list, df, lam)
+
+    update_single_day(dt, account_list, df, lam)
+
+    # start_dt = start_date = datetime(2024, 8, 1)
+    # end_date = datetime(2024, 10, 22)
+    # # 计算日期差
+    # delta = end_date - start_date
+    # # 生成日期字符串列表
+    # date_strings = []
+    # for i in range(delta.days + 1):
+    #     date_strings.append((start_date + timedelta(days=i)).strftime('%Y-%m-%d'))
+    #
+    # # 打印结果
+    # date_str = '2024-09-11'
+    # date_strings = [date_str,]
+    # for date_str in tqdm(date_strings):
+    #     update_single_day(date_str, account_list, df, lam)
 
 
 if __name__ == '__main__':

+ 26 - 0
sh/run_update_account_read_rate_avg.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Current date in YYYY-MM-DD format
+CURRENT_DATE=$(date +%F)
+
+# Log file path, with the date in its name
+LOG_FILE="/root/luojunhui/logs/cal_read_rate_avg_task_log_$CURRENT_DATE.txt"
+
+# Append all output of this script (stdout and stderr) to the dated log file
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 cal_account_read_rate_avg_daily.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - cal_account_read_rate_avg_daily.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart cal_account_read_rate_avg_daily.py"
+    # Change to the project directory (NOTE(review): the result of cd is not checked)
+    cd /root/luojunhui/LongArticlesJob
+
+    # Activate the Conda environment
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+
+    # Run the Python script in the background, appending its output to the log file
+    nohup python3 cal_account_read_rate_avg_daily.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted cal_account_read_rate_avg_daily.py"  # NOTE(review): printed unconditionally; the launch result is not checked
+fi