|
@@ -2,27 +2,17 @@
|
|
@author: luojunhui
|
|
@author: luojunhui
|
|
cal each account && position reading rate
|
|
cal each account && position reading rate
|
|
"""
|
|
"""
|
|
-import json
|
|
|
|
from tqdm import tqdm
|
|
from tqdm import tqdm
|
|
from pandas import DataFrame
|
|
from pandas import DataFrame
|
|
from argparse import ArgumentParser
|
|
from argparse import ArgumentParser
|
|
-from datetime import datetime, timezone, timedelta
|
|
|
|
|
|
+from datetime import datetime
|
|
|
|
|
|
-from applications import DeNetMysql, PQMySQL, longArticlesMySQL, bot
|
|
|
|
|
|
+from applications import DeNetMysql, PQMySQL, longArticlesMySQL, bot, Functions
|
|
|
|
+from applications.const import updateAccountReadRateTaskConst
|
|
|
|
|
|
-STATISTICS_PERIOD = 31 * 24 * 60 * 60
|
|
|
|
-ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def float_to_percentage(value, decimals=3) -> str:
|
|
|
|
- """
|
|
|
|
- 把小数转化为百分数
|
|
|
|
- :param value:
|
|
|
|
- :param decimals:
|
|
|
|
- :return:
|
|
|
|
- """
|
|
|
|
- percentage_value = round(value * 100, decimals)
|
|
|
|
- return "{}%".format(percentage_value)
|
|
|
|
|
|
+const = updateAccountReadRateTaskConst()
|
|
|
|
+functions = Functions()
|
|
|
|
+read_rate_table = "long_articles_read_rate"
|
|
|
|
|
|
|
|
|
|
def filter_outlier_data(group, key='show_view_count'):
|
|
def filter_outlier_data(group, key='show_view_count'):
|
|
@@ -43,27 +33,6 @@ def filter_outlier_data(group, key='show_view_count'):
|
|
return filtered_group
|
|
return filtered_group
|
|
|
|
|
|
|
|
|
|
-def timestamp_to_str(timestamp) -> str:
|
|
|
|
- """
|
|
|
|
- :param timestamp:
|
|
|
|
- """
|
|
|
|
- dt_object = datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc).astimezone()
|
|
|
|
- date_string = dt_object.strftime('%Y-%m-%d')
|
|
|
|
- return date_string
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def str_to_timestamp(date_string) -> int:
|
|
|
|
- """
|
|
|
|
- :param date_string:
|
|
|
|
- :return:
|
|
|
|
- """
|
|
|
|
- date_obj = datetime.strptime(date_string, '%Y-%m-%d')
|
|
|
|
-
|
|
|
|
- # 使用timestamp()方法将datetime对象转换为时间戳
|
|
|
|
- timestamp = date_obj.timestamp()
|
|
|
|
- return int(timestamp)
|
|
|
|
-
|
|
|
|
-
|
|
|
|
def get_account_fans_by_dt(db_client) -> dict:
|
|
def get_account_fans_by_dt(db_client) -> dict:
|
|
"""
|
|
"""
|
|
获取每个账号发粉丝,通过日期来区分
|
|
获取每个账号发粉丝,通过日期来区分
|
|
@@ -139,20 +108,20 @@ def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
|
|
"""
|
|
"""
|
|
sql = f"""
|
|
sql = f"""
|
|
SELECT
|
|
SELECT
|
|
- ghId, accountName, updateTime, ItemIndex, show_view_count
|
|
|
|
|
|
+ ghId, accountName, ItemIndex, show_view_count, publish_timestamp
|
|
FROM
|
|
FROM
|
|
official_articles_v2
|
|
official_articles_v2
|
|
WHERE
|
|
WHERE
|
|
- ghId IN {gh_id_tuple} and Type = '9';
|
|
|
|
|
|
+ ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}';
|
|
"""
|
|
"""
|
|
result = db_client.select(sql)
|
|
result = db_client.select(sql)
|
|
response_list = [
|
|
response_list = [
|
|
{
|
|
{
|
|
"ghId": i[0],
|
|
"ghId": i[0],
|
|
"accountName": i[1],
|
|
"accountName": i[1],
|
|
- "updateTime": i[2],
|
|
|
|
- "ItemIndex": i[3],
|
|
|
|
- "show_view_count": i[4]
|
|
|
|
|
|
+ "ItemIndex": i[2],
|
|
|
|
+ "show_view_count": i[3],
|
|
|
|
+ "publish_timestamp": i[4]
|
|
}
|
|
}
|
|
for i in result
|
|
for i in result
|
|
]
|
|
]
|
|
@@ -174,42 +143,41 @@ def cal_account_read_rate(gh_id_tuple) -> DataFrame:
|
|
)
|
|
)
|
|
for line in account_article_detail:
|
|
for line in account_article_detail:
|
|
gh_id = line['ghId']
|
|
gh_id = line['ghId']
|
|
- dt = timestamp_to_str(line['updateTime'])
|
|
|
|
|
|
+ dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
|
|
fans = fans_dict_each_day.get(gh_id, {}).get(dt, 0)
|
|
fans = fans_dict_each_day.get(gh_id, {}).get(dt, 0)
|
|
line['fans'] = fans
|
|
line['fans'] = fans
|
|
- if fans:
|
|
|
|
|
|
+ if fans > 1000:
|
|
line['readRate'] = line['show_view_count'] / fans if fans else 0
|
|
line['readRate'] = line['show_view_count'] / fans if fans else 0
|
|
response.append(line)
|
|
response.append(line)
|
|
return DataFrame(response,
|
|
return DataFrame(response,
|
|
- columns=['ghId', 'accountName', 'updateTime', 'ItemIndex', 'show_view_count', 'readRate'])
|
|
|
|
|
|
+ columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
|
|
|
|
|
|
|
|
|
|
def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
    """
    Compute the average read rate of one account at one article position.

    Rows of ``df`` are kept when ``ghId`` equals ``gh_id``, ``ItemIndex``
    equals ``index`` and ``publish_timestamp`` lies in the closed interval
    ``[max_time - const.STATISTICS_PERIOD, max_time]``, where ``max_time`` is
    ``dt`` converted by ``functions.str_to_timestamp``. The selection is
    passed through ``filter_outlier_data`` before it is aggregated.

    :param df: DataFrame providing the columns ``ghId``, ``ItemIndex``,
        ``publish_timestamp`` and ``readRate``
    :param gh_id: account id to select
    :param index: article position to select
    :param dt: date string marking the end of the statistics window
        (presumably 'YYYY-MM-DD' -- confirm against functions.str_to_timestamp)
    :return: dict with the keys ``read_rate_avg``, ``max_publish_time``,
        ``min_publish_time`` and ``records``; when no row survives the
        filtering the values are ``0``, ``None``, ``None`` and ``0``
    """
    max_time = functions.str_to_timestamp(date_string=dt)
    min_time = max_time - const.STATISTICS_PERIOD

    # Keep this account's articles at the requested position that were
    # published inside the statistics window (both bounds inclusive).
    selection_mask = (
        (df["ghId"] == gh_id)
        & (min_time <= df["publish_timestamp"])
        & (df["publish_timestamp"] <= max_time)
        & (df['ItemIndex'] == index)
    )

    # Original comment: filter with two standard deviations (the helper is
    # defined elsewhere in this file; it is called with its default key).
    filtered_df = filter_outlier_data(df[selection_mask])

    records = len(filtered_df)
    if not records:
        # mean()/max()/min() of an empty selection are NaN, and NaN is truthy,
        # so it would slip past falsy checks in callers. Report "no data"
        # explicitly instead.
        return {
            "read_rate_avg": 0,
            "max_publish_time": None,
            "min_publish_time": None,
            "records": 0
        }

    publish_times = filtered_df['publish_timestamp']
    return {
        "read_rate_avg": filtered_df['readRate'].mean(),
        "max_publish_time": publish_times.max(),
        "min_publish_time": publish_times.min(),
        "records": records
    }
|
|
|
|
|
|
|
|
|
|
def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
|
|
def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
|
|
@@ -222,11 +190,11 @@ def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
|
|
:param dt:
|
|
:param dt:
|
|
:return:
|
|
:return:
|
|
"""
|
|
"""
|
|
- RELATIVE_VALUE_THRESHOLD = 0.1
|
|
|
|
|
|
+
|
|
dt = int(dt.replace("-", ""))
|
|
dt = int(dt.replace("-", ""))
|
|
select_sql = f"""
|
|
select_sql = f"""
|
|
SELECT account_name, read_rate_avg
|
|
SELECT account_name, read_rate_avg
|
|
- FROM long_articles_read_rate
|
|
|
|
|
|
+ FROM {read_rate_table}
|
|
WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
|
|
WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
|
|
ORDER BY dt_version DESC limit 1;
|
|
ORDER BY dt_version DESC limit 1;
|
|
"""
|
|
"""
|
|
@@ -235,15 +203,15 @@ def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
|
|
account_name = result[0][0]
|
|
account_name = result[0][0]
|
|
previous_read_rate_avg = result[0][1]
|
|
previous_read_rate_avg = result[0][1]
|
|
relative_value = (avg_rate - previous_read_rate_avg) / previous_read_rate_avg
|
|
relative_value = (avg_rate - previous_read_rate_avg) / previous_read_rate_avg
|
|
- if -RELATIVE_VALUE_THRESHOLD <= relative_value <= RELATIVE_VALUE_THRESHOLD:
|
|
|
|
|
|
+ if -const.RELATIVE_VALUE_THRESHOLD <= relative_value <= const.RELATIVE_VALUE_THRESHOLD:
|
|
return {}
|
|
return {}
|
|
else:
|
|
else:
|
|
response = {
|
|
response = {
|
|
"账号名称": account_name,
|
|
"账号名称": account_name,
|
|
"位置": index,
|
|
"位置": index,
|
|
- "当天阅读率均值": float_to_percentage(avg_rate),
|
|
|
|
- "前一天阅读率均值": float_to_percentage(previous_read_rate_avg),
|
|
|
|
- "相对变化率": float_to_percentage(relative_value)
|
|
|
|
|
|
+ "当天阅读率均值": Functions().float_to_percentage(avg_rate),
|
|
|
|
+ "前一天阅读率均值": Functions().float_to_percentage(previous_read_rate_avg),
|
|
|
|
+ "相对变化率": Functions().float_to_percentage(relative_value)
|
|
}
|
|
}
|
|
return response
|
|
return response
|
|
|
|
|
|
@@ -257,32 +225,45 @@ def update_single_day(dt, account_list, article_df, lam):
|
|
:param dt:
|
|
:param dt:
|
|
:return:
|
|
:return:
|
|
"""
|
|
"""
|
|
- index_list = [1, 2, 3, 4, 5, 6, 7, 8]
|
|
|
|
error_list = []
|
|
error_list = []
|
|
insert_error_list = []
|
|
insert_error_list = []
|
|
- update_timestamp = str_to_timestamp(dt)
|
|
|
|
|
|
+ update_timestamp = functions.str_to_timestamp(date_string=dt)
|
|
|
|
+
|
|
# 因为计算均值的时候是第二天,所以需要把时间前移一天
|
|
# 因为计算均值的时候是第二天,所以需要把时间前移一天
|
|
- avg_date = timestamp_to_str(update_timestamp - ONE_DAY_IN_SECONDS)
|
|
|
|
|
|
+ avg_date = functions.timestamp_to_str(
|
|
|
|
+ timestamp=update_timestamp - const.ONE_DAY_IN_SECONDS,
|
|
|
|
+ string_format='%Y-%m-%d'
|
|
|
|
+ )
|
|
|
|
+
|
|
for account in tqdm(account_list):
|
|
for account in tqdm(account_list):
|
|
- for index in index_list:
|
|
|
|
- avg_rate, max_time, min_time, articles_count = cal_avg_account_read_rate(article_df, account['gh_id'], index, dt)
|
|
|
|
- if articles_count > 0:
|
|
|
|
|
|
+ for index in const.ARTICLE_INDEX_LIST:
|
|
|
|
+ read_rate_detail = cal_avg_account_read_rate(
|
|
|
|
+ df=article_df,
|
|
|
|
+ gh_id=account['gh_id'],
|
|
|
|
+ index=index,
|
|
|
|
+ dt=dt
|
|
|
|
+ )
|
|
|
|
+ read_rate_avg = read_rate_detail['read_rate_avg']
|
|
|
|
+ max_publish_time = read_rate_detail['max_publish_time']
|
|
|
|
+ min_publish_time = read_rate_detail['min_publish_time']
|
|
|
|
+ articles_count = read_rate_detail['records']
|
|
|
|
+ if articles_count:
|
|
if index in {1, 2}:
|
|
if index in {1, 2}:
|
|
error_obj = check_each_position(
|
|
error_obj = check_each_position(
|
|
db_client=lam,
|
|
db_client=lam,
|
|
gh_id=account['gh_id'],
|
|
gh_id=account['gh_id'],
|
|
index=index,
|
|
index=index,
|
|
dt=dt,
|
|
dt=dt,
|
|
- avg_rate=avg_rate
|
|
|
|
|
|
+ avg_rate=read_rate_avg
|
|
)
|
|
)
|
|
if error_obj:
|
|
if error_obj:
|
|
error_list.append(error_obj)
|
|
error_list.append(error_obj)
|
|
# continue
|
|
# continue
|
|
try:
|
|
try:
|
|
- if avg_rate == 0:
|
|
|
|
|
|
+ if not read_rate_avg:
|
|
continue
|
|
continue
|
|
insert_sql = f"""
|
|
insert_sql = f"""
|
|
- INSERT INTO long_articles_read_rate
|
|
|
|
|
|
+ INSERT INTO {read_rate_table}
|
|
(account_name, gh_id, position, read_rate_avg, remark, articles_count, earliest_publish_time, latest_publish_time, dt_version, is_delete)
|
|
(account_name, gh_id, position, read_rate_avg, remark, articles_count, earliest_publish_time, latest_publish_time, dt_version, is_delete)
|
|
values
|
|
values
|
|
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
|
|
@@ -293,11 +274,11 @@ def update_single_day(dt, account_list, article_df, lam):
|
|
account['account_name'],
|
|
account['account_name'],
|
|
account['gh_id'],
|
|
account['gh_id'],
|
|
index,
|
|
index,
|
|
- avg_rate,
|
|
|
|
|
|
+ read_rate_avg,
|
|
"从 {} 开始往前计算 31 天".format(dt),
|
|
"从 {} 开始往前计算 31 天".format(dt),
|
|
articles_count,
|
|
articles_count,
|
|
- timestamp_to_str(min_time),
|
|
|
|
- timestamp_to_str(max_time),
|
|
|
|
|
|
+ functions.timestamp_to_str(timestamp=min_publish_time, string_format='%Y-%m-%d'),
|
|
|
|
+ functions.timestamp_to_str(timestamp=max_publish_time, string_format='%Y-%m-%d'),
|
|
avg_date.replace("-", ""),
|
|
avg_date.replace("-", ""),
|
|
0
|
|
0
|
|
)
|
|
)
|
|
@@ -351,21 +332,6 @@ def main() -> None:
|
|
|
|
|
|
update_single_day(dt, account_list, df, lam)
|
|
update_single_day(dt, account_list, df, lam)
|
|
|
|
|
|
- # start_dt = start_date = datetime(2024, 8, 1)
|
|
|
|
- # end_date = datetime(2024, 10, 22)
|
|
|
|
- # # 计算日期差
|
|
|
|
- # delta = end_date - start_date
|
|
|
|
- # # 生成日期字符串列表
|
|
|
|
- # date_strings = []
|
|
|
|
- # for i in range(delta.days + 1):
|
|
|
|
- # date_strings.append((start_date + timedelta(days=i)).strftime('%Y-%m-%d'))
|
|
|
|
- #
|
|
|
|
- # # 打印结果
|
|
|
|
- # date_str = '2024-09-11'
|
|
|
|
- # date_strings = [date_str,]
|
|
|
|
- # for date_str in tqdm(date_strings):
|
|
|
|
- # update_single_day(date_str, account_list, df, lam)
|
|
|
|
-
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
main()
|
|
main()
|