|
@@ -22,6 +22,7 @@ const = UpdateAccountReadRateTaskConst()
|
|
|
config = apolloConfig()
|
|
|
unauthorized_account = json.loads(config.getConfigValue("unauthorized_gh_id_fans"))
|
|
|
backup_account_fans = json.loads(config.getConfigValue("backup_account_fans"))
|
|
|
+backup_gzh_account_name = json.loads(config.getConfigValue("backup_gzh_account_name"))
|
|
|
functions = Functions()
|
|
|
read_rate_table = "long_articles_read_rate"
|
|
|
|
|
@@ -67,7 +68,7 @@ def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
|
|
|
:return:
|
|
|
"""
|
|
|
response = []
|
|
|
- for line in article_list:
|
|
|
+ for line in tqdm(article_list):
|
|
|
gh_id = line['ghId']
|
|
|
dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
|
|
|
fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
|
|
@@ -85,7 +86,7 @@ def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
|
|
|
if fans > const.MIN_FANS:
|
|
|
line['readRate'] = line['show_view_count'] / fans if fans else 0
|
|
|
response.append(line)
|
|
|
- return DataFrame(response, columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
|
|
|
+ return DataFrame(response, columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'fans', 'readRate'])
|
|
|
|
|
|
|
|
|
def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
|
|
@@ -115,12 +116,13 @@ def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
|
|
|
}
|
|
|
|
|
|
|
|
|
-def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
|
|
|
+def check_each_position(db_client, gh_id, account_name, index, dt, avg_rate) -> dict:
|
|
|
"""
|
|
|
检验某个具体账号的具体文章的阅读率均值和前段日子的比较
|
|
|
:param avg_rate: 当天计算出的阅读率均值
|
|
|
:param db_client: 数据库连接
|
|
|
:param gh_id: 账号 id
|
|
|
+ :param account_name: 账号名称
|
|
|
:param index: 账号 index
|
|
|
:param dt:
|
|
|
:return:
|
|
@@ -128,15 +130,14 @@ def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
|
|
|
|
|
|
dt = int(dt.replace("-", ""))
|
|
|
select_sql = f"""
|
|
|
- SELECT account_name, read_rate_avg
|
|
|
+ SELECT read_rate_avg
|
|
|
FROM {read_rate_table}
|
|
|
WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
|
|
|
ORDER BY dt_version DESC limit 1;
|
|
|
"""
|
|
|
- result = db_client.fetch(select_sql)
|
|
|
+ result = db_client.fetch(select_sql, cursor_type=DictCursor)
|
|
|
if result:
|
|
|
- account_name = result[0][0]
|
|
|
- previous_read_rate_avg = result[0][1]
|
|
|
+ previous_read_rate_avg = result[0]['read_rate_avg']
|
|
|
relative_value = (avg_rate - previous_read_rate_avg) / previous_read_rate_avg
|
|
|
if -const.RELATIVE_VALUE_THRESHOLD <= relative_value <= const.RELATIVE_VALUE_THRESHOLD:
|
|
|
return {}
|
|
@@ -154,6 +155,8 @@ def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
|
|
|
]
|
|
|
}
|
|
|
return response
|
|
|
+ else:
|
|
|
+ return {}
|
|
|
|
|
|
|
|
|
def update_single_day(dt, account_list, article_df, lam):
|
|
@@ -177,12 +180,18 @@ def update_single_day(dt, account_list, article_df, lam):
|
|
|
|
|
|
# processed_account_set
|
|
|
processed_account_set = set()
|
|
|
+ without_name_account_set = set()
|
|
|
|
|
|
for account in tqdm(account_list, desc=dt):
|
|
|
+ account_name = account['account_name']
|
|
|
+ gh_id = account['gh_id']
|
|
|
+ if not account_name:
|
|
|
+ account_name = backup_gzh_account_name.get(gh_id, "")
|
|
|
+
|
|
|
for index in const.ARTICLE_INDEX_LIST:
|
|
|
read_rate_detail = cal_avg_account_read_rate(
|
|
|
df=article_df,
|
|
|
- gh_id=account['gh_id'],
|
|
|
+ gh_id=gh_id,
|
|
|
index=index,
|
|
|
dt=dt
|
|
|
)
|
|
@@ -192,11 +201,14 @@ def update_single_day(dt, account_list, article_df, lam):
|
|
|
articles_count = read_rate_detail['records']
|
|
|
if articles_count:
|
|
|
processed_account_set.add(account['gh_id'])
|
|
|
+ if not account_name:
|
|
|
+ without_name_account_set.add(gh_id)
|
|
|
# check read rate in position 1 and 2
|
|
|
if index in [1, 2]:
|
|
|
error_obj = check_each_position(
|
|
|
db_client=lam,
|
|
|
- gh_id=account['gh_id'],
|
|
|
+ gh_id=gh_id,
|
|
|
+ account_name=account_name,
|
|
|
index=index,
|
|
|
dt=dt,
|
|
|
avg_rate=read_rate_avg
|
|
@@ -216,8 +228,8 @@ def update_single_day(dt, account_list, article_df, lam):
|
|
|
lam.save(
|
|
|
query=insert_sql,
|
|
|
params=(
|
|
|
- account['account_name'],
|
|
|
- account['gh_id'],
|
|
|
+ account_name,
|
|
|
+ gh_id,
|
|
|
index,
|
|
|
read_rate_avg,
|
|
|
"从 {} 开始往前计算 31 天".format(dt),
|
|
@@ -232,6 +244,14 @@ def update_single_day(dt, account_list, article_df, lam):
|
|
|
print(e)
|
|
|
insert_error_list.append(str(e))
|
|
|
|
|
|
+ # alert via bot: accounts whose name is still empty after the backup-name lookup
|
|
|
+ if without_name_account_set:
|
|
|
+ bot(
|
|
|
+ title="更新阅读率均值,存在无名称账号",
|
|
|
+ detail=list(without_name_account_set),
|
|
|
+ mention=False
|
|
|
+ )
|
|
|
+
|
|
|
# bot sql error
|
|
|
if insert_error_list:
|
|
|
bot(
|