# cal_account_read_rate_avg_daily.py
"""
@author: luojunhui
Calculate the average read rate for each account and each article position.
"""
  5. import json
  6. from tqdm import tqdm
  7. from pandas import DataFrame
  8. from argparse import ArgumentParser
  9. from datetime import datetime
  10. from applications import DeNetMysql, PQMySQL, longArticlesMySQL, bot, Functions
  11. from applications.const import updateAccountReadRateTaskConst
  12. from config import apolloConfig
  13. const = updateAccountReadRateTaskConst()
  14. config = apolloConfig()
  15. unauthorized_account = json.loads(config.getConfigValue("unauthorized_gh_id_fans"))
  16. functions = Functions()
  17. read_rate_table = "long_articles_read_rate"
  18. def filter_outlier_data(group, key='show_view_count'):
  19. """
  20. :param group:
  21. :param key:
  22. :return:
  23. """
  24. mean = group[key].mean()
  25. std = group[key].std()
  26. # 过滤二倍标准差的数据
  27. filtered_group = group[(group[key] > mean - 2 * std) & (group[key] < mean + 2 * std)]
  28. # 过滤均值倍数大于5的数据
  29. new_mean = filtered_group[key].mean()
  30. # print("阅读均值", new_mean)
  31. filtered_group = filtered_group[filtered_group[key] < new_mean * 5]
  32. return filtered_group
  33. def get_account_fans_by_dt(db_client) -> dict:
  34. """
  35. 获取每个账号发粉丝,通过日期来区分
  36. :return:
  37. """
  38. sql = f"""
  39. SELECT
  40. t1.date_str,
  41. t1.fans_count,
  42. t2.gh_id
  43. FROM datastat_wx t1
  44. JOIN publish_account t2 ON t1.account_id = t2.id
  45. WHERE
  46. t2.channel = 5
  47. AND t2.status = 1
  48. AND t1.date_str >= '2024-07-01'
  49. ORDER BY t1.date_str;
  50. """
  51. result = db_client.select(sql)
  52. D = {}
  53. for line in result:
  54. dt = line[0]
  55. fans = line[1]
  56. gh_id = line[2]
  57. if D.get(gh_id):
  58. D[gh_id][dt] = fans
  59. else:
  60. D[gh_id] = {dt: fans}
  61. return D
  62. def get_publishing_accounts(db_client) -> list[dict]:
  63. """
  64. 获取每日正在发布的账号
  65. :return:
  66. """
  67. sql = f"""
  68. SELECT DISTINCT
  69. t3.`name`,
  70. t3.gh_id,
  71. t3.follower_count,
  72. t6.account_source_name,
  73. t6.mode_type,
  74. t6.account_type,
  75. t6.`status`
  76. FROM
  77. publish_plan t1
  78. JOIN publish_plan_account t2 ON t1.id = t2.plan_id
  79. JOIN publish_account t3 ON t2.account_id = t3.id
  80. LEFT JOIN publish_account_wx_type t4 on t3.id = t4.account_id
  81. LEFT JOIN wx_statistics_group_source_account t5 on t3.id = t5.account_id
  82. LEFT JOIN wx_statistics_group_source t6 on t5.group_source_name = t6.account_source_name
  83. WHERE
  84. t1.plan_status = 1
  85. AND t3.channel = 5
  86. -- AND t3.follower_count > 0
  87. GROUP BY t3.id;
  88. """
  89. account_list = db_client.select(sql)
  90. result_list = [
  91. {
  92. "account_name": i[0],
  93. "gh_id": i[1]
  94. } for i in account_list
  95. ]
  96. return result_list
  97. def get_account_articles_detail(db_client, gh_id_tuple) -> list[dict]:
  98. """
  99. get articles details
  100. :return:
  101. """
  102. sql = f"""
  103. SELECT
  104. ghId, accountName, ItemIndex, show_view_count, publish_timestamp
  105. FROM
  106. official_articles_v2
  107. WHERE
  108. ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}';
  109. """
  110. result = db_client.select(sql)
  111. response_list = [
  112. {
  113. "ghId": i[0],
  114. "accountName": i[1],
  115. "ItemIndex": i[2],
  116. "show_view_count": i[3],
  117. "publish_timestamp": i[4]
  118. }
  119. for i in result
  120. ]
  121. return response_list
  122. def cal_account_read_rate(gh_id_tuple) -> DataFrame:
  123. """
  124. 计算账号位置的阅读率
  125. :return:
  126. """
  127. pq_db = PQMySQL()
  128. de_db = DeNetMysql()
  129. response = []
  130. fans_dict_each_day = get_account_fans_by_dt(db_client=de_db)
  131. account_article_detail = get_account_articles_detail(
  132. db_client=pq_db,
  133. gh_id_tuple=gh_id_tuple
  134. )
  135. for line in account_article_detail:
  136. gh_id = line['ghId']
  137. dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
  138. fans = fans_dict_each_day.get(gh_id, {}).get(dt, 0)
  139. if not fans:
  140. fans = int(unauthorized_account.get(gh_id, 0))
  141. line['fans'] = fans
  142. if fans > 1000:
  143. line['readRate'] = line['show_view_count'] / fans if fans else 0
  144. response.append(line)
  145. return DataFrame(response,
  146. columns=['ghId', 'accountName', 'ItemIndex', 'show_view_count', 'publish_timestamp', 'readRate'])
  147. def cal_avg_account_read_rate(df, gh_id, index, dt) -> dict:
  148. """
  149. 计算账号的阅读率均值
  150. :return:
  151. """
  152. max_time = functions.str_to_timestamp(date_string=dt)
  153. min_time = max_time - const.STATISTICS_PERIOD
  154. # 通过
  155. filterDataFrame = df[
  156. (df["ghId"] == gh_id)
  157. & (min_time <= df["publish_timestamp"])
  158. & (df["publish_timestamp"] <= max_time)
  159. & (df['ItemIndex'] == index)
  160. ]
  161. # 用二倍标准差过滤
  162. finalDF = filter_outlier_data(filterDataFrame)
  163. return {
  164. "read_rate_avg": finalDF['readRate'].mean(),
  165. "max_publish_time": finalDF['publish_timestamp'].max(),
  166. "min_publish_time": finalDF['publish_timestamp'].min(),
  167. "records": len(finalDF)
  168. }
  169. def check_each_position(db_client, gh_id, index, dt, avg_rate) -> dict:
  170. """
  171. 检验某个具体账号的具体文章的阅读率均值和前段日子的比较
  172. :param avg_rate: 当天计算出的阅读率均值
  173. :param db_client: 数据库连接
  174. :param gh_id: 账号 id
  175. :param index: 账号 index
  176. :param dt:
  177. :return:
  178. """
  179. dt = int(dt.replace("-", ""))
  180. select_sql = f"""
  181. SELECT account_name, read_rate_avg
  182. FROM {read_rate_table}
  183. WHERE gh_id = '{gh_id}' and position = {index} and dt_version < {dt}
  184. ORDER BY dt_version DESC limit 1;
  185. """
  186. result = db_client.select(select_sql)
  187. if result:
  188. account_name = result[0][0]
  189. previous_read_rate_avg = result[0][1]
  190. relative_value = (avg_rate - previous_read_rate_avg) / previous_read_rate_avg
  191. if -const.RELATIVE_VALUE_THRESHOLD <= relative_value <= const.RELATIVE_VALUE_THRESHOLD:
  192. return {}
  193. else:
  194. response = {
  195. "账号名称": account_name,
  196. "位置": index,
  197. "当天阅读率均值": Functions().float_to_percentage(avg_rate),
  198. "前一天阅读率均值": Functions().float_to_percentage(previous_read_rate_avg),
  199. "相对变化率": Functions().float_to_percentage(relative_value)
  200. }
  201. return response
  202. def update_single_day(dt, account_list, article_df, lam):
  203. """
  204. 更新单天数据
  205. :param article_df:
  206. :param lam:
  207. :param account_list:
  208. :param dt:
  209. :return:
  210. """
  211. error_list = []
  212. insert_error_list = []
  213. update_timestamp = functions.str_to_timestamp(date_string=dt)
  214. # 因为计算均值的时候是第二天,所以需要把时间前移一天
  215. avg_date = functions.timestamp_to_str(
  216. timestamp=update_timestamp - const.ONE_DAY_IN_SECONDS,
  217. string_format='%Y-%m-%d'
  218. )
  219. for account in tqdm(account_list, desc=dt):
  220. for index in const.ARTICLE_INDEX_LIST:
  221. read_rate_detail = cal_avg_account_read_rate(
  222. df=article_df,
  223. gh_id=account['gh_id'],
  224. index=index,
  225. dt=dt
  226. )
  227. read_rate_avg = read_rate_detail['read_rate_avg']
  228. max_publish_time = read_rate_detail['max_publish_time']
  229. min_publish_time = read_rate_detail['min_publish_time']
  230. articles_count = read_rate_detail['records']
  231. if articles_count:
  232. if index in {1, 2}:
  233. error_obj = check_each_position(
  234. db_client=lam,
  235. gh_id=account['gh_id'],
  236. index=index,
  237. dt=dt,
  238. avg_rate=read_rate_avg
  239. )
  240. if error_obj:
  241. error_list.append(error_obj)
  242. # continue
  243. try:
  244. if not read_rate_avg:
  245. continue
  246. insert_sql = f"""
  247. INSERT INTO {read_rate_table}
  248. (account_name, gh_id, position, read_rate_avg, remark, articles_count, earliest_publish_time, latest_publish_time, dt_version, is_delete)
  249. values
  250. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  251. """
  252. lam.update(
  253. sql=insert_sql,
  254. params=(
  255. account['account_name'],
  256. account['gh_id'],
  257. index,
  258. read_rate_avg,
  259. "从 {} 开始往前计算 31 天".format(dt),
  260. articles_count,
  261. functions.timestamp_to_str(timestamp=min_publish_time, string_format='%Y-%m-%d'),
  262. functions.timestamp_to_str(timestamp=max_publish_time, string_format='%Y-%m-%d'),
  263. avg_date.replace("-", ""),
  264. 0
  265. )
  266. )
  267. except Exception as e:
  268. insert_error_list.append(e)
  269. if insert_error_list:
  270. bot(
  271. title="更新阅读率均值,存在sql 插入失败",
  272. detail=insert_error_list
  273. )
  274. if error_list:
  275. bot(
  276. title="更新阅读率均值,头次出现异常值通知",
  277. detail={
  278. "时间": dt,
  279. "异常列表": error_list
  280. }
  281. )
  282. if not error_list and not insert_error_list:
  283. bot(
  284. title="阅读率均值表,更新成功",
  285. detail={
  286. "日期": dt
  287. }
  288. )
  289. def main() -> None:
  290. """
  291. main function
  292. :return:
  293. """
  294. parser = ArgumentParser()
  295. parser.add_argument("--run-date",
  296. help="Run only once for date in format of %Y-%m-%d. \
  297. If no specified, run as daily jobs.")
  298. args = parser.parse_args()
  299. if args.run_date:
  300. dt = args.run_date
  301. else:
  302. dt = datetime.today().strftime('%Y-%m-%d')
  303. lam = longArticlesMySQL()
  304. de = DeNetMysql()
  305. account_list = get_publishing_accounts(db_client=de)
  306. df = cal_account_read_rate(tuple([i['gh_id'] for i in account_list]))
  307. update_single_day(dt, account_list, df, lam)
  308. if __name__ == '__main__':
  309. main()