# cal_read_score.py
  1. """
  2. 计算阅读分
  3. """
  4. import hashlib
  5. import json
  6. import math
  7. from tqdm import tqdm
  8. class ReadScoreCalculator:
  9. def __init__(self, db_client):
  10. self.db_client = db_client
  11. self.read_avg_map = {}
  12. self.unsafe_md5_set = None
  13. async def get_read_avg_map(self):
  14. """
  15. 获取阅读分平均值映射表
  16. """
  17. query = """
  18. select gh_id, position, read_avg_ci_upper
  19. from account_avg_info_v3 where status = 1;
  20. """
  21. raw_data = await self.db_client.async_fetch(query=query, db_name="piaoquan_crawler")
  22. read_avg_ci_map = {}
  23. for item in raw_data:
  24. key = f"{item['gh_id']}-{item['position']}"
  25. value = item['read_avg_ci_upper']
  26. read_avg_ci_map[key] = value
  27. return read_avg_ci_map
  28. async def get_unsafe_titles(self):
  29. query = """
  30. select title_md5 from article_unsafe_title where status = 1;
  31. """
  32. unsafe_title_md5s = await self.db_client.async_fetch(query=query)
  33. return set([item['title_md5'] for item in unsafe_title_md5s])
  34. async def calculate_content_read_detail(self, content_md5):
  35. """
  36. 计算文章的阅读分
  37. """
  38. query = """
  39. select ghId, ItemIndex, show_view_count
  40. from official_articles_v2 where title_md5 = %s
  41. and root_source_id_list is not null and root_source_id_list != '[]' and Type = 9
  42. and status = 1;
  43. """
  44. raw_data = await self.db_client.async_fetch(query=query, db_name="piaoquan_crawler", params=(content_md5,))
  45. total_read_count_first = 0
  46. total_read_avg_first = 0
  47. total_read_count_second = 0
  48. total_read_avg_second = 0
  49. for item in raw_data:
  50. if item['ItemIndex'] == 1:
  51. total_read_count_first += item['show_view_count']
  52. key = f"{item['ghId']}-{item['ItemIndex']}"
  53. total_read_avg_first += self.read_avg_map.get(key, 0)
  54. elif item['ItemIndex'] == 2:
  55. total_read_count_second += item['show_view_count']
  56. key = f"{item['ghId']}-{item['ItemIndex']}"
  57. total_read_avg_second += self.read_avg_map.get(key, 0)
  58. else:
  59. continue
  60. return {
  61. "read_count_first": total_read_count_first,
  62. "read_avg_first": total_read_avg_first,
  63. "read_count_second": total_read_count_second,
  64. "read_avg_second": total_read_avg_second,
  65. }
  66. async def calculate_read_score(self, content, gh_id):
  67. """
  68. 计算文章的阅读分
  69. """
  70. title_md5 = hashlib.md5(content.encode('utf-8')).hexdigest()
  71. if title_md5 in self.unsafe_md5_set:
  72. return None
  73. account_read_avg_first = self.read_avg_map.get(f"{gh_id}-1", 0)
  74. read_detail = await self.calculate_content_read_detail(title_md5)
  75. total_read_count_first = read_detail["read_count_first"]
  76. total_read_avg_first = read_detail["read_avg_first"]
  77. total_read_count_second = read_detail["read_count_second"]
  78. total_read_avg_second = read_detail["read_avg_second"]
  79. big_rate_w = 1.0
  80. if total_read_count_first:
  81. read_rate = total_read_count_first / total_read_avg_first if total_read_avg_first else 0
  82. total_read_avg = total_read_avg_first
  83. elif total_read_count_second:
  84. read_rate = total_read_count_second / total_read_avg_second if total_read_avg_second else 0
  85. total_read_avg = total_read_avg_second
  86. if account_read_avg_first >= 3000:
  87. big_rate_w = 0.001
  88. else:
  89. read_rate = total_read_count_second / total_read_avg_second if total_read_avg_second else 0
  90. total_read_avg = total_read_avg_second
  91. sigmoid = lambda ci_total, slope, avg_pos: 1 / (1 + math.exp(-slope * (ci_total - avg_pos)))
  92. weight = sigmoid(total_read_avg, 0.0002, account_read_avg_first)
  93. if read_rate > 0:
  94. if read_rate > 1 and big_rate_w < 1:
  95. # 对大账号的过高评分进行抑制
  96. score_value = weight * ((read_rate - 1) * big_rate_w + 1)
  97. else:
  98. score_value = weight * read_rate
  99. else:
  100. score_value = 0.0
  101. return {
  102. "score_without_weight": read_rate,
  103. "score": score_value,
  104. "weight": weight,
  105. "total_read_avg": total_read_avg,
  106. "account_read_avg_first": account_read_avg_first,
  107. }
  108. async def get_flow_pool_1_contents(self, account_id):
  109. """
  110. 获取流量池文章
  111. """
  112. query = """
  113. select t4.title
  114. from publish_content t1
  115. join produce_plan_exe_record t2 on t1.source_id = t2.plan_exe_id
  116. join produce_plan t3 on t2.plan_id = t3.id
  117. join crawler_content t4 on t1.crawler_channel_content_id = t4.channel_content_id
  118. where t1.channel = 5 and t1.publish_account_id = %s
  119. and t3.plan_tag = 'autoArticlePoolLevel1' and t1.status = 1
  120. order by t1.id desc;
  121. """
  122. raw_data = await self.db_client.async_fetch(query=query, db_name="aigc", params=(account_id,))
  123. return [item['title'] for item in raw_data]
  124. async def get_flow_pool_1_contents_v2(self, account_id):
  125. """
  126. 获取流量池文章
  127. """
  128. query = """
  129. select title from publish_content_gzh_waiting
  130. where publish_account_id = %s and status = 1 and content_pool_type = 'autoArticlePoolLevel1'
  131. """
  132. raw_data = await self.db_client.async_fetch(query=query, params=(account_id,))
  133. return [item['title'] for item in raw_data]
  134. async def deal(self, data):
  135. self.unsafe_md5_set = await self.get_unsafe_titles()
  136. account_id = data['account_id']
  137. gh_id = data['gh_id']
  138. self.read_avg_map = await self.get_read_avg_map()
  139. titles = await self.get_flow_pool_1_contents_v2(account_id)
  140. L = []
  141. for title in tqdm(titles):
  142. score = await self.calculate_read_score(title, gh_id)
  143. if score is None:
  144. print(f"title unsafe: {title}")
  145. continue
  146. else:
  147. score['title'] = title
  148. L.append(score)
  149. L = sorted(L, key=lambda x: x['score'], reverse=True)
  150. print(json.dumps(L, ensure_ascii=False, indent=4))