# upLevel.py
"""
@author: luojunhui
"""
import json
from pandas import DataFrame
from datetime import datetime  # NOTE(review): unused in the visible part of this file
from applications import longArticlesMySQL

# Shared MySQL client used by every query in this module.
lam = longArticlesMySQL()
  9. def read_rate_debias(row):
  10. """
  11. 阅读均值倍数通过头条消偏
  12. :param row:
  13. :return:
  14. """
  15. if row["位置"] != 1:
  16. return row["阅读量"] / (
  17. max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
  18. )
  19. else:
  20. return row["阅读均值倍数"]
  21. def filter_same_title(df):
  22. """
  23. 通过标题过滤 dataframe
  24. :param df:
  25. :return:
  26. """
  27. def title_sim_v2_by_title_list(title_target, title_list, threshold=0.75):
  28. """
  29. :param title_target:
  30. :param title_list:
  31. :param threshold:
  32. :return:
  33. """
  34. def title_sim_v2(title_a, title_b, threshold=0.75):
  35. """
  36. 标题相似度
  37. :param title_a:
  38. :param title_b:
  39. :param threshold:
  40. :return:
  41. """
  42. if len(title_a) < 1 or len(title_b) < 1:
  43. return False
  44. set_a = set(title_a)
  45. set_b = set(title_b)
  46. set_cross = set_a & set_b
  47. set_union = set_a | set_b
  48. if not set_union:
  49. return False
  50. min_len = max(min(len(set_a), len(set_b)), 1)
  51. rate = len(set_cross) / min_len
  52. if rate >= threshold:
  53. return True
  54. else:
  55. return False
  56. for title in title_list:
  57. sim_score = title_sim_v2(title_target, title, threshold=threshold)
  58. if sim_score:
  59. return title
  60. return False
  61. visited_titles = []
  62. data = []
  63. for x in df.to_dict(orient='records'):
  64. title = x['标题']
  65. if title_sim_v2_by_title_list(title, visited_titles):
  66. continue
  67. visited_titles.append(title)
  68. data.append(x)
  69. return DataFrame(data)
class articleLevelUp(object):
    """
    Article promotion: statistics used to decide which published articles
    move up a traffic-pool level.
    """

    # Chinese column names assigned to rows returned by the SQL in
    # get_base_data, in SELECT order.
    columns = [
        "位置",
        "粉丝量",
        "阅读量",
        "平均阅读量",
        "头条阅读量",
        "头条平均阅读量",
        "阅读均值倍数",
        "阅读率",
        "小程序打开率",
        "T+0裂变率",
        "标题",
        "链接"
    ]

    # NOTE(review): the three maps below look like precomputed snapshots of
    # the quantile stats that analysis_data prints, for different position
    # groups (presumably positions 3-8, positions 2->1, and position 1
    # respectively, judging by the names) -- confirm how they were generated.
    statMapThreeToEight = {
        "阅读均值倍数": {
            "mean": 1.1388723507368606,
            "max": 62.50000000000001,
            "min": 0.0,
            "median": 0.8890469416785206,
            "75%": 1.2617516081147946,
            "80%": 1.37797320398902,
            "90%": 1.8733429945338946,
            "95%": 2.6455874825730517,
            "99%": 6.252251764489181
        },
        "阅读率": {
            "mean": 0.0006051220910642054,
            "max": 0.06252537555826228,
            "min": 0.0,
            "median": 0.0002241206067691894,
            "75%": 0.0005117154674215644,
            "80%": 0.0006449975188817015,
            "90%": 0.001255232384471895,
            "95%": 0.002233845658277497,
            "99%": 0.00633843067255787
        },
        "小程序打开率": {
            "mean": 0.062085135696479415,
            "max": 1.0,
            "min": 0.0,
            "median": 0.045454545454545456,
            "75%": 0.08695652173913043,
            "80%": 0.1,
            "90%": 0.14285714285714285,
            "95%": 0.18518518518518517,
            "99%": 0.310463054187192
        },
        "T+0裂变率": {
            "mean": 0.35277482885383377,
            "max": 181.0,
            "min": 0.0,
            "median": 0.0,
            "75%": 0.0,
            "80%": 0.09090909090909091,
            "90%": 0.6666666666666666,
            "95%": 1.5,
            "99%": 6.0
        }
    }

    statMapTwoToOne = {
        "阅读均值倍数": {
            "mean": 1.0242728432910957,
            "max": 4.921632060507756,
            "min": 0.04236315118498048,
            "median": 0.9604958720021857,
            "75%": 1.237352622811623,
            "80%": 1.3131587863024974,
            "90%": 1.5778563945144477,
            "95%": 1.8312064951656155,
            "99%": 2.5125234834603165
        },
        "阅读率": {
            "mean": 0.0073535037464145655,
            "max": 0.05265662356955502,
            "min": 0.00020895172629276676,
            "median": 0.005941952332154309,
            "75%": 0.009324205525316574,
            "80%": 0.010420614811741105,
            "90%": 0.013728137204835086,
            "95%": 0.01704242661483454,
            "99%": 0.02622215995438508
        },
        "小程序打开率": {
            "mean": 0.14893695109764848,
            "max": 2.5,
            "min": 0.0,
            "median": 0.1360318513603185,
            "75%": 0.1875,
            "80%": 0.20230028849345147,
            "90%": 0.25449906489537877,
            "95%": 0.3051369784478383,
            "99%": 0.4016107123469446
        },
        "T+0裂变率": {
            "mean": 0.6465295965706923,
            "max": 12.804878048780488,
            "min": 0.0,
            "median": 0.48770491803278687,
            "75%": 0.8011363636363636,
            "80%": 0.9144722345551121,
            "90%": 1.317362236032163,
            "95%": 1.792137476827772,
            "99%": 3.277849462365585
        }
    }

    firstLevelMap = {
        "阅读均值倍数": {
            "mean": 1.0469541000103093,
            "max": 25.719380724649426,
            "min": 0.037429819089207735,
            "median": 0.9521466355025219,
            "75%": 1.2800839124458492,
            "80%": 1.370275508982941,
            "90%": 1.674800845262867,
            "95%": 1.995613204168999,
            "99%": 2.9869225601165135
        },
        "阅读率": {
            "mean": 0.016311355353310464,
            "max": 0.7427434456928839,
            "min": 0.0006011082360982278,
            "median": 0.01255841121495327,
            "75%": 0.020080845617803843,
            "80%": 0.022950649260452458,
            "90%": 0.03136776141996209,
            "95%": 0.0398727631704118,
            "99%": 0.05986584275411923
        },
        "小程序打开率": {
            "mean": 0.20655535828501095,
            "max": 0.8,
            "min": 0.0,
            "median": 0.19921326215228996,
            "75%": 0.25838983436476154,
            "80%": 0.27586206896551724,
            "90%": 0.32290043225754594,
            "95%": 0.3709317026683608,
            "99%": 0.4685840031614304
        },
        "T+0裂变率": {
            "mean": 0.6660929834568661,
            "max": 46.0,
            "min": 0.0,
            "median": 0.5434782608695652,
            "75%": 0.7940509083886685,
            "80%": 0.8776439089692103,
            "90%": 1.159075752014066,
            "95%": 1.62348848368522,
            "99%": 2.785400696864109
        }
    }
    def get_base_data(self):
        """
        Load per-article stats from the datastat_sort_strategy table.

        :return: DataFrame with columns renamed to ``self.columns``,
            sorted by 阅读均值倍数 descending, restricted to accounts
            with more than 10000 fans
        """
        # NOTE(review): the f-prefix is unnecessary (no placeholders), and the
        # mapping of SELECT order to self.columns (e.g. read_rate -> 阅读均值倍数,
        # first_read_rate -> 小程序打开率) looks surprising -- confirm against
        # the table schema.
        sql = f"""
        SELECT
        position, fans, view_count, avg_view_count, first_view_count, first_avg_view_count, read_rate, read_fans_rate, first_read_rate, fission0_first_rate, title, link
        FROM
        datastat_sort_strategy;
        """
        response = lam.select(sql)
        df = DataFrame(response, columns=self.columns)
        df = df.sort_values(by=["阅读均值倍数"], ascending=[False]).reset_index(drop=True)
        # Keep only accounts with a meaningful fan base.
        df = df[df["粉丝量"] > 10000].reset_index(drop=True)
        return df
  242. def analysis_data(self, index_list):
  243. """
  244. 分析 dataframe 中数据占比
  245. :return:
  246. """
  247. df = self.get_base_data()
  248. # 筛选指定位置的文章
  249. df = df[(df["位置"].isin(index_list))]
  250. avg_read_times = df['阅读均值倍数'].sort_values(ascending=False)
  251. read_rate = df['阅读率'].sort_values(ascending=False)
  252. mini_open_rate = df['小程序打开率'].sort_values(ascending=False)
  253. t_plus_0_fission = df['T+0裂变率'].sort_values(ascending=False)
  254. detail = {
  255. "阅读均值倍数": {
  256. "mean": avg_read_times.mean(),
  257. "max": avg_read_times.max(),
  258. "min": avg_read_times.min(),
  259. "median": avg_read_times.median(),
  260. "75%": avg_read_times.quantile(0.75),
  261. "80%": avg_read_times.quantile(0.8),
  262. "90%": avg_read_times.quantile(0.9),
  263. "95%": avg_read_times.quantile(0.95),
  264. "99%": avg_read_times.quantile(0.99)
  265. },
  266. "阅读率": {
  267. "mean": read_rate.mean(),
  268. "max": read_rate.max(),
  269. "min": read_rate.min(),
  270. "median": read_rate.median(),
  271. "75%": read_rate.quantile(0.75),
  272. "80%": read_rate.quantile(0.8),
  273. "90%": read_rate.quantile(0.9),
  274. "95%": read_rate.quantile(0.95),
  275. "99%": read_rate.quantile(0.99)
  276. },
  277. "小程序打开率": {
  278. "mean": mini_open_rate.mean(),
  279. "max": mini_open_rate.max(),
  280. "min": mini_open_rate.min(),
  281. "median": mini_open_rate.median(),
  282. "75%": mini_open_rate.quantile(0.75),
  283. "80%": mini_open_rate.quantile(0.8),
  284. "90%": mini_open_rate.quantile(0.9),
  285. "95%": mini_open_rate.quantile(0.95),
  286. "99%": mini_open_rate.quantile(0.99)
  287. },
  288. "T+0裂变率": {
  289. "mean": t_plus_0_fission.mean(),
  290. "max": t_plus_0_fission.max(),
  291. "min": t_plus_0_fission.min(),
  292. "median": t_plus_0_fission.median(),
  293. "75%": t_plus_0_fission.quantile(0.75),
  294. "80%": t_plus_0_fission.quantile(0.8),
  295. "90%": t_plus_0_fission.quantile(0.9),
  296. "95%": t_plus_0_fission.quantile(0.95),
  297. "99%": t_plus_0_fission.quantile(0.99)
  298. }
  299. }
  300. print(json.dumps(detail, ensure_ascii=False, indent=4))
    def find_good_articles(self, df, pool_level, index_list, read_count, read_avg_times):
        """
        Among already-published articles, pick the ones considered high
        quality and push their urls/titles to an account.

        :param read_avg_times: minimum 阅读均值倍数 (read-average multiple)
        :param read_count: minimum 阅读量 (read count)
        :param df: source DataFrame of published articles
        :param pool_level: traffic-pool level, used in the log line only
        :param index_list: article positions to include
        :return: None (result is pushed via add_url_list_to_account)
        """
        # Quality filter: enough reads, high enough multiple, right position.
        good_articles = df[
            (df['阅读量'] >= read_count) &
            (df['阅读均值倍数'] >= read_avg_times) &
            (df['位置'].isin(index_list))
        ]
        distinct_good_articles = filter_same_title(good_articles)
        # NOTE(review): '发布日期' is not among self.columns -- presumably the
        # df passed in here comes from a wider query; confirm with the caller.
        sorted_distinct_good_articles = distinct_good_articles.sort_values(by=['发布日期'],
        ascending=[False]).reset_index(drop=True)
        print(
            "流量池 level: {} 中,去重后一共 {} 篇优质文章".format(pool_level, len(sorted_distinct_good_articles.index)))
        url_list = []
        title_list = []
        for x in sorted_distinct_good_articles.to_dict(orient='records'):
            url_list.append(x['链接'])
            title_list.append(x['标题'])
        # NOTE(review): none of the names below (add_url_list_to_account,
        # account_nickname, pos, way, plan_key, tag, debug) are defined in the
        # visible part of this file -- this will raise NameError at runtime
        # unless they are defined elsewhere; confirm.
        add_url_list_to_account(
            account_nickname,
            url_list,
            title_list,
            pos,
            way,
            plan_key=plan_key,
            tag=tag,
            debug=debug,
        )