# upLevel.py
  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. from pandas import DataFrame
  6. from datetime import datetime
  7. from applications import longArticlesMySQL
  8. lam = longArticlesMySQL()
  9. class articleLevelUp(object):
  10. """
  11. 文章晋级
  12. """
  13. columns = [
  14. "位置",
  15. "粉丝量",
  16. "阅读量",
  17. "平均阅读量",
  18. "头条阅读量",
  19. "头条平均阅读量",
  20. "阅读均值倍数",
  21. "阅读率",
  22. "小程序打开率",
  23. "T+0裂变率",
  24. "标题",
  25. "链接"
  26. ]
  27. statMapThreeToEight = {
  28. "阅读均值倍数": {
  29. "mean": 1.1388723507368606,
  30. "max": 62.50000000000001,
  31. "min": 0.0,
  32. "median": 0.8890469416785206,
  33. "75%": 1.2617516081147946,
  34. "80%": 1.37797320398902,
  35. "90%": 1.8733429945338946,
  36. "95%": 2.6455874825730517,
  37. "99%": 6.252251764489181
  38. },
  39. "阅读率": {
  40. "mean": 0.0006051220910642054,
  41. "max": 0.06252537555826228,
  42. "min": 0.0,
  43. "median": 0.0002241206067691894,
  44. "75%": 0.0005117154674215644,
  45. "80%": 0.0006449975188817015,
  46. "90%": 0.001255232384471895,
  47. "95%": 0.002233845658277497,
  48. "99%": 0.00633843067255787
  49. },
  50. "小程序打开率": {
  51. "mean": 0.062085135696479415,
  52. "max": 1.0,
  53. "min": 0.0,
  54. "median": 0.045454545454545456,
  55. "75%": 0.08695652173913043,
  56. "80%": 0.1,
  57. "90%": 0.14285714285714285,
  58. "95%": 0.18518518518518517,
  59. "99%": 0.310463054187192
  60. },
  61. "T+0裂变率": {
  62. "mean": 0.35277482885383377,
  63. "max": 181.0,
  64. "min": 0.0,
  65. "median": 0.0,
  66. "75%": 0.0,
  67. "80%": 0.09090909090909091,
  68. "90%": 0.6666666666666666,
  69. "95%": 1.5,
  70. "99%": 6.0
  71. }
  72. }
  73. statMapTwoToOne = {
  74. "阅读均值倍数": {
  75. "mean": 1.0242728432910957,
  76. "max": 4.921632060507756,
  77. "min": 0.04236315118498048,
  78. "median": 0.9604958720021857,
  79. "75%": 1.237352622811623,
  80. "80%": 1.3131587863024974,
  81. "90%": 1.5778563945144477,
  82. "95%": 1.8312064951656155,
  83. "99%": 2.5125234834603165
  84. },
  85. "阅读率": {
  86. "mean": 0.0073535037464145655,
  87. "max": 0.05265662356955502,
  88. "min": 0.00020895172629276676,
  89. "median": 0.005941952332154309,
  90. "75%": 0.009324205525316574,
  91. "80%": 0.010420614811741105,
  92. "90%": 0.013728137204835086,
  93. "95%": 0.01704242661483454,
  94. "99%": 0.02622215995438508
  95. },
  96. "小程序打开率": {
  97. "mean": 0.14893695109764848,
  98. "max": 2.5,
  99. "min": 0.0,
  100. "median": 0.1360318513603185,
  101. "75%": 0.1875,
  102. "80%": 0.20230028849345147,
  103. "90%": 0.25449906489537877,
  104. "95%": 0.3051369784478383,
  105. "99%": 0.4016107123469446
  106. },
  107. "T+0裂变率": {
  108. "mean": 0.6465295965706923,
  109. "max": 12.804878048780488,
  110. "min": 0.0,
  111. "median": 0.48770491803278687,
  112. "75%": 0.8011363636363636,
  113. "80%": 0.9144722345551121,
  114. "90%": 1.317362236032163,
  115. "95%": 1.792137476827772,
  116. "99%": 3.277849462365585
  117. }
  118. }
  119. firstLevelMap = {
  120. "阅读均值倍数": {
  121. "mean": 1.0469541000103093,
  122. "max": 25.719380724649426,
  123. "min": 0.037429819089207735,
  124. "median": 0.9521466355025219,
  125. "75%": 1.2800839124458492,
  126. "80%": 1.370275508982941,
  127. "90%": 1.674800845262867,
  128. "95%": 1.995613204168999,
  129. "99%": 2.9869225601165135
  130. },
  131. "阅读率": {
  132. "mean": 0.016311355353310464,
  133. "max": 0.7427434456928839,
  134. "min": 0.0006011082360982278,
  135. "median": 0.01255841121495327,
  136. "75%": 0.020080845617803843,
  137. "80%": 0.022950649260452458,
  138. "90%": 0.03136776141996209,
  139. "95%": 0.0398727631704118,
  140. "99%": 0.05986584275411923
  141. },
  142. "小程序打开率": {
  143. "mean": 0.20655535828501095,
  144. "max": 0.8,
  145. "min": 0.0,
  146. "median": 0.19921326215228996,
  147. "75%": 0.25838983436476154,
  148. "80%": 0.27586206896551724,
  149. "90%": 0.32290043225754594,
  150. "95%": 0.3709317026683608,
  151. "99%": 0.4685840031614304
  152. },
  153. "T+0裂变率": {
  154. "mean": 0.6660929834568661,
  155. "max": 46.0,
  156. "min": 0.0,
  157. "median": 0.5434782608695652,
  158. "75%": 0.7940509083886685,
  159. "80%": 0.8776439089692103,
  160. "90%": 1.159075752014066,
  161. "95%": 1.62348848368522,
  162. "99%": 2.785400696864109
  163. }
  164. }
  165. @classmethod
  166. def readRateDebias(cls, row):
  167. """
  168. 阅读均值倍数通过头条消偏
  169. :param row:
  170. :return:
  171. """
  172. if row["位置"] != 1:
  173. return row["阅读量"] / (
  174. max(1.0, row["头条阅读量"] / row["头条阅读均值"]) * row["阅读均值"]
  175. )
  176. else:
  177. return row["阅读均值倍数"]
  178. @classmethod
  179. def getBaseData(cls):
  180. """
  181. :return:
  182. """
  183. sql = f"""
  184. SELECT
  185. position, fans, view_count, avg_view_count, first_view_count, first_avg_view_count, read_rate, read_fans_rate, first_read_rate, fission0_first_rate, title, link
  186. FROM
  187. datastat_sort_strategy;
  188. """
  189. response = lam.select(sql)
  190. df = DataFrame(response, columns=cls.columns)
  191. df = df.sort_values(by=["阅读均值倍数"], ascending=[False]).reset_index(drop=True)
  192. df = df[df["粉丝量"] > 10000].reset_index(drop=True)
  193. return df
  194. @classmethod
  195. def analysisDF(cls, indexList):
  196. """
  197. 分析 dataframe 中数据占比
  198. :return:
  199. """
  200. DF = cls.getBaseData()
  201. DF = DF[(DF["位置"].isin(indexList))]
  202. print(len(DF))
  203. avg_read_times = DF['阅读均值倍数'].sort_values(ascending=False)
  204. read_rate = DF['阅读率'].sort_values(ascending=False)
  205. mini_open_rate = DF['小程序打开率'].sort_values(ascending=False)
  206. t_plus_0_fission = DF['T+0裂变率'].sort_values(ascending=False)
  207. detail = {
  208. "阅读均值倍数": {
  209. "mean": avg_read_times.mean(),
  210. "max": avg_read_times.max(),
  211. "min": avg_read_times.min(),
  212. "median": avg_read_times.median(),
  213. "75%": avg_read_times.quantile(0.75),
  214. "80%": avg_read_times.quantile(0.8),
  215. "90%": avg_read_times.quantile(0.9),
  216. "95%": avg_read_times.quantile(0.95),
  217. "99%": avg_read_times.quantile(0.99)
  218. },
  219. "阅读率": {
  220. "mean": read_rate.mean(),
  221. "max": read_rate.max(),
  222. "min": read_rate.min(),
  223. "median": read_rate.median(),
  224. "75%": read_rate.quantile(0.75),
  225. "80%": read_rate.quantile(0.8),
  226. "90%": read_rate.quantile(0.9),
  227. "95%": read_rate.quantile(0.95),
  228. "99%": read_rate.quantile(0.99)
  229. },
  230. "小程序打开率": {
  231. "mean": mini_open_rate.mean(),
  232. "max": mini_open_rate.max(),
  233. "min": mini_open_rate.min(),
  234. "median": mini_open_rate.median(),
  235. "75%": mini_open_rate.quantile(0.75),
  236. "80%": mini_open_rate.quantile(0.8),
  237. "90%": mini_open_rate.quantile(0.9),
  238. "95%": mini_open_rate.quantile(0.95),
  239. "99%": mini_open_rate.quantile(0.99)
  240. },
  241. "T+0裂变率": {
  242. "mean": t_plus_0_fission.mean(),
  243. "max": t_plus_0_fission.max(),
  244. "min": t_plus_0_fission.min(),
  245. "median": t_plus_0_fission.median(),
  246. "75%": t_plus_0_fission.quantile(0.75),
  247. "80%": t_plus_0_fission.quantile(0.8),
  248. "90%": t_plus_0_fission.quantile(0.9),
  249. "95%": t_plus_0_fission.quantile(0.95),
  250. "99%": t_plus_0_fission.quantile(0.99)
  251. }
  252. }
  253. print(json.dumps(detail, ensure_ascii=False, indent=4))
  254. @classmethod
  255. def upLevel38To2(cls):
  256. """
  257. :return:
  258. """
  259. dataThreeToEight = cls.getBaseData()
  260. dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([3, 4, 5, 6, 7, 8])]
  261. filter_data = dataThreeToEight[
  262. (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['95%'])
  263. & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['80%'])
  264. ]
  265. return filter_data
  266. @classmethod
  267. def upLevel2To1(cls):
  268. """
  269. :return:
  270. """
  271. dataThreeToEight = cls.getBaseData()
  272. dataThreeToEight = dataThreeToEight[dataThreeToEight['位置'].isin([2])]
  273. filter_data = dataThreeToEight[
  274. (dataThreeToEight['T+0裂变率'] > cls.statMapThreeToEight['T+0裂变率']['90%'])
  275. & (dataThreeToEight['阅读均值倍数'] > cls.statMapThreeToEight['阅读均值倍数']['90%'])
  276. ]
  277. return filter_data
  278. U = articleLevelUp()
  279. U.analysisDF(indexList=[1])
  280. f_d = U.upLevel2To1()
  281. for line in list(zip(f_d['标题'], f_d['链接'])):
  282. print(line[0])
  283. print(line[1])
  284. print("\n")