task2.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. """
  2. @author: luojunhui
  3. """
  4. import datetime
  5. import json
  6. from tqdm import tqdm
  7. from applications import AIDTApi, DeNetMysql, PQMySQL, Functions
  8. from config import cateMap, coldPoolArticlesNum, accountBaseInfo
  9. class ColdStartTask(object):
  10. """
  11. 冷启分配任务
  12. """
  13. AidApi = AIDTApi()
  14. DeMysql = DeNetMysql()
  15. PqMysql = PQMySQL()
  16. Fun = Functions()
  17. pool3 = "autoArticlePoolLevel3"
  18. @classmethod
  19. def generate_account_dict(cls):
  20. """
  21. 生成account_list
  22. :return:
  23. """
  24. account_dict = {}
  25. for key in accountBaseInfo:
  26. account_name = accountBaseInfo[key]['accountName']
  27. account_gh_id = accountBaseInfo[key]['ghId']
  28. account_dict[account_name] = account_gh_id
  29. return account_dict
  30. @classmethod
  31. def getTopArticles(cls, category, limit_count):
  32. """
  33. 获取高分享的文章list
  34. :return:
  35. """
  36. sql = f"""
  37. select content_channel_id, content_link, title
  38. from cold_start_article_pool
  39. where category = '{category}' and status = 1
  40. order by view_count DESC, publish_time_stamp DESC
  41. limit {limit_count};
  42. """
  43. result = cls.PqMysql.select(sql)
  44. content_id = [i[0] for i in result]
  45. sql2 = f"""
  46. update cold_start_article_pool
  47. set status = %s
  48. where content_channel_id in %s
  49. """
  50. cls.PqMysql.update(sql=sql2, params=(0, tuple(content_id)))
  51. return result
  52. @classmethod
  53. def getAccountScoreList(cls, title_list, account_name):
  54. """
  55. 预分配文章给不同的账号
  56. :return:
  57. """
  58. score_list = cls.Fun.getTitleScore(title_list=title_list, account_name=account_name)[account_name]['score_list']
  59. return score_list
  60. @classmethod
  61. def splitToAccount(cls, obj_list):
  62. """
  63. split articles to each account
  64. :return:
  65. """
  66. account_dict = cls.generate_account_dict()
  67. account_list = list(account_dict.keys())
  68. title_list = [i['title'] for i in obj_list]
  69. for account in tqdm(account_list):
  70. score_list = cls.getAccountScoreList(title_list=title_list, account_name=account)
  71. L = []
  72. for index, score in enumerate(score_list):
  73. L.append((obj_list[index]['id'], score))
  74. SL = sorted(L, key=lambda x: x[1], reverse=True)
  75. gh_id = account_dict[account]
  76. date_str = datetime.datetime.today().strftime("%Y-%m-%d")
  77. insert_sql = f"""
  78. INSERT INTO article_pre_distribute_account
  79. (gh_id, date, article_list)
  80. VALUES
  81. (%s, %s, %s);
  82. """
  83. try:
  84. PQMySQL.update(sql=insert_sql, params=(gh_id, date_str, json.dumps(SL[:30], ensure_ascii=False)))
  85. except Exception as e:
  86. print("插入出现问题----{}".format(e))
  87. print("成功更新完成")
  88. @classmethod
  89. def findArticlesDaily(cls):
  90. """
  91. 和每个账号计算相关性分数
  92. :return:
  93. """
  94. category_list = [
  95. "军事政法",
  96. "健康养生",
  97. "宗教历史",
  98. "情感生活",
  99. "娱乐八卦",
  100. "新闻媒体"
  101. ]
  102. L = []
  103. for category in tqdm(category_list):
  104. print("{} is processing......".format(category))
  105. limit_count = coldPoolArticlesNum * cateMap.get(category, 0.1)
  106. article_tuple = cls.getTopArticles(category, int(limit_count))
  107. title_list = [article[2] for article in article_tuple]
  108. score_list = cls.Fun.getTitleScore(title_list, "指尖奇文")['指尖奇文']['score_list']
  109. for index, score in enumerate(score_list):
  110. obj = {
  111. "id": article_tuple[index][0],
  112. "url": article_tuple[index][1],
  113. "title": article_tuple[index][2],
  114. "cate": category,
  115. "score": score
  116. }
  117. L.append(obj)
  118. result = [i for i in L if i['score'] >= 0.35]
  119. return result
  120. @classmethod
  121. def sendToColdPool(cls, plan_id, plan_name, plan_tag):
  122. """
  123. 把文章send至第四层
  124. :return:
  125. """
  126. # 获取6个品类的数据
  127. target_article_list = cls.findArticlesDaily()
  128. # 预分配账号
  129. cls.splitToAccount(target_article_list)
  130. # 再加一次配比,每个品类的数量占比
  131. cls.AidApi.updateArticleIntoCrawlerPlan(
  132. plan_id=plan_id,
  133. plan_name=plan_name,
  134. plan_tag=plan_tag,
  135. url_list=[i['url'] for i in target_article_list]
  136. )
  137. if __name__ == '__main__':
  138. CST = ColdStartTask()
  139. CST.sendToColdPool(
  140. plan_id=None,
  141. plan_name="冷启池子--0730--Monday--分品类抓取--6个品类",
  142. plan_tag="autoArticlePoolLevel3",
  143. )