"""
Cold-start article pre-distribution task (task2.py).

@author: luojunhui
"""
  4. import datetime
  5. import json
  6. from tqdm import tqdm
  7. from applications import AIDTApi, DeNetMysql, PQMySQL, Functions
  8. from config import poolLevelConfig, cateMap, coldPoolArticlesNum, accountBaseInfo
  9. class ColdStartTask(object):
  10. """
  11. 冷启分配任务
  12. """
  13. AidApi = AIDTApi()
  14. DeMysql = DeNetMysql()
  15. PqMysql = PQMySQL()
  16. Fun = Functions()
  17. pool3 = poolLevelConfig['3']
  18. @classmethod
  19. def generate_account_dict(cls):
  20. """
  21. 生成account_list
  22. :return:
  23. """
  24. account_dict = {}
  25. for key in accountBaseInfo:
  26. account_name = accountBaseInfo[key]['accountName']
  27. account_gh_id = accountBaseInfo[key]['ghId']
  28. account_dict[account_name] = account_gh_id
  29. return account_dict
  30. @classmethod
  31. def getTopArticles(cls, category, limit_count):
  32. """
  33. 获取高分享的文章list
  34. :return:
  35. """
  36. sql = f"""
  37. select content_channel_id, content_link, title
  38. from cold_start_article_pool
  39. where category = '{category}'
  40. order by view_count DESC, publish_time_stamp DESC
  41. limit {limit_count};
  42. """
  43. result = cls.PqMysql.select(sql)
  44. return result
  45. @classmethod
  46. def getAccountScoreList(cls, title_list, account_name):
  47. """
  48. 预分配文章给不同的账号
  49. :return:
  50. """
  51. score_list = cls.Fun.getTitleScore(title_list=title_list, account_name=account_name)[account_name]['score_list']
  52. return score_list
  53. @classmethod
  54. def splitToAccount(cls, obj_list):
  55. """
  56. split articles to each account
  57. :return:
  58. """
  59. account_dict = cls.generate_account_dict()
  60. account_list = list(account_dict.keys())
  61. title_list = [i['title'] for i in obj_list]
  62. for account in tqdm(account_list):
  63. score_list = cls.getAccountScoreList(title_list=title_list, account_name=account)
  64. L = []
  65. for index, score in enumerate(score_list):
  66. L.append((obj_list[index]['id'], score))
  67. SL = sorted(L, key=lambda x: x[1], reverse=True)
  68. gh_id = account_dict[account]
  69. date_str = datetime.datetime.today().strftime("%Y-%m-%d")
  70. insert_sql = f"""
  71. INSERT INTO article_pre_distribute_account
  72. (gh_id, date, article_list)
  73. VALUES
  74. (%s, %s, %s);
  75. """
  76. try:
  77. PQMySQL.update(sql=insert_sql, params=(gh_id, date_str, json.dumps(SL[:30], ensure_ascii=False)))
  78. except Exception as e:
  79. print("插入出现问题----{}".format(e))
  80. print("成功更新完成")
  81. @classmethod
  82. def findArticlesDaily(cls):
  83. """
  84. 和每个账号计算相关性分数
  85. :return:
  86. """
  87. category_list = [
  88. "军事政法",
  89. "健康养生",
  90. "宗教历史",
  91. "情感生活",
  92. "娱乐八卦",
  93. "新闻媒体"
  94. ]
  95. L = []
  96. for category in tqdm(category_list):
  97. print("{} is processing......".format(category))
  98. limit_count = coldPoolArticlesNum * cateMap.get(category, 0.1)
  99. article_tuple = cls.getTopArticles(category, int(limit_count))
  100. title_list = [article[2] for article in article_tuple]
  101. score_list = cls.Fun.getTitleScore(title_list, "指尖奇文")['指尖奇文']['score_list']
  102. for index, score in enumerate(score_list):
  103. obj = {
  104. "id": article_tuple[index][0],
  105. "url": article_tuple[index][1],
  106. "title": article_tuple[index][2],
  107. "cate": category,
  108. "score": score
  109. }
  110. L.append(obj)
  111. result = [i for i in L if i['score'] >= 0.35]
  112. return result
  113. @classmethod
  114. def sendToColdPool(cls, plan_id, plan_name, plan_tag):
  115. """
  116. 把文章send至第四层
  117. :return:
  118. """
  119. # 获取6个品类的数据
  120. target_article_list = cls.findArticlesDaily()
  121. # 预分配账号
  122. cls.splitToAccount(target_article_list)
  123. # 再加一次配比,每个品类的数量占比
  124. cls.AidApi.updateArticleIntoCrawlerPlan(
  125. plan_id=plan_id,
  126. plan_name=plan_name,
  127. plan_tag=plan_tag,
  128. url_list=[i['url'] for i in target_article_list]
  129. )
  130. if __name__ == '__main__':
  131. CST = ColdStartTask()
  132. CST.sendToColdPool(
  133. plan_id=None,
  134. plan_name="冷启池子--0729--Monday--分品类抓取--6个品类",
  135. plan_tag="autoArticlePoolLevel1",
  136. )