# task2_dev.py
"""
@author: luojunhui
"""
import datetime
import json
from collections import defaultdict

from tqdm import tqdm

from applications import AIDTApi, DeNetMysql, PQMySQL, Functions, WeixinSpider
from config import cateMap, coldPoolArticlesNum, accountBaseInfo
  9. class ColdStartTask(object):
  10. """
  11. 冷启分配任务
  12. """
  13. AidApi = AIDTApi()
  14. DeMysql = DeNetMysql()
  15. PqMysql = PQMySQL()
  16. Fun = Functions()
  17. Spider = WeixinSpider()
  18. pool3 = "autoArticlePoolLevel3"
  19. @classmethod
  20. def generate_account_dict(cls):
  21. """
  22. 生成account_list
  23. :return:
  24. """
  25. account_dict = {}
  26. for key in accountBaseInfo:
  27. account_name = accountBaseInfo[key]['accountName']
  28. account_gh_id = accountBaseInfo[key]['ghId']
  29. account_dict[account_gh_id] = account_name
  30. return account_dict
  31. @classmethod
  32. def usedArticle(cls, content_id):
  33. """
  34. 已经使用的文章,把文章状态改为0
  35. :return:
  36. """
  37. print("useful", content_id)
  38. @classmethod
  39. def badArticle(cls, content_id):
  40. """
  41. 低分的文章,把文章状态改为2
  42. :return:
  43. """
  44. print("bad", content_id)
  45. @classmethod
  46. def getTopArticles(cls, category, limit_count):
  47. """
  48. 获取高分享的文章list
  49. :return:
  50. """
  51. sql = f"""
  52. select content_channel_id, content_link, title
  53. from cold_start_article_pool
  54. where category = '{category}' and status = 1
  55. order by view_count DESC, publish_time_stamp DESC
  56. limit {limit_count};
  57. """
  58. result = cls.PqMysql.select(sql)
  59. return result
  60. @classmethod
  61. def splitCategoryToAccount(cls, cate_list):
  62. """
  63. split articles to each account
  64. :return:
  65. """
  66. account_index_info = {
  67. "gh_058e41145a0c": 30,
  68. "gh_0e4fd9e88386": 30,
  69. "gh_744cb16f6e16": 30,
  70. "gh_ac43eb24376d": 30,
  71. "gh_970460d9ccec": 30,
  72. "gh_56ca3dae948c": 30,
  73. "gh_c91b42649690": 30,
  74. "gh_6d205db62f04": 30,
  75. "gh_e24da99dc899": 30,
  76. "gh_4c058673c07e": 30,
  77. "gh_03d32e83122f": 30,
  78. "gh_c69776baf2cd": 30,
  79. "gh_30816d8adb52": 30,
  80. "gh_789a40fe7935": 30,
  81. "gh_95ed5ecf9363": 30,
  82. "gh_3e91f0624545": 30,
  83. "gh_57573f01b2ee": 30,
  84. "gh_9877c8541764": 30,
  85. "gh_6cfd1132df94": 30,
  86. "gh_008ef23062ee": 30,
  87. "gh_5ae65db96cb7": 30,
  88. "gh_be8c29139989": 30,
  89. "gh_51e4ad40466d": 30,
  90. "gh_d4dffc34ac39": 30,
  91. "gh_89ef4798d3ea": 30,
  92. "gh_b15de7c99912": 30,
  93. "gh_9f8dc5b0c74e": 30,
  94. "gh_7b4a5f86d68c": 30,
  95. "gh_c5cdf60d9ab4": 5,
  96. "gh_0c89e11f8bf3": 5,
  97. "gh_e0eb490115f5": 5,
  98. "gh_a2901d34f75b": 5,
  99. "gh_d5f935d0d1f2": 30
  100. }
  101. account_dict = cls.generate_account_dict()
  102. account_list = list(account_index_info.keys())
  103. title_list = [i['title'] for i in cate_list]
  104. L_map = {}
  105. for account_id in account_list:
  106. account_name = account_dict[account_id]
  107. score_list = cls.Fun.getTitleScore(title_list=title_list, account_name=account_name)[account_name]['score_list']
  108. for index, score in enumerate(score_list):
  109. channel_content_id = cate_list[index]['id']
  110. item = tuple([account_id, score])
  111. if L_map.get(channel_content_id):
  112. L_map[channel_content_id].append(item)
  113. else:
  114. L_map[channel_content_id] = [item]
  115. for key in L_map:
  116. L_map[key] = sorted(L_map[key], reverse=True, key=lambda x: x[1])
  117. content_account = []
  118. for item in cate_list:
  119. content_id = item['id']
  120. account_list = L_map[content_id]
  121. for account_tuple in account_list:
  122. gh_id, score = account_tuple[0], account_tuple[1]
  123. if account_index_info[gh_id] > 0:
  124. sub_item = tuple([content_id, gh_id, score])
  125. content_account.append(sub_item)
  126. account_index_info[gh_id] -= 1
  127. break
  128. # return content_account
  129. account_article_dict = {}
  130. for item in content_account:
  131. content_id, gh_id, score = item
  132. sub_i = tuple([content_id, score])
  133. if account_article_dict.get(gh_id):
  134. account_article_dict[gh_id].append(sub_i)
  135. else:
  136. account_article_dict[gh_id] = [sub_i]
  137. for account in tqdm(account_article_dict):
  138. date_str = datetime.datetime.today().strftime("%Y-%m-%d")
  139. print(account)
  140. print(account_article_dict[account])
  141. # insert_sql = f"""
  142. # INSERT INTO article_pre_distribute_account
  143. # (gh_id, date, article_list)
  144. # VALUES
  145. # (%s, %s, %s);
  146. # """
  147. # try:
  148. # PQMySQL.update(sql=insert_sql, params=(
  149. # account, date_str, json.dumps(account_article_dict[account], ensure_ascii=False)))
  150. # except Exception as e:
  151. # print("插入出现问题----{}".format(e))
  152. print("成功更新完成")
  153. @classmethod
  154. def findCategoryArticlesDaily(cls):
  155. """
  156. 和每个账号计算相关性分数
  157. :return:
  158. """
  159. category_list = [
  160. "军事政法",
  161. "健康养生",
  162. "宗教历史",
  163. "情感生活",
  164. "娱乐八卦",
  165. "新闻媒体",
  166. ]
  167. L = []
  168. for category in tqdm(category_list):
  169. print("{} is processing......".format(category))
  170. category_total = coldPoolArticlesNum * cateMap.get(category, 0.1)
  171. category_count = 0
  172. while category_count < category_total:
  173. article_list = cls.getTopArticles(category, 10)
  174. if len(article_list) == 0:
  175. print("{}: 该品类没有数据了!".format(category))
  176. break
  177. title_list = [article[2] for article in article_list]
  178. score_list = cls.Fun.getTitleScore(title_list, "指尖奇文")['指尖奇文']['score_list']
  179. for index, score in enumerate(score_list):
  180. content_id = article_list[index][0]
  181. if score >= 0.35:
  182. obj = {
  183. "id": article_list[index][0],
  184. "url": article_list[index][1],
  185. "title": article_list[index][2],
  186. "cate": category,
  187. "score": score
  188. }
  189. category_count += 1
  190. # cls.usedArticle(content_id=content_id)
  191. print("used_article")
  192. L.append(obj)
  193. else:
  194. # cls.badArticle(content_id=content_id)
  195. print("bad article")
  196. print(category_count)
  197. return L
  198. @classmethod
  199. def findAssociationArticlesDaily(cls):
  200. """
  201. 获取相关文章
  202. :return:
  203. """
  204. target_num = int(0.4 * coldPoolArticlesNum)
  205. sql = f"""
  206. select id, ori_account_name, association_url, association_title, url_md5
  207. from association_articles
  208. where status = 1
  209. order by read_count DESC
  210. limit {target_num};"""
  211. temp_list = cls.PqMysql.select(sql)
  212. article_list = []
  213. for i in tqdm(temp_list):
  214. try:
  215. o = {
  216. "related_account_name": i[1],
  217. "url": i[2],
  218. "title": i[3],
  219. "url_md5": i[4],
  220. # "id": i[4]
  221. "id": cls.Spider.get_article_text(i[2])['data']['data']['channel_content_id']
  222. }
  223. except:
  224. o = {
  225. "related_account_name": i[1],
  226. "url": i[2],
  227. "title": i[3],
  228. "url_md5": i[4],
  229. "id": i[4]
  230. }
  231. article_list.append(o)
  232. return article_list
  233. @classmethod
  234. def sendToColdPool(cls, plan_id=None):
  235. """
  236. 把文章send至第四层
  237. :return:
  238. """
  239. # 获取6个品类的数据
  240. category_list = cls.findCategoryArticlesDaily()
  241. # cls.splitCategoryToAccount(category_list)
  242. # for line in category_list:
  243. # print(line)
  244. association_list = cls.findAssociationArticlesDaily()
  245. # for line in association_list:
  246. # print(line)
  247. # 预分配账号
  248. d_list = association_list + category_list
  249. cls.splitCategoryToAccount(d_list)
  250. if __name__ == '__main__':
  251. CT = ColdStartTask()
  252. CT.sendToColdPool()