# task2.py (12 KB)
"""
@author: luojunhui
"""
import datetime
import json
from collections import defaultdict

from tqdm import tqdm

from applications import AIDTApi, DeNetMysql, PQMySQL, Functions, WeixinSpider
from config import cateMap, coldPoolArticlesNum, accountBaseInfo
  9. class ColdStartTask(object):
  10. """
  11. 冷启分配任务
  12. """
  13. AidApi = AIDTApi()
  14. DeMysql = DeNetMysql()
  15. PqMysql = PQMySQL()
  16. Fun = Functions()
  17. Spider = WeixinSpider()
  18. pool3 = "autoArticlePoolLevel3"
  19. @classmethod
  20. def generate_account_dict(cls):
  21. """
  22. 生成account_list
  23. :return:
  24. """
  25. account_dict = {}
  26. for key in accountBaseInfo:
  27. account_name = accountBaseInfo[key]['accountName']
  28. account_gh_id = accountBaseInfo[key]['ghId']
  29. account_dict[account_gh_id] = account_name
  30. return account_dict
  31. @classmethod
  32. def usedArticle(cls, content_id):
  33. """
  34. 已经使用的文章,把文章状态改为0
  35. :return:
  36. """
  37. sql = f"""
  38. update cold_start_article_pool
  39. set status = %s
  40. where content_channel_id = %s;
  41. """
  42. cls.PqMysql.update(sql=sql, params=(0, content_id))
  43. @classmethod
  44. def badArticle(cls, content_id):
  45. """
  46. 低分的文章,把文章状态改为2
  47. :return:
  48. """
  49. sql = f"""
  50. update cold_start_article_pool
  51. set status = %s
  52. where content_channel_id = %s;
  53. """
  54. cls.PqMysql.update(sql=sql, params=(2, content_id))
  55. @classmethod
  56. def getTopArticles(cls, category, limit_count):
  57. """
  58. 获取高分享的文章list
  59. :return:
  60. """
  61. sql = f"""
  62. select content_channel_id, content_link, title
  63. from cold_start_article_pool
  64. where category = '{category}' and status = 1
  65. order by view_count DESC, publish_time_stamp DESC
  66. limit {limit_count};
  67. """
  68. result = cls.PqMysql.select(sql)
  69. return result
  70. @classmethod
  71. def splitCategoryToAccount(cls, cate_list):
  72. """
  73. split articles to each account
  74. :return:
  75. """
  76. account_index_info = {
  77. "gh_058e41145a0c": 30,
  78. "gh_0e4fd9e88386": 30,
  79. "gh_744cb16f6e16": 30,
  80. "gh_ac43eb24376d": 30,
  81. "gh_970460d9ccec": 30,
  82. "gh_56ca3dae948c": 30,
  83. "gh_c91b42649690": 30,
  84. "gh_6d205db62f04": 30,
  85. "gh_e24da99dc899": 30,
  86. "gh_4c058673c07e": 30,
  87. "gh_03d32e83122f": 30,
  88. "gh_c69776baf2cd": 30,
  89. "gh_30816d8adb52": 30,
  90. "gh_789a40fe7935": 30,
  91. "gh_95ed5ecf9363": 30,
  92. "gh_3e91f0624545": 30,
  93. "gh_57573f01b2ee": 30,
  94. "gh_9877c8541764": 30,
  95. "gh_6cfd1132df94": 30,
  96. "gh_008ef23062ee": 30,
  97. "gh_5ae65db96cb7": 30,
  98. "gh_be8c29139989": 30,
  99. "gh_51e4ad40466d": 30,
  100. "gh_d4dffc34ac39": 30,
  101. "gh_89ef4798d3ea": 30,
  102. "gh_b15de7c99912": 30,
  103. "gh_9f8dc5b0c74e": 30,
  104. "gh_7b4a5f86d68c": 30,
  105. "gh_c5cdf60d9ab4": 5,
  106. "gh_0c89e11f8bf3": 5,
  107. "gh_e0eb490115f5": 5,
  108. "gh_a2901d34f75b": 5,
  109. "gh_d5f935d0d1f2": 30
  110. }
  111. account_dict = cls.generate_account_dict()
  112. account_list = list(account_index_info.keys())
  113. title_list = [i['title'] for i in cate_list]
  114. L_map = {}
  115. for account_id in account_list:
  116. account_name = account_dict[account_id]
  117. score_list = cls.Fun.getTitleScore(title_list=title_list, account_name=account_name)[account_name]['score_list']
  118. for index, score in enumerate(score_list):
  119. channel_content_id = cate_list[index]['id']
  120. item = tuple([account_id, score])
  121. if L_map.get(channel_content_id):
  122. L_map[channel_content_id].append(item)
  123. else:
  124. L_map[channel_content_id] = [item]
  125. for key in L_map:
  126. L_map[key] = sorted(L_map[key], reverse=True, key=lambda x: x[1])
  127. content_account = []
  128. for item in cate_list:
  129. content_id = item['id']
  130. account_list = L_map[content_id]
  131. for account_tuple in account_list:
  132. gh_id, score = account_tuple[0], account_tuple[1]
  133. if account_index_info[gh_id] > 0:
  134. sub_item = tuple([content_id, gh_id, score])
  135. content_account.append(sub_item)
  136. account_index_info[gh_id] -= 1
  137. break
  138. # return content_account
  139. account_article_dict = {}
  140. for item in content_account:
  141. content_id, gh_id, score = item
  142. sub_i = tuple([content_id, score])
  143. if account_article_dict.get(gh_id):
  144. account_article_dict[gh_id].append(sub_i)
  145. else:
  146. account_article_dict[gh_id] = [sub_i]
  147. for account in tqdm(account_article_dict):
  148. date_str = datetime.datetime.today().strftime("%Y-%m-%d")
  149. insert_sql = f"""
  150. INSERT INTO article_pre_distribute_account
  151. (gh_id, date, article_list)
  152. VALUES
  153. (%s, %s, %s);
  154. """
  155. try:
  156. PQMySQL.update(sql=insert_sql, params=(account, date_str, json.dumps(account_article_dict[account], ensure_ascii=False)))
  157. except Exception as e:
  158. print("插入出现问题----{}".format(e))
  159. print("成功更新完成")
  160. @classmethod
  161. def findCategoryArticlesDaily(cls):
  162. """
  163. 和每个账号计算相关性分数
  164. :return:
  165. """
  166. category_list = [
  167. "军事政法",
  168. "健康养生",
  169. "宗教历史",
  170. "情感生活",
  171. "娱乐八卦",
  172. "新闻媒体",
  173. ]
  174. L = []
  175. for category in tqdm(category_list):
  176. print("{} is processing......".format(category))
  177. category_total = coldPoolArticlesNum * cateMap.get(category, 0.1)
  178. category_count = 0
  179. while category_count < category_total:
  180. article_list = cls.getTopArticles(category, 10)
  181. if len(article_list) == 0:
  182. print("{}: 该品类没有数据了!".format(category))
  183. break
  184. title_list = [article[2] for article in article_list]
  185. score_list = cls.Fun.getTitleScore(title_list, "指尖奇文")['指尖奇文']['score_list']
  186. for index, score in enumerate(score_list):
  187. content_id = article_list[index][0]
  188. if score >= 0.35:
  189. obj = {
  190. "id": article_list[index][0],
  191. "url": article_list[index][1],
  192. "title": article_list[index][2],
  193. "cate": category,
  194. "score": score
  195. }
  196. category_count += 1
  197. cls.usedArticle(content_id=content_id)
  198. print("used_article")
  199. L.append(obj)
  200. else:
  201. cls.badArticle(content_id=content_id)
  202. print("bad article")
  203. print(category_count)
  204. return L
  205. @classmethod
  206. def findAssociationArticlesDaily(cls):
  207. """
  208. 获取相关文章
  209. :return:
  210. """
  211. target_num = int(0.4 * coldPoolArticlesNum)
  212. sql = f"""
  213. select id, ori_account_name, association_url, association_title, url_md5
  214. from association_articles
  215. where status = 1
  216. order by read_count DESC
  217. limit {target_num};"""
  218. temp_list = cls.PqMysql.select(sql)
  219. id_tuple = tuple([i[0] for i in temp_list])
  220. update_sql = f"""
  221. update association_articles
  222. set status = %s
  223. where id in %s
  224. """
  225. cls.PqMysql.update(sql=update_sql, params=(0, id_tuple))
  226. article_list = []
  227. for i in tqdm(temp_list):
  228. try:
  229. o = {
  230. "related_account_name": i[1],
  231. "url": i[2],
  232. "title": i[3],
  233. "url_md5": i[4],
  234. # "id": i[4]
  235. "id": cls.Spider.get_article_text(i[2])['data']['data']['channel_content_id']
  236. }
  237. except:
  238. o = {
  239. "related_account_name": i[1],
  240. "url": i[2],
  241. "title": i[3],
  242. "url_md5": i[4],
  243. "id": i[4]
  244. }
  245. article_list.append(o)
  246. return article_list
  247. @classmethod
  248. def sendToColdPool(cls, plan_id=None):
  249. """
  250. 把文章send至第四层
  251. :return:
  252. """
  253. # 获取6个品类的数据
  254. association_list = cls.findAssociationArticlesDaily()
  255. category_list = cls.findCategoryArticlesDaily()
  256. d_list = category_list + association_list
  257. # 预分配账号
  258. cls.splitCategoryToAccount(d_list)
  259. try:
  260. army = [i for i in category_list if i['cate'] == '军事政法']
  261. cls.AidApi.updateArticleIntoCrawlerPlan(
  262. plan_id=plan_id,
  263. plan_name="军事政法类冷启",
  264. plan_tag=cls.pool3,
  265. url_list=[i['url'] for i in army]
  266. )
  267. except Exception as e:
  268. print("error--{}".format(e))
  269. try:
  270. history = [i for i in category_list if i['cate'] == '宗教历史']
  271. cls.AidApi.updateArticleIntoCrawlerPlan(
  272. plan_id=plan_id,
  273. plan_name="宗教历史类冷启",
  274. plan_tag=cls.pool3,
  275. url_list=[i['url'] for i in history]
  276. )
  277. except Exception as e:
  278. print("error--{}".format(e))
  279. try:
  280. news = [i for i in category_list if i['cate'] == '新闻媒体']
  281. cls.AidApi.updateArticleIntoCrawlerPlan(
  282. plan_id=plan_id,
  283. plan_name="新闻媒体类冷启",
  284. plan_tag=cls.pool3,
  285. url_list=[i['url'] for i in news]
  286. )
  287. except Exception as e:
  288. print("error--{}".format(e))
  289. try:
  290. life = [i for i in category_list if i['cate'] == '情感生活']
  291. cls.AidApi.updateArticleIntoCrawlerPlan(
  292. plan_id=plan_id,
  293. plan_name="生活情感类冷启",
  294. plan_tag=cls.pool3,
  295. url_list=[i['url'] for i in life]
  296. )
  297. except Exception as e:
  298. print("error--{}".format(e))
  299. try:
  300. healthy = [i for i in category_list if i['cate'] == '健康养生']
  301. cls.AidApi.updateArticleIntoCrawlerPlan(
  302. plan_id=plan_id,
  303. plan_name="健康养生类冷启",
  304. plan_tag=cls.pool3,
  305. url_list=[i['url'] for i in healthy]
  306. )
  307. except Exception as e:
  308. print("error--{}".format(e))
  309. try:
  310. fun = [i for i in category_list if i['cate'] == '娱乐八卦']
  311. cls.AidApi.updateArticleIntoCrawlerPlan(
  312. plan_id=plan_id,
  313. plan_name="娱乐八卦类冷启",
  314. plan_tag=cls.pool3,
  315. url_list=[i['url'] for i in fun]
  316. )
  317. except Exception as e:
  318. print("error--{}".format(e))
  319. cls.AidApi.updateArticleIntoCrawlerPlan(
  320. plan_id=plan_id,
  321. plan_name="文章账号联想冷启",
  322. plan_tag=cls.pool3,
  323. url_list=[i['url'] for i in association_list]
  324. )
  325. if __name__ == '__main__':
  326. CT = ColdStartTask()
  327. CT.sendToColdPool()