|
@@ -1,9 +1,13 @@
|
|
|
"""
|
|
|
@author: luojunhui
|
|
|
"""
|
|
|
+import datetime
|
|
|
+import json
|
|
|
+
|
|
|
+from tqdm import tqdm
|
|
|
|
|
|
from applications import AIDTApi, DeNetMysql, PQMySQL, Functions
|
|
|
-from config import poolLevelConfig, cateMap, coldPoolArticlesNum
|
|
|
+from config import poolLevelConfig, cateMap, coldPoolArticlesNum, accountBaseInfo
|
|
|
|
|
|
|
|
|
class ColdStartTask(object):
|
|
@@ -14,7 +18,20 @@ class ColdStartTask(object):
|
|
|
DeMysql = DeNetMysql()
|
|
|
PqMysql = PQMySQL()
|
|
|
Fun = Functions()
|
|
|
- pool4 = poolLevelConfig['1']
|
|
|
+ pool3 = poolLevelConfig['3']
|
|
|
+
|
|
|
+ @classmethod
|
|
|
def generate_account_dict(cls):
    """
    Build a lookup table of the configured accounts.

    Walks every key of ``accountBaseInfo`` and maps the entry's
    ``accountName`` to its ``ghId``. If two entries share the same
    account name, the later one wins.

    :return: dict mapping account name -> gh_id
    """
    return {
        accountBaseInfo[info_key]['accountName']: accountBaseInfo[info_key]['ghId']
        for info_key in accountBaseInfo
    }
|
|
|
|
|
|
@classmethod
|
|
|
def getTopArticles(cls, category, limit_count):
|
|
@@ -23,7 +40,7 @@ class ColdStartTask(object):
|
|
|
:return:
|
|
|
"""
|
|
|
sql = f"""
|
|
|
- select content_id, content_link, title
|
|
|
+ select content_channel_id, content_link, title
|
|
|
from cold_start_article_pool
|
|
|
where category = '{category}'
|
|
|
order by view_count DESC, publish_time_stamp DESC
|
|
@@ -33,14 +50,61 @@ class ColdStartTask(object):
|
|
|
return result
|
|
|
|
|
|
@classmethod
|
|
|
- def computeScore(cls):
|
|
|
def getAccountScoreList(cls, title_list, account_name):
    """
    Fetch the scores of a batch of titles for one account.

    Calls ``cls.Fun.getTitleScore`` and pulls the ``score_list`` entry
    stored under ``account_name`` out of its response.

    :param title_list: titles forwarded to the scoring helper
    :param account_name: account the titles are scored against; also the
        key used to look up the result in the response
    :return: the ``score_list`` value reported for ``account_name``
    """
    response = cls.Fun.getTitleScore(
        title_list=title_list,
        account_name=account_name
    )
    account_result = response[account_name]
    return account_result['score_list']
|
|
|
+
|
|
|
+ @classmethod
|
|
|
def splitToAccount(cls, obj_list):
    """
    Pre-distribute articles to every configured account.

    For each account returned by ``generate_account_dict`` the titles of
    ``obj_list`` are scored through ``getAccountScoreList``. The articles
    are then ranked by score (highest first) and the top 30
    ``(article id, score)`` pairs are written as a JSON array into
    ``article_pre_distribute_account`` together with the account's gh_id
    and today's date.

    A failing insert is reported on stdout and does not stop the loop.

    :param obj_list: list of article dicts; each one must provide the keys
        ``'title'`` and ``'id'``
    :return: None
    """
    account_dict = cls.generate_account_dict()
    title_list = [article['title'] for article in obj_list]

    # Loop invariants: build the date stamp and the statement once, so a
    # run that crosses midnight cannot write rows under two different dates.
    date_str = datetime.datetime.today().strftime("%Y-%m-%d")
    insert_sql = """
        INSERT INTO article_pre_distribute_account
            (gh_id, date, article_list)
        VALUES
            (%s, %s, %s);
    """

    for account_name, gh_id in tqdm(account_dict.items()):
        score_list = cls.getAccountScoreList(
            title_list=title_list,
            account_name=account_name
        )
        # Scores are paired with articles by position.
        # NOTE(review): assumes the scoring helper returns one score per
        # title, in the same order as title_list -- confirm.
        ranked_articles = [
            (article['id'], score)
            for article, score in zip(obj_list, score_list)
        ]
        ranked_articles.sort(key=lambda pair: pair[1], reverse=True)

        try:
            # Go through the shared class-level PQMySQL instance. Calling
            # ``update`` on the bare class would leave ``self`` unbound if
            # it is a regular instance method, and the resulting TypeError
            # would be swallowed by the handler below.
            cls.PqMysql.update(
                sql=insert_sql,
                params=(
                    gh_id,
                    date_str,
                    json.dumps(ranked_articles[:30], ensure_ascii=False)
                )
            )
        except Exception as e:
            # Best effort: report the failure and carry on with the next account.
            print("插入出现问题----{}".format(e))

    print("成功更新完成")
|
|
|
+
|
|
|
+ @classmethod
|
|
|
+ def findArticlesDaily(cls):
|
|
|
"""
|
|
|
和每个账号计算相关性分数
|
|
|
:return:
|
|
|
"""
|
|
|
- category_list = ["军事政法", "健康养生", "宗教历史"]
|
|
|
+ category_list = [
|
|
|
+ "军事政法",
|
|
|
+ "健康养生",
|
|
|
+ "宗教历史",
|
|
|
+ "情感生活",
|
|
|
+ "娱乐八卦",
|
|
|
+ "新闻媒体"
|
|
|
+ ]
|
|
|
L = []
|
|
|
- for category in category_list:
|
|
|
+ for category in tqdm(category_list):
|
|
|
+ print("{} is processing......".format(category))
|
|
|
limit_count = coldPoolArticlesNum * cateMap.get(category, 0.1)
|
|
|
article_tuple = cls.getTopArticles(category, int(limit_count))
|
|
|
title_list = [article[2] for article in article_tuple]
|
|
@@ -63,17 +127,17 @@ class ColdStartTask(object):
|
|
|
把文章send至第四层
|
|
|
:return:
|
|
|
"""
|
|
|
- result = cls.computeScore()
|
|
|
- army = [i for i in result if i['cate'] == '军事政法']
|
|
|
- healthy = [i for i in result if i['cate'] == '健康养生']
|
|
|
- history = [i for i in result if i['cate'] == '宗教历史']
|
|
|
- url_list = [i['url'] for i in result]
|
|
|
+ # 获取6个品类的数据
|
|
|
+ target_article_list = cls.findArticlesDaily()
|
|
|
+ # 预分配账号
|
|
|
+ cls.splitToAccount(target_article_list)
|
|
|
+
|
|
|
# 再加一次配比,每个品类的数量占比
|
|
|
cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
plan_id=plan_id,
|
|
|
plan_name=plan_name,
|
|
|
plan_tag=plan_tag,
|
|
|
- url_list=[i['url'] for i in result]
|
|
|
+ url_list=[i['url'] for i in target_article_list]
|
|
|
)
|
|
|
|
|
|
|