"""
@author: luojunhui
"""
- import datetime
- import json
- from tqdm import tqdm
- from applications import AIDTApi, DeNetMysql, PQMySQL, Functions
- from config import poolLevelConfig, cateMap, coldPoolArticlesNum, accountBaseInfo
class ColdStartTask(object):
    """
    Cold-start distribution task.

    Selects top articles per category from ``cold_start_article_pool``,
    scores their titles, stores a per-account pre-distribution ranking and
    finally pushes the selected article URLs into a crawler plan.
    """

    # Shared clients/helpers, created once when the class body is executed.
    AidApi = AIDTApi()
    DeMysql = DeNetMysql()
    PqMysql = PQMySQL()
    Fun = Functions()
    pool3 = poolLevelConfig['3']

    @classmethod
    def generate_account_dict(cls):
        """
        Build the account lookup from ``accountBaseInfo``.

        :return: dict mapping each entry's ``accountName`` to its ``ghId``
        """
        account_dict = {}
        for key in accountBaseInfo:
            info = accountBaseInfo[key]
            account_dict[info['accountName']] = info['ghId']
        return account_dict

    @classmethod
    def getTopArticles(cls, category, limit_count):
        """
        Fetch the top articles of one category from the cold-start pool.

        Rows are ordered by ``view_count`` and then ``publish_time_stamp``,
        both descending.

        :param category: category value matched against the ``category`` column
        :param limit_count: maximum number of rows to fetch
        :return: whatever ``PqMysql.select`` returns; callers index each row
            as (content_channel_id, content_link, title)
        """
        # NOTE(review): the query is built by string interpolation. The only
        # caller visible here passes values from a hard-coded category list;
        # do not pass untrusted input without switching to a parameterised
        # query.
        sql = f"""
            select content_channel_id, content_link, title
            from cold_start_article_pool
            where category = '{category}'
            order by view_count DESC, publish_time_stamp DESC
            limit {limit_count};
        """
        return cls.PqMysql.select(sql)

    @classmethod
    def getAccountScoreList(cls, title_list, account_name):
        """
        Score a list of titles against a single account.

        :param title_list: list of article titles
        :param account_name: account name used as the key in the score result
        :return: the ``score_list`` entry returned for ``account_name``
        """
        score_response = cls.Fun.getTitleScore(
            title_list=title_list,
            account_name=account_name
        )
        return score_response[account_name]['score_list']

    @classmethod
    def splitToAccount(cls, obj_list):
        """
        Rank the articles for every account and store each account's top 30.

        For each account, the titles are scored, the (article id, score)
        pairs are sorted by score descending, and the first 30 pairs are
        inserted as JSON into ``article_pre_distribute_account``.

        :param obj_list: list of dicts carrying at least ``id`` and ``title``
        :return: None
        """
        account_dict = cls.generate_account_dict()
        title_list = [obj['title'] for obj in obj_list]

        # Loop-invariant values: one date string and one statement per run.
        date_str = datetime.datetime.today().strftime("%Y-%m-%d")
        insert_sql = """
            INSERT INTO article_pre_distribute_account
                (gh_id, date, article_list)
            VALUES
                (%s, %s, %s);
        """

        for account, gh_id in tqdm(account_dict.items()):
            score_list = cls.getAccountScoreList(
                title_list=title_list,
                account_name=account
            )
            scored = [
                (obj['id'], score)
                for obj, score in zip(obj_list, score_list)
            ]
            scored.sort(key=lambda pair: pair[1], reverse=True)
            try:
                # Use the shared client instance, consistent with
                # getTopArticles. Calling ``update`` on the PQMySQL class
                # itself omits ``self`` if it is an instance method, and the
                # resulting error was swallowed by the handler below.
                cls.PqMysql.update(
                    sql=insert_sql,
                    params=(
                        gh_id,
                        date_str,
                        json.dumps(scored[:30], ensure_ascii=False)
                    )
                )
            except Exception as e:
                # Best effort: report the failure and continue with the
                # remaining accounts.
                print("插入出现问题----{}".format(e))
        print("成功更新完成")

    @classmethod
    def findArticlesDaily(cls):
        """
        Collect candidate articles for the six configured categories.

        For each category, up to ``coldPoolArticlesNum * cateMap[category]``
        articles are fetched (ratio defaults to 0.1 when the category is not
        in ``cateMap``) and their titles are scored against the account
        "指尖奇文". Only articles scoring at least 0.35 are returned.

        :return: list of dicts with keys ``id``, ``url``, ``title``,
            ``cate`` and ``score``
        """
        category_list = [
            "军事政法",
            "健康养生",
            "宗教历史",
            "情感生活",
            "娱乐八卦",
            "新闻媒体"
        ]
        candidates = []
        for category in tqdm(category_list):
            print("{} is processing......".format(category))
            limit_count = coldPoolArticlesNum * cateMap.get(category, 0.1)
            article_tuple = cls.getTopArticles(category, int(limit_count))
            if not article_tuple:
                # Nothing fetched for this category: skip the scoring call
                # rather than sending it an empty title list.
                continue
            title_list = [article[2] for article in article_tuple]
            score_list = cls.Fun.getTitleScore(title_list, "指尖奇文")['指尖奇文']['score_list']
            for article, score in zip(article_tuple, score_list):
                candidates.append({
                    "id": article[0],
                    "url": article[1],
                    "title": article[2],
                    "cate": category,
                    "score": score
                })
        return [item for item in candidates if item['score'] >= 0.35]

    @classmethod
    def sendToColdPool(cls, plan_id, plan_name, plan_tag):
        """
        Run the full pipeline and push the selected URLs to a crawler plan.

        :param plan_id: forwarded to ``updateArticleIntoCrawlerPlan``
        :param plan_name: forwarded to ``updateArticleIntoCrawlerPlan``
        :param plan_tag: forwarded to ``updateArticleIntoCrawlerPlan``
        :return: None
        """
        # Fetch the candidate articles for the six categories.
        target_article_list = cls.findArticlesDaily()
        # Pre-distribute the candidates to the accounts.
        cls.splitToAccount(target_article_list)
        # Original author's note: apply the per-category ratio once more
        # (not implemented in the code visible here).
        cls.AidApi.updateArticleIntoCrawlerPlan(
            plan_id=plan_id,
            plan_name=plan_name,
            plan_tag=plan_tag,
            url_list=[item['url'] for item in target_article_list]
        )
if __name__ == '__main__':
    # Script entry point: gather the crawler-plan arguments, then run the
    # whole cold-start pipeline once.
    plan_arguments = {
        "plan_id": None,
        "plan_name": "冷启池子--0729--Monday--分品类抓取--6个品类",
        "plan_tag": "autoArticlePoolLevel1",
    }
    cold_start_task = ColdStartTask()
    cold_start_task.sendToColdPool(**plan_arguments)
|