|
@@ -8,6 +8,7 @@ from tqdm import tqdm
|
|
|
|
|
|
from applications import AIDTApi, DeNetMysql, PQMySQL, Functions, WeixinSpider
|
|
|
from config import cateMap, coldPoolArticlesNum, accountBaseInfo
|
|
|
+from stratrgy.distribution import ArticleDistribution
|
|
|
|
|
|
|
|
|
class ColdStartTask(object):
|
|
@@ -19,6 +20,7 @@ class ColdStartTask(object):
|
|
|
PqMysql = PQMySQL()
|
|
|
Fun = Functions()
|
|
|
Spider = WeixinSpider()
|
|
|
+ D = ArticleDistribution()
|
|
|
pool3 = "autoArticlePoolLevel3"
|
|
|
|
|
|
@classmethod
|
|
@@ -158,16 +160,17 @@ class ColdStartTask(object):
|
|
|
|
|
|
for account in tqdm(account_article_dict):
|
|
|
date_str = datetime.datetime.today().strftime("%Y-%m-%d")
|
|
|
- insert_sql = f"""
|
|
|
- INSERT INTO article_pre_distribute_account
|
|
|
- (gh_id, date, article_list)
|
|
|
- VALUES
|
|
|
- (%s, %s, %s);
|
|
|
- """
|
|
|
- try:
|
|
|
- PQMySQL.update(sql=insert_sql, params=(account, date_str, json.dumps(account_article_dict[account], ensure_ascii=False)))
|
|
|
- except Exception as e:
|
|
|
- print("插入出现问题----{}".format(e))
|
|
|
+ print(account, date_str, json.dumps(account_article_dict[account], ensure_ascii=False))
|
|
|
+ # insert_sql = f"""
|
|
|
+ # INSERT INTO article_pre_distribute_account
|
|
|
+ # (gh_id, date, article_list)
|
|
|
+ # VALUES
|
|
|
+ # (%s, %s, %s);
|
|
|
+ # """
|
|
|
+ # try:
|
|
|
+ # PQMySQL.update(sql=insert_sql, params=(account, date_str, json.dumps(account_article_dict[account], ensure_ascii=False)))
|
|
|
+ # except Exception as e:
|
|
|
+ # print("插入出现问题----{}".format(e))
|
|
|
|
|
|
print("成功更新完成")
|
|
|
|
|
@@ -179,11 +182,11 @@ class ColdStartTask(object):
|
|
|
"""
|
|
|
category_list = [
|
|
|
"军事政法",
|
|
|
- "健康养生",
|
|
|
+ # "健康养生",
|
|
|
"宗教历史",
|
|
|
- "情感生活",
|
|
|
- "娱乐八卦",
|
|
|
- "新闻媒体",
|
|
|
+ # "情感生活",
|
|
|
+ # "娱乐八卦",
|
|
|
+ # "新闻媒体",
|
|
|
]
|
|
|
L = []
|
|
|
for category in tqdm(category_list):
|
|
@@ -224,13 +227,14 @@ class ColdStartTask(object):
|
|
|
获取相关文章
|
|
|
:return:
|
|
|
"""
|
|
|
- target_num = int(0.4 * coldPoolArticlesNum)
|
|
|
+ # target_num = int(0.8 * coldPoolArticlesNum)
|
|
|
sql = f"""
|
|
|
- select id, ori_account_name, association_url, association_title, url_md5
|
|
|
+ select id, publish_timestamp, title, link, title_score, url_md5
|
|
|
from association_articles
|
|
|
- where status = 1
|
|
|
- order by read_count DESC
|
|
|
- limit {target_num};"""
|
|
|
+ where status = 1 and content_length > 500
|
|
|
+ order by publish_timestamp
|
|
|
+ DESC limit 10000;
|
|
|
+ """
|
|
|
temp_list = cls.PqMysql.select(sql)
|
|
|
id_tuple = tuple([i[0] for i in temp_list])
|
|
|
update_sql = f"""
|
|
@@ -239,24 +243,37 @@ class ColdStartTask(object):
|
|
|
where id in %s
|
|
|
"""
|
|
|
cls.PqMysql.update(sql=update_sql, params=(0, id_tuple))
|
|
|
+ # url_md5去重
|
|
|
+ L = {}
|
|
|
+ for line in temp_list:
|
|
|
+ key = line[-1]
|
|
|
+ if L.get(key):
|
|
|
+ L[key].append(list(line))
|
|
|
+ else:
|
|
|
+ L[key] = [list(line)]
|
|
|
+
|
|
|
+ LL = []
|
|
|
+ for key in L:
|
|
|
+ value_list = L[key]
|
|
|
+ sorted_k = sorted(value_list, reverse=True, key=lambda x: (x[1], x[4]))
|
|
|
+ LL.append(sorted_k[0])
|
|
|
article_list = []
|
|
|
- for i in tqdm(temp_list):
|
|
|
+ LL = sorted(LL, reverse=True, key=lambda x: x[1])
|
|
|
+ for i in tqdm(LL[:int(680 * 0.8)]):
|
|
|
try:
|
|
|
o = {
|
|
|
- "related_account_name": i[1],
|
|
|
- "url": i[2],
|
|
|
- "title": i[3],
|
|
|
- "url_md5": i[4],
|
|
|
- # "id": i[4]
|
|
|
- "id": cls.Spider.get_article_text(i[2])['data']['data']['channel_content_id']
|
|
|
+ "url": i[3],
|
|
|
+ "title": i[2],
|
|
|
+ "url_md5": i[5],
|
|
|
+ "id": i[3]
|
|
|
+ # "id": cls.Spider.get_article_text(i[3])['data']['data']['channel_content_id']
|
|
|
}
|
|
|
except:
|
|
|
o = {
|
|
|
- "related_account_name": i[1],
|
|
|
- "url": i[2],
|
|
|
- "title": i[3],
|
|
|
- "url_md5": i[4],
|
|
|
- "id": i[4]
|
|
|
+ "url": i[3],
|
|
|
+ "title": i[2],
|
|
|
+ "url_md5": i[5],
|
|
|
+ "id": i[3]
|
|
|
}
|
|
|
article_list.append(o)
|
|
|
return article_list
|
|
@@ -264,88 +281,89 @@ class ColdStartTask(object):
|
|
|
@classmethod
|
|
|
def sendToColdPool(cls, plan_id=None):
|
|
|
"""
|
|
|
- 把文章send至第四层
|
|
|
+ 把文章send至第三层
|
|
|
:return:
|
|
|
"""
|
|
|
# 获取6个品类的数据
|
|
|
association_list = cls.findAssociationArticlesDaily()
|
|
|
- category_list = cls.findCategoryArticlesDaily()
|
|
|
- d_list = category_list + association_list
|
|
|
- # 预分配账号
|
|
|
- cls.splitCategoryToAccount(d_list)
|
|
|
-
|
|
|
- try:
|
|
|
- army = [i for i in category_list if i['cate'] == '军事政法']
|
|
|
- cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
- plan_id=plan_id,
|
|
|
- plan_name="军事政法类冷启",
|
|
|
- plan_tag=cls.pool3,
|
|
|
- url_list=[i['url'] for i in army]
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- print("error--{}".format(e))
|
|
|
-
|
|
|
- try:
|
|
|
- history = [i for i in category_list if i['cate'] == '宗教历史']
|
|
|
- cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
- plan_id=plan_id,
|
|
|
- plan_name="宗教历史类冷启",
|
|
|
- plan_tag=cls.pool3,
|
|
|
- url_list=[i['url'] for i in history]
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- print("error--{}".format(e))
|
|
|
-
|
|
|
- try:
|
|
|
- news = [i for i in category_list if i['cate'] == '新闻媒体']
|
|
|
- cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
- plan_id=plan_id,
|
|
|
- plan_name="新闻媒体类冷启",
|
|
|
- plan_tag=cls.pool3,
|
|
|
- url_list=[i['url'] for i in news]
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- print("error--{}".format(e))
|
|
|
-
|
|
|
- try:
|
|
|
- life = [i for i in category_list if i['cate'] == '情感生活']
|
|
|
- cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
- plan_id=plan_id,
|
|
|
- plan_name="生活情感类冷启",
|
|
|
- plan_tag=cls.pool3,
|
|
|
- url_list=[i['url'] for i in life]
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- print("error--{}".format(e))
|
|
|
-
|
|
|
- try:
|
|
|
- healthy = [i for i in category_list if i['cate'] == '健康养生']
|
|
|
- cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
- plan_id=plan_id,
|
|
|
- plan_name="健康养生类冷启",
|
|
|
- plan_tag=cls.pool3,
|
|
|
- url_list=[i['url'] for i in healthy]
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- print("error--{}".format(e))
|
|
|
-
|
|
|
- try:
|
|
|
- fun = [i for i in category_list if i['cate'] == '娱乐八卦']
|
|
|
- cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
- plan_id=plan_id,
|
|
|
- plan_name="娱乐八卦类冷启",
|
|
|
- plan_tag=cls.pool3,
|
|
|
- url_list=[i['url'] for i in fun]
|
|
|
- )
|
|
|
- except Exception as e:
|
|
|
- print("error--{}".format(e))
|
|
|
-
|
|
|
- cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
- plan_id=plan_id,
|
|
|
- plan_name="文章账号联想冷启",
|
|
|
- plan_tag=cls.pool3,
|
|
|
- url_list=[i['url'] for i in association_list]
|
|
|
- )
|
|
|
+ cls.D.association_split(association_list)
|
|
|
+ # category_list = cls.findCategoryArticlesDaily()
|
|
|
+ # d_list = category_list + association_list
|
|
|
+ # # # 预分配账号
|
|
|
+ # cls.splitCategoryToAccount(association_list)
|
|
|
+ # #
|
|
|
+ # try:
|
|
|
+ # army = [i for i in category_list if i['cate'] == '军事政法']
|
|
|
+ # cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
+ # plan_id=plan_id,
|
|
|
+ # plan_name="军事政法类冷启",
|
|
|
+ # plan_tag=cls.pool3,
|
|
|
+ # url_list=[i['url'] for i in army]
|
|
|
+ # )
|
|
|
+ # except Exception as e:
|
|
|
+ # print("error--{}".format(e))
|
|
|
+ #
|
|
|
+ # try:
|
|
|
+ # history = [i for i in category_list if i['cate'] == '宗教历史']
|
|
|
+ # cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
+ # plan_id=plan_id,
|
|
|
+ # plan_name="宗教历史类冷启",
|
|
|
+ # plan_tag=cls.pool3,
|
|
|
+ # url_list=[i['url'] for i in history]
|
|
|
+ # )
|
|
|
+ # except Exception as e:
|
|
|
+ # print("error--{}".format(e))
|
|
|
+ # #
|
|
|
+ # # try:
|
|
|
+ # # news = [i for i in category_list if i['cate'] == '新闻媒体']
|
|
|
+ # # cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
+ # # plan_id=plan_id,
|
|
|
+ # # plan_name="新闻媒体类冷启",
|
|
|
+ # # plan_tag=cls.pool3,
|
|
|
+ # # url_list=[i['url'] for i in news]
|
|
|
+ # # )
|
|
|
+ # # except Exception as e:
|
|
|
+ # # print("error--{}".format(e))
|
|
|
+ # #
|
|
|
+ # # try:
|
|
|
+ # # life = [i for i in category_list if i['cate'] == '情感生活']
|
|
|
+ # # cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
+ # # plan_id=plan_id,
|
|
|
+ # # plan_name="生活情感类冷启",
|
|
|
+ # # plan_tag=cls.pool3,
|
|
|
+ # # url_list=[i['url'] for i in life]
|
|
|
+ # # )
|
|
|
+ # # except Exception as e:
|
|
|
+ # # print("error--{}".format(e))
|
|
|
+ # #
|
|
|
+ # # try:
|
|
|
+ # # healthy = [i for i in category_list if i['cate'] == '健康养生']
|
|
|
+ # # cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
+ # # plan_id=plan_id,
|
|
|
+ # # plan_name="健康养生类冷启",
|
|
|
+ # # plan_tag=cls.pool3,
|
|
|
+ # # url_list=[i['url'] for i in healthy]
|
|
|
+ # # )
|
|
|
+ # # except Exception as e:
|
|
|
+ # # print("error--{}".format(e))
|
|
|
+ # #
|
|
|
+ # # try:
|
|
|
+ # # fun = [i for i in category_list if i['cate'] == '娱乐八卦']
|
|
|
+ # # cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
+ # # plan_id=plan_id,
|
|
|
+ # # plan_name="娱乐八卦类冷启",
|
|
|
+ # # plan_tag=cls.pool3,
|
|
|
+ # # url_list=[i['url'] for i in fun]
|
|
|
+ # # )
|
|
|
+ # # except Exception as e:
|
|
|
+ # # print("error--{}".format(e))
|
|
|
+ # #
|
|
|
+ # cls.AidApi.updateArticleIntoCrawlerPlan(
|
|
|
+ # plan_id=plan_id,
|
|
|
+ # plan_name="文章账号联想冷启",
|
|
|
+ # plan_tag=cls.pool3,
|
|
|
+ # url_list=[i['url'] for i in association_list]
|
|
|
+ # )
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|