""" @author: luojunhui """ import time import numpy as np from pymysql.cursors import DictCursor from tqdm import tqdm from applications import WeixinSpider from applications.api import similarity_between_title_list from applications.const import ColdStartTaskConst from applications.db import DatabaseConnector from applications.functions import Functions from applications.utils import get_inner_account_set from applications.utils import whether_title_sensitive from config import long_articles_config spider = WeixinSpider() functions = Functions() const = ColdStartTaskConst() class ArticleAssociationCrawler(object): """ article association crawler task """ def __init__(self): self.db_client = DatabaseConnector(db_config=long_articles_config) self.db_client.connect() self.inner_account_set = get_inner_account_set() def get_seed_url_list(self): """ 获取种子url列表 """ sql = f""" select gh_id, title, link from datastat_sort_strategy where date_str > '20250220' and view_count > 1000 and read_rate > 1.3 and type = 9 order by read_rate desc limit 30; """ seed_article_list = self.db_client.fetch(query=sql, cursor_type=DictCursor) return seed_article_list def get_level_up_title_list(self): """ 获取晋级文章标题列表 status: 1 表示文章已经溯源完成 deleted: 0 表示文章正常 level = 'autoArticlePoolLevel1' 表示头条 """ sql = f""" select distinct title from article_pool_promotion_source where level = 'autoArticlePoolLevel1' and status = 1 and deleted = 0; """ mysql_response = self.db_client.fetch(query=sql) title_list = [i[0] for i in mysql_response] return title_list def get_recommend_url_list_with_depth(self, seed_url, source_title, source_account, base_title_list, depth=1): """ @param seed_url: good url from data_sort_strategy @param depth: association depth @param source_title: article title @param source_account: article account """ if depth > const.ARTICLE_ASSOCIATION_MAX_DEPTH: return res = spider.get_recommend_articles(content_link=seed_url) related_articles = res['data']['data']['list'] if related_articles: title_list = [i['title'] for i in related_articles] similarity_array = similarity_between_title_list(title_list, base_title_list) recommend_articles = [] for index, score_list in enumerate(similarity_array): sorted_score_list = sorted(score_list) percent_threshold_score = np.percentile(sorted_score_list, const.PERCENT_THRESHOLD) if percent_threshold_score < const.CORRELATION_THRESHOLD: continue else: article_obj = related_articles[index] article_obj['score'] = percent_threshold_score recommend_articles.append(article_obj) recommend_process_bar = tqdm(recommend_articles, desc="save recommend articles") for article in recommend_process_bar: obj = { "title": article['title'], "url": article['url'], "gh_id": article['username'], "index": article['idx'], "send_time": article['send_time'], "read_cnt": article['read_num'], "depth": depth, "source_article_title": source_title, "source_account": source_account, } self.insert_recommend_article(obj) recommend_process_bar.set_postfix({"title": article['title'], "depth": depth}) self.get_recommend_url_list_with_depth( seed_url=obj["url"], source_title=obj["title"], source_account=obj["gh_id"], base_title_list=base_title_list, depth=depth + 1 ) else: return def insert_recommend_article(self, obj): """ insert recommend article """ # whether account inside if obj['gh_id'] in self.inner_account_set: return # whether article title exists title = obj['title'] select_sql = "select article_id from crawler_meta_article where title = %s;" res = self.db_client.fetch(query=select_sql, params=(title,)) if 
res: return # whether title sensitive title_sensitivity = const.TITLE_SENSITIVE if whether_title_sensitive(title) else const.TITLE_NOT_SENSITIVE # insert this article insert_sql = f""" insert into crawler_meta_article (platform, mode, category, out_account_id, article_index, title, link, read_cnt, publish_time, crawler_time, status, unique_index, source_article_title, source_account, title_sensitivity) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """ self.db_client.save( query=insert_sql, params=( "weixin", "recommend", "article_association", obj["gh_id"], obj["index"], obj["title"], obj["url"], obj["read_cnt"], obj["send_time"], int(time.time()), const.DEFAULT_ARTICLE_STATUS, functions.generateGzhId(obj["url"]), obj['source_article_title'], obj['source_account'], title_sensitivity ) ) def deal(self): """ class entrance """ seed_article_list = self.get_seed_url_list() deal_bar = tqdm(seed_article_list, desc="article association crawler") base_title_list = self.get_level_up_title_list() for article in deal_bar: try: self.get_recommend_url_list_with_depth( seed_url=article["link"], source_title=article["title"], source_account=article["gh_id"], base_title_list=base_title_list ) deal_bar.set_postfix({"article_title": article["title"]}) except Exception as e: print(e) print(article) continue
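

# Minimal usage sketch, assuming the module is run directly rather than from a
# scheduler: instantiate the crawler and perform one full association pass over
# the seed articles. Requires the MySQL config in config.long_articles_config
# to be reachable.
if __name__ == "__main__":
    article_association_crawler = ArticleAssociationCrawler()
    article_association_crawler.deal()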