@@ -18,10 +18,13 @@ from applications.utils import Item
 from applications.utils import insert_into_candidate_account_pool_table
 from coldStartTasks.crawler.baidu import haokan_search_videos
 from coldStartTasks.crawler.baidu.baidu_spider import baidu_account_video_crawler
-from coldStartTasks.crawler.toutiao import get_associated_recommendation
-from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
 from coldStartTasks.crawler.channels import search_in_wechat_channel
 from coldStartTasks.crawler.channels import get_channel_account_videos
+from coldStartTasks.crawler.toutiao import get_associated_recommendation
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
+from coldStartTasks.crawler.wechat import get_article_detail
+from coldStartTasks.crawler.wechat import get_article_list_from_account
+from coldStartTasks.crawler.wechat import get_source_account_from_article
 from config import apolloConfig, long_articles_config
 
 config = apolloConfig()
@@ -344,3 +347,113 @@ class HaoKanAccountCrawler(CrawlerAccounts):
                         "traceback": traceback.format_exc(),
                     },
                 )
+
+
+class GzhAccountCrawler(CrawlerAccounts):
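+    """
+    crawl WeChat official accounts ("gzh"): trace reposted articles back to
+    their source accounts and store them as candidate accounts
+    """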
+
+    def get_task_list(self):
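+        # source_account = 1 marks rows whose source account has not been
+        # extracted yet; deal() resets the flag to 0 after each batch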
+        fetch_query = """
+            select id, article_url
+            from publish_single_video_source
+            where source_account = 1 and platform = 'gzh' limit 10;
+        """
+        task_list = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        return task_list
+
+    def process_official_account(self, account_name, account_id):
+        """
+        fetch an account's first page of articles and save it as a candidate account
+        """
+        account_item = Item()
+        account_item.add("account_name", account_name)
+        account_item.add("account_id", account_id)
+        account_item.add("platform", "gzh")
+        account_item.add("crawler_date", datetime.datetime.today().strftime("%Y-%m-%d"))
+
+        # fetch the first page of the account's article list
+        fetch_response = get_article_list_from_account(account_id=account_id, index=None)
+        msg_list = fetch_response["data"]["data"]
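+        # each msg bundles several articles; stop collecting once ~10 titles are gathered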
+        title_list = []
+        for msg in msg_list:
+            sub_title_list = [i['Title'] for i in msg['AppMsg']['DetailInfo']]
+            if len(title_list) > 10:
+                break
+            title_list += sub_title_list
+
+        title_list_str = json.dumps(title_list, ensure_ascii=False)
+        account_item.add("title_list", title_list_str)
+
+        # check item
+        account_item.check(source="candidate_account")
+
+        # insert into database via the base-class helper
+        self.insert_video_into_recommend_table(account_item.item)
+
+    def extract_account_from_article_link(self, article_link):
+        """
+        try to extract the source account from an article link
+        """
+        # check whether the article is original
+        article_detail = get_article_detail(article_link)
+        is_original = article_detail["data"]["data"]["is_original"]
+
+        if is_original:
+            # original articles have no upstream source account
+            return
+        # extract source account
+        source_account = get_source_account_from_article(article_link)
+        if not source_account:
+            return
+        account_name = source_account['name']
+        gh_id = source_account['gh_id']
+        self.process_official_account(account_name, gh_id)
+
+    def update_crawler_article_status(self, article_id_tuple: tuple):
+        """
+        mark crawled articles as processed
+        """
+        update_query = """
+            update publish_single_video_source
+            set source_account = %s
+            where id in %s;
+        """
+        affected_rows = self.db_client.save(
+            query=update_query, params=(0, article_id_tuple)
+        )
+        return affected_rows
+
+    def deal(self):
+        task_list = self.get_task_list()
+        task_id_list = []
+        for crawler_article_obj in tqdm(task_list, desc="crawler article list"):
+            article_url = crawler_article_obj['article_url']
+            article_id = crawler_article_obj['id']
+            task_id_list.append(int(article_id))
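+            # ids are recorded before the attempt, so articles that fail are
+            # still marked as processed and will not be retried endlessly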
+            try:
+                self.extract_account_from_article_link(article_url)
+            except Exception as e:
+                log(
+                    task="gzh_account_crawler",
+                    function="extract_account_from_article_link",
+                    message="extract account from article link failed",
+                    data={
+                        "article_url": article_url,
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                    },
+                )
+
+        if task_id_list:
+            article_id_tuple = tuple(task_id_list)
+            affected_rows = self.update_crawler_article_status(article_id_tuple)
+            print(affected_rows)