|
@@ -2,10 +2,15 @@
|
|
|
@author: luojunhui
|
|
|
"""
|
|
|
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
import time
|
|
|
+import traceback
|
|
|
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
+from applications import log
|
|
|
from applications.const import ToutiaoVideoCrawlerConst
|
|
|
from applications.db import DatabaseConnector
|
|
|
from applications.pipeline import scrape_video_entities_process
|
|
@@ -33,21 +38,40 @@ class CrawlerToutiaoAccountVideos:
|
|
|
"""
|
|
|
get account list
|
|
|
"""
|
|
|
- return
|
|
|
+ sql = f"""
|
|
|
+ select account_id, max_cursor
|
|
|
+ from video_meta_accounts
|
|
|
+ where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS};
|
|
|
+ """
|
|
|
+ account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
|
|
|
+ return account_list
|
|
|
|
|
|
- def crawler_each_account_video_list(self, account_id, max_behot_time=0):
|
|
|
+ def crawler_each_account_video_list(
|
|
|
+ self, account_id: str, max_cursor: int | None, max_behot_time: int = 0
|
|
|
+ ):
|
|
|
"""
|
|
|
- get each account video list
|
|
|
+ account_id: toutiao account id
|
|
|
+ max_cursor: crawler latest cursor for each account
|
|
|
+ max_behot_time: max behot time from toutiao, use to switch to next page
|
|
|
"""
|
|
|
- current_cursor = max_behot_time
|
|
|
has_more = True
|
|
|
+ current_cursor = max_behot_time
|
|
|
+ max_cursor = max_cursor or const.DEFAULT_CURSOR
|
|
|
|
|
|
while has_more:
|
|
|
response = get_toutiao_account_video_list(
|
|
|
account_id=account_id, cookie=cookie, max_behot_time=current_cursor
|
|
|
)
|
|
|
+ if not response:
|
|
|
+ break
|
|
|
+
|
|
|
if response["message"] != "success":
|
|
|
- print("error")
|
|
|
+ log(
|
|
|
+ task="crawler_toutiao_account_videos",
|
|
|
+ function="crawler_toutiao_account_videos",
|
|
|
+ message="get response from toutiao failed",
|
|
|
+ data={"account_id": account_id, "response": response},
|
|
|
+ )
|
|
|
break
|
|
|
|
|
|
video_list = response["data"]
|
|
@@ -58,13 +82,27 @@ class CrawlerToutiaoAccountVideos:
|
|
|
break
|
|
|
|
|
|
max_timestamp_in_this_group = video_list[0]["publish_time"]
|
|
|
- if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
|
|
|
+ if max_timestamp_in_this_group < max_cursor:
|
|
|
break
|
|
|
|
|
|
+ # do crawler each video
|
|
|
crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
|
|
|
for video in crawler_video_list_bar:
|
|
|
- crawler_video_list_bar.set_postfix({"video_id": video["id"]})
|
|
|
- self.crawler_each_video(video)
|
|
|
+ try:
|
|
|
+ crawler_video_list_bar.set_postfix({"video_id": video["id"]})
|
|
|
+ self.crawler_each_video(video)
|
|
|
+ except Exception as e:
|
|
|
+ log(
|
|
|
+ task="crawler_toutiao_account_videos",
|
|
|
+ function="crawler_each_account_video_list",
|
|
|
+ message="crawler each video failed",
|
|
|
+ data={
|
|
|
+ "account_id": account_id,
|
|
|
+ "video_info": video,
|
|
|
+ "error": str(e),
|
|
|
+ "traceback": traceback.format_exc(),
|
|
|
+ },
|
|
|
+ )
|
|
|
|
|
|
if has_more:
|
|
|
time.sleep(const.SLEEP_SECOND)
|
|
@@ -88,7 +126,7 @@ class CrawlerToutiaoAccountVideos:
|
|
|
video_item.add("out_account_id", video_data["user"]["user_id"])
|
|
|
video_item.add("out_account_name", video_data["source"])
|
|
|
video_item.add("publish_timestamp", video_data["publish_time"])
|
|
|
- video_item.add("platform", "toutiao")
|
|
|
+ video_item.add("platform", const.PLATFORM)
|
|
|
video_item.add("read_cnt", video_data["read_count"])
|
|
|
video_item.add("article_url", url)
|
|
|
video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
|
|
@@ -107,36 +145,54 @@ class CrawlerToutiaoAccountVideos:
|
|
|
except Exception as e:
|
|
|
print(e)
|
|
|
|
|
|
- def deal(self):
|
|
|
def update_account_max_cursor(self, account_id: str) -> None:
    """
    Refresh the stored crawl cursor of one account.

    Reads the largest publish_timestamp recorded in
    publish_single_video_source for this account on const.PLATFORM and, when
    one exists, writes it to video_meta_accounts.max_cursor.

    account_id: toutiao account id; matched against out_account_id when
        reading and against account_id when writing
    """
    # NOTE(review): account_id is interpolated into the statement text; switch
    # to bound parameters if db_client.fetch supports them -- TODO confirm.
    select_sql = f"""
        select max(publish_timestamp) as max_cursor
        from publish_single_video_source
        where out_account_id = '{account_id}' and platform = '{const.PLATFORM}';
    """
    response_mysql = self.db_client.fetch(query=select_sql)
    if not response_mysql:
        # Nothing came back, so there is no first row to index into; report it
        # explicitly instead of failing on the subscript below.
        log(
            task="crawler_toutiao_account_videos",
            function="update_account_max_cursor",
            message="fetch max publish timestamp returned no rows",
            data={"account_id": account_id},
        )
        return

    max_publish_timestamp = response_mysql[0][0]
    if not max_publish_timestamp:
        # None (presumably no stored videos for this account) or 0:
        # leave the existing cursor untouched.
        return

    # Plain string on purpose: the values are bound through params, nothing is
    # interpolated into the statement.
    update_sql = """
        update video_meta_accounts
        set max_cursor = %s
        where account_id = %s and platform = %s;
    """
    self.db_client.save(
        query=update_sql,
        params=(max_publish_timestamp, account_id, const.PLATFORM),
    )
|
|
|
+
|
|
|
def deal(self) -> None:
    """
    Class entrance: crawl every account returned by get_account_list().

    For each account row the video list is crawled with the row's max_cursor
    and the account cursor is refreshed afterwards. An exception raised while
    handling one account is logged, and the loop carries on with the next
    account.
    """
    progress = tqdm(self.get_account_list(), desc="crawler toutiao accounts")
    for row in progress:
        account_id, max_cursor = row["account_id"], row["max_cursor"]
        try:
            # show which account is currently being crawled
            progress.set_postfix({"account_id": account_id})
            self.crawler_each_account_video_list(
                account_id=account_id, max_cursor=max_cursor
            )
            self.update_account_max_cursor(account_id)
        except Exception as exc:
            # record the failure, then keep going with the remaining accounts
            failure = {
                "error": str(exc),
                "traceback": traceback.format_exc(),
            }
            log(
                task="crawler_toutiao_account_videos",
                function="deal",
                message=account_id,
                data=failure,
            )
|