|
@@ -34,7 +34,7 @@ class CrawlerAccounts:
|
|
|
self.db_client = DatabaseConnector(db_config=long_articles_config)
|
|
|
self.db_client.connect()
|
|
|
|
|
|
- def get_seed_keys(self):
|
|
|
+ def get_seed_keys(self)->list[dict]:
|
|
|
"""
|
|
|
get search keys from database
|
|
|
"""
|
|
@@ -42,7 +42,7 @@ class CrawlerAccounts:
|
|
|
result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
|
|
|
return result
|
|
|
|
|
|
- def insert_video_into_recommend_table(self, item):
|
|
|
+ def insert_video_into_recommend_table(self, item: dict) -> None:
|
|
|
# whether account exists
|
|
|
final_item = scrape_account_entities_process(item, self.db_client)
|
|
|
if not final_item:
|
|
@@ -97,8 +97,30 @@ class CrawlerAccounts:
|
|
|
fetch_response = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
|
|
|
return fetch_response
|
|
|
|
|
|
def get_video_list_with_score(self, platform: str):
    """
    Read the video_association rows of one platform that have
    score above 0.5 and status 0, ordered by account_name.

    :param platform: value compared against the platform column
    :return: the result of self.db_client.fetch for this query
    """
    # NOTE(review): platform is formatted straight into the SQL text;
    # confirm callers only pass trusted, fixed platform names.
    sql = f"""
        select id, account_name, recommend_video_id, title, read_cnt, duration, seed_account, seed_title
        from video_association
        where score > 0.5 and platform = '{platform}' and status = 0
        order by account_name;
    """
    return self.db_client.fetch(query=sql)
|
|
|
+
|
|
|
def update_video_status(self, video_id_tuple: tuple, ori_status: int, new_status: int) -> int:
    """
    Switch a batch of video_association rows from one status to another.

    Only rows whose id is in ``video_id_tuple`` and whose current status
    equals ``ori_status`` are matched by the update.

    :param video_id_tuple: ids of the rows to update
    :param ori_status: status a row must currently have to be updated
    :param new_status: status written to the matched rows
    :return: the value returned by self.db_client.save (used as the
        affected row count); 0 when no ids are given
    """
    # An empty id collection cannot form a valid "in (...)" list, so there
    # is nothing to send to the database in that case.
    if not video_id_tuple:
        return 0

    # Placeholders are filled by the database client, not by string formatting.
    update_query = """
        update video_association
        set status = %s
        where id in %s and status = %s;
    """
    affected_rows = self.db_client.save(
        query=update_query,
        params=(new_status, video_id_tuple, ori_status),
    )
    return affected_rows
|
|
|
+
|
|
|
|
|
|
-class ChannelAccountCrawler(CrawlerAccounts):
|
|
|
+class ChannelsAccountCrawler(CrawlerAccounts):
|
|
|
"""
|
|
|
crawler channel accounts
|
|
|
strategy:
|
|
@@ -120,23 +142,35 @@ class ChannelAccountCrawler(CrawlerAccounts):
|
|
|
search_video_response = get_channel_account_videos(account_id)
|
|
|
video_list = search_video_response["data"]["object"]
|
|
|
for video in video_list[:5]:
|
|
|
- video_item = Item()
|
|
|
- video_item.add("account_name", account_name)
|
|
|
- video_item.add("account_id", account_id)
|
|
|
- video_item.add("recommend_video_id", video["id"])
|
|
|
- video_item.add("title", video["objectDesc"]["description"])
|
|
|
- video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
|
|
|
- video_item.add("seed_account", "SearchWithOutAccount")
|
|
|
- video_item.add("seed_title", seed_title)
|
|
|
- video_item.add(
|
|
|
- "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
|
|
|
- )
|
|
|
- video_item.add("platform", "sph")
|
|
|
- # check item
|
|
|
- video_item.check(source="association")
|
|
|
+ try:
|
|
|
+ video_item = Item()
|
|
|
+ video_item.add("account_name", account_name)
|
|
|
+ video_item.add("account_id", account_id)
|
|
|
+ video_item.add("recommend_video_id", video["id"])
|
|
|
+ video_item.add("title", video["objectDesc"]["description"])
|
|
|
+ video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
|
|
|
+ video_item.add("seed_account", "SearchWithOutAccount")
|
|
|
+ video_item.add("seed_title", seed_title)
|
|
|
+ video_item.add(
|
|
|
+ "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
|
|
|
+ )
|
|
|
+ video_item.add("platform", "sph")
|
|
|
+ # check item
|
|
|
+ video_item.check(source="association")
|
|
|
|
|
|
- # save to db
|
|
|
- self.insert_video_into_recommend_table(video_item.item)
|
|
|
+ # save to db
|
|
|
+ self.insert_video_into_recommend_table(video_item.item)
|
|
|
+ except Exception as e:
|
|
|
+ log(
|
|
|
+ task="channel account crawler",
|
|
|
+ function="process_each_video",
|
|
|
+ message="create item and save to db failed",
|
|
|
+ data={
|
|
|
+ "video": video,
|
|
|
+ "error": str(e),
|
|
|
+ "traceback": traceback.format_exc()
|
|
|
+ }
|
|
|
+ )
|
|
|
|
|
|
def search_by_title_from_database(self, title: str) -> None:
|
|
|
"""
|
|
@@ -244,4 +278,4 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
|
|
|
|
|
|
class HaoKanAccountCrawler(CrawlerAccounts):
    """
    Placeholder subclass of CrawlerAccounts; its deal method is not implemented.
    """

    def deal(self):
        """
        Not implemented; always raises NotImplementedError.
        """
        raise NotImplementedError
|