|
@@ -34,6 +34,14 @@ class CrawlerAccounts:
|
|
|
self.db_client = DatabaseConnector(db_config=long_articles_config)
|
|
|
self.db_client.connect()
|
|
|
|
|
|
def get_seed_keys(self):
    """
    Fetch the seed titles used as search keys.

    Runs a fixed query against ``article_pool_promotion_source`` (rows with
    ``status = 1`` and ``deleted = 0``, ordered by ``level``, capped at 100)
    via ``self.db_client.fetch`` with ``DictCursor`` and hands the fetch
    result back unchanged.
    """
    seed_title_sql = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
    return self.db_client.fetch(query=seed_title_sql, cursor_type=DictCursor)
|
|
|
+
|
|
|
def insert_video_into_recommend_table(self, item):
|
|
|
# whether account exists
|
|
|
final_item = scrape_account_entities_process(item, self.db_client)
|
|
@@ -41,18 +49,20 @@ class CrawlerAccounts:
|
|
|
return
|
|
|
else:
|
|
|
# save to db
|
|
|
- insert_into_associated_recommendation_table(db_client=self.db_client, associated_recommendation_item=final_item)
|
|
|
+ insert_into_associated_recommendation_table(
|
|
|
+ db_client=self.db_client, associated_recommendation_item=final_item
|
|
|
+ )
|
|
|
|
|
|
- def save_similarity_score_to_table(
|
|
|
- self, association_list:list[dict]
|
|
|
- ) -> int:
|
|
|
+ def save_similarity_score_to_table(self, association_list: list[dict]) -> int:
|
|
|
"""
|
|
|
calculate similarity between seed_title_list and association_title_list
|
|
|
"""
|
|
|
- association_id_list = [i['id'] for i in association_list]
|
|
|
- association_title_list = [i['title'] for i in association_list]
|
|
|
- seed_title_list = [i['seed_title'] for i in association_list]
|
|
|
- similarity_score_list = similarity_between_title_list(seed_title_list, association_title_list)
|
|
|
+ association_id_list = [i["id"] for i in association_list]
|
|
|
+ association_title_list = [i["title"] for i in association_list]
|
|
|
+ seed_title_list = [i["seed_title"] for i in association_list]
|
|
|
+ similarity_score_list = similarity_between_title_list(
|
|
|
+ seed_title_list, association_title_list
|
|
|
+ )
|
|
|
similarity_score_array = np.array(similarity_score_list)
|
|
|
|
|
|
# get main diagonal score
|
|
@@ -74,7 +84,7 @@ class CrawlerAccounts:
|
|
|
|
|
|
params.append(tuple(association_id_list))
|
|
|
case_statements = "\n".join(case_statement)
|
|
|
- formatted_sql = batch_update_query.format(case_statements)
|
|
|
+ formatted_sql = batch_update_query.format(case_statements)
|
|
|
affected_rows = self.db_client.save(formatted_sql, params)
|
|
|
return affected_rows
|
|
|
|
|
@@ -97,34 +107,30 @@ class ChannelAccountCrawler(CrawlerAccounts):
|
|
|
2. use search api to get accounts
|
|
|
"""
|
|
|
|
|
|
- def get_seed_keys(self):
|
|
|
- """
|
|
|
- get search keys from database
|
|
|
- """
|
|
|
- fetch_query = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
|
|
|
- result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
|
|
|
- return result
|
|
|
-
|
|
|
def process_each_video(self, video: dict, seed_title: str):
|
|
|
"""
|
|
|
process video item and save to database
|
|
|
"""
|
|
|
- account_name = video['items'][0]['source']['title']
|
|
|
- search_account_response = search_in_wechat_channel(search_key=account_name, search_type=2)
|
|
|
- account_detail = search_account_response['data']['data'][0]['items'][0]
|
|
|
- account_id = account_detail['jumpInfo']['userName']
|
|
|
+ account_name = video["items"][0]["source"]["title"]
|
|
|
+ search_account_response = search_in_wechat_channel(
|
|
|
+ search_key=account_name, search_type=2
|
|
|
+ )
|
|
|
+ account_detail = search_account_response["data"]["data"][0]["items"][0]
|
|
|
+ account_id = account_detail["jumpInfo"]["userName"]
|
|
|
search_video_response = get_channel_account_videos(account_id)
|
|
|
- video_list = search_video_response['data']['object']
|
|
|
+ video_list = search_video_response["data"]["object"]
|
|
|
for video in video_list[:5]:
|
|
|
video_item = Item()
|
|
|
video_item.add("account_name", account_name)
|
|
|
video_item.add("account_id", account_id)
|
|
|
- video_item.add("recommend_video_id", video['id'])
|
|
|
- video_item.add("title", video['objectDesc']['description'])
|
|
|
- video_item.add("duration", video['objectDesc']['media'][0]['VideoPlayLen'])
|
|
|
+ video_item.add("recommend_video_id", video["id"])
|
|
|
+ video_item.add("title", video["objectDesc"]["description"])
|
|
|
+ video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
|
|
|
video_item.add("seed_account", "SearchWithOutAccount")
|
|
|
video_item.add("seed_title", seed_title)
|
|
|
- video_item.add("recommend_date", datetime.datetime.today().strftime("%Y-%m-%d"))
|
|
|
+ video_item.add(
|
|
|
+ "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
|
|
|
+ )
|
|
|
video_item.add("platform", "sph")
|
|
|
# check item
|
|
|
video_item.check(source="association")
|
|
@@ -138,8 +144,8 @@ class ChannelAccountCrawler(CrawlerAccounts):
|
|
|
"""
|
|
|
search_response = search_in_wechat_channel(search_key=title, search_type=1)
|
|
|
print(search_response)
|
|
|
- video_list = search_response['data']['data'][0]['subBoxes']
|
|
|
- for video in tqdm(video_list, desc='crawler each video'):
|
|
|
+ video_list = search_response["data"]["data"][0]["subBoxes"]
|
|
|
+ for video in tqdm(video_list, desc="crawler each video"):
|
|
|
try:
|
|
|
self.process_each_video(video, seed_title=title)
|
|
|
except Exception as e:
|
|
@@ -150,9 +156,9 @@ class ChannelAccountCrawler(CrawlerAccounts):
|
|
|
|
|
|
def deal(self):
    """
    Run a title search for every seed row returned by ``get_seed_keys``.

    Each row's ``"title"`` value is passed to
    ``search_by_title_from_database``. A failure on one title is printed
    and the loop moves on to the next title.
    """
    seed_rows = self.get_seed_keys()
    for seed_row in tqdm(seed_rows, desc="crawler each title"):
        try:
            self.search_by_title_from_database(title=seed_row["title"])
        except Exception as error:
            # best effort: report the failure and keep crawling
            print(error)
|
|
|
|
|
@@ -161,6 +167,7 @@ class ChannelAccountCrawler(CrawlerAccounts):
|
|
|
affected_rows = self.save_similarity_score_to_table(video_list)
|
|
|
print(affected_rows)
|
|
|
|
|
|
+
|
|
|
class ToutiaoAccountCrawler(CrawlerAccounts):
|
|
|
|
|
|
def get_seed_videos(self):
|
|
@@ -233,3 +240,8 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
|
|
|
video_list = self.get_video_list_without_score()
|
|
|
affected_rows = self.save_similarity_score_to_table(video_list)
|
|
|
print(affected_rows)
|
|
|
+
|
|
|
+
|
|
|
class HaoKanAccountCrawler(CrawlerAccounts):
    """
    Stub subclass of ``CrawlerAccounts``; ``deal`` does no work yet.
    """

    def deal(self):
        """Do nothing and return ``None``."""
        return None
|