luojunhui 3 months ago
parent
commit
749923a96c

+ 16 - 0
account_crawler_task.py

@@ -0,0 +1,16 @@
+"""
+@author: luojunhui
+@description: try to get some more accounts
+"""
+from tasks.crawler_accounts_by_association import ChannelAccountCrawler
+from tasks.crawler_accounts_by_association import ToutiaoAccountCrawler
+
+
+if __name__ == '__main__':
+    # crawler channels
+    channel_account_crawler = ChannelAccountCrawler()
+    channel_account_crawler.deal()
+
+    # crawler toutiao
+    toutiao_crawler = ToutiaoAccountCrawler()
+    toutiao_crawler.deal()
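
The new entrypoint above runs the two crawlers back to back; if the channel crawler raises, the Toutiao crawler never starts. Below is a minimal sketch of a more defensive variant, assuming the same ChannelAccountCrawler / ToutiaoAccountCrawler classes and their deal() method shown in this diff. The per-crawler try/except and the run_crawlers helper are illustrative additions, not part of the commit.

    """
    Sketch only: isolate each crawler so one failure does not abort the other.
    Assumes the deal() interface from tasks/crawler_accounts_by_association.py.
    """
    from tasks.crawler_accounts_by_association import ChannelAccountCrawler
    from tasks.crawler_accounts_by_association import ToutiaoAccountCrawler


    def run_crawlers():
        # run each crawler independently; the guard below is hypothetical,
        # not present in the original account_crawler_task.py
        for crawler_cls in (ChannelAccountCrawler, ToutiaoAccountCrawler):
            try:
                crawler_cls().deal()
            except Exception as e:
                print(f"{crawler_cls.__name__} failed: {e}")


    if __name__ == "__main__":
        run_crawlers()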

+ 42 - 30
tasks/crawler_accounts.py → tasks/crawler_accounts_by_association.py

@@ -34,6 +34,14 @@ class CrawlerAccounts:
         self.db_client = DatabaseConnector(db_config=long_articles_config)
         self.db_client.connect()
 
+    def get_seed_keys(self):
+        """
+        get search keys from database
+        """
+        fetch_query = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
+        result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        return result
+
     def insert_video_into_recommend_table(self, item):
         # whether account exists
         final_item = scrape_account_entities_process(item, self.db_client)
@@ -41,18 +49,20 @@ class CrawlerAccounts:
             return
         else:
             # save to db
-            insert_into_associated_recommendation_table(db_client=self.db_client, associated_recommendation_item=final_item)
+            insert_into_associated_recommendation_table(
+                db_client=self.db_client, associated_recommendation_item=final_item
+            )
 
-    def save_similarity_score_to_table(
-            self, association_list:list[dict]
-    ) -> int:
+    def save_similarity_score_to_table(self, association_list: list[dict]) -> int:
         """
         calculate similarity between seed_title_list and association_title_list
         """
-        association_id_list = [i['id'] for i in association_list]
-        association_title_list = [i['title'] for i in association_list]
-        seed_title_list = [i['seed_title'] for i in association_list]
-        similarity_score_list =  similarity_between_title_list(seed_title_list, association_title_list)
+        association_id_list = [i["id"] for i in association_list]
+        association_title_list = [i["title"] for i in association_list]
+        seed_title_list = [i["seed_title"] for i in association_list]
+        similarity_score_list = similarity_between_title_list(
+            seed_title_list, association_title_list
+        )
         similarity_score_array = np.array(similarity_score_list)
 
         # get main diagonal score
@@ -74,7 +84,7 @@ class CrawlerAccounts:
 
         params.append(tuple(association_id_list))
         case_statements = "\n".join(case_statement)
-        formatted_sql =  batch_update_query.format(case_statements)
+        formatted_sql = batch_update_query.format(case_statements)
         affected_rows = self.db_client.save(formatted_sql, params)
         return affected_rows
 
@@ -97,34 +107,30 @@ class ChannelAccountCrawler(CrawlerAccounts):
         2. use search api to get accounts
     """
 
-    def get_seed_keys(self):
-        """
-        get search keys from database
-        """
-        fetch_query = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
-        result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
-        return result
-
     def process_each_video(self, video: dict, seed_title: str):
         """
         process video item and save to database
         """
-        account_name = video['items'][0]['source']['title']
-        search_account_response = search_in_wechat_channel(search_key=account_name, search_type=2)
-        account_detail = search_account_response['data']['data'][0]['items'][0]
-        account_id = account_detail['jumpInfo']['userName']
+        account_name = video["items"][0]["source"]["title"]
+        search_account_response = search_in_wechat_channel(
+            search_key=account_name, search_type=2
+        )
+        account_detail = search_account_response["data"]["data"][0]["items"][0]
+        account_id = account_detail["jumpInfo"]["userName"]
         search_video_response = get_channel_account_videos(account_id)
-        video_list = search_video_response['data']['object']
+        video_list = search_video_response["data"]["object"]
         for video in video_list[:5]:
             video_item = Item()
             video_item.add("account_name", account_name)
             video_item.add("account_id", account_id)
-            video_item.add("recommend_video_id", video['id'])
-            video_item.add("title", video['objectDesc']['description'])
-            video_item.add("duration", video['objectDesc']['media'][0]['VideoPlayLen'])
+            video_item.add("recommend_video_id", video["id"])
+            video_item.add("title", video["objectDesc"]["description"])
+            video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
             video_item.add("seed_account", "SearchWithOutAccount")
             video_item.add("seed_title", seed_title)
-            video_item.add("recommend_date", datetime.datetime.today().strftime("%Y-%m-%d"))
+            video_item.add(
+                "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
+            )
             video_item.add("platform", "sph")
             # check item
             video_item.check(source="association")
@@ -138,8 +144,8 @@ class ChannelAccountCrawler(CrawlerAccounts):
         """
         search_response = search_in_wechat_channel(search_key=title, search_type=1)
         print(search_response)
-        video_list = search_response['data']['data'][0]['subBoxes']
-        for video in tqdm(video_list, desc='crawler each video'):
+        video_list = search_response["data"]["data"][0]["subBoxes"]
+        for video in tqdm(video_list, desc="crawler each video"):
             try:
                 self.process_each_video(video, seed_title=title)
             except Exception as e:
@@ -150,9 +156,9 @@ class ChannelAccountCrawler(CrawlerAccounts):
 
     def deal(self):
         seed_title_list = self.get_seed_keys()
-        for item in tqdm(seed_title_list, desc='crawler each title'):
+        for item in tqdm(seed_title_list, desc="crawler each title"):
             try:
-                self.search_by_title_from_database(title=item['title'])
+                self.search_by_title_from_database(title=item["title"])
             except Exception as e:
                 print(e)
 
@@ -161,6 +167,7 @@ class ChannelAccountCrawler(CrawlerAccounts):
         affected_rows = self.save_similarity_score_to_table(video_list)
         print(affected_rows)
 
+
 class ToutiaoAccountCrawler(CrawlerAccounts):
 
     def get_seed_videos(self):
@@ -233,3 +240,8 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
         video_list = self.get_video_list_without_score()
         affected_rows = self.save_similarity_score_to_table(video_list)
         print(affected_rows)
+
+
+class HaoKanAccountCrawler(CrawlerAccounts):
+    def deal(self):
+        return
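
For reference, save_similarity_score_to_table in the renamed module builds a score matrix from seed titles and associated titles and then reads its main diagonal, so the i-th seed title is scored against the i-th associated title. The sketch below illustrates only that diagonal step; the dummy similarity_between_title_list is a stand-in, since the real scorer is imported elsewhere in the project and is not shown in this diff.

    # Sketch of the diagonal extraction, assuming similarity_between_title_list
    # returns an N x N matrix of pairwise scores (rows: seed titles, cols: associated titles).
    import numpy as np


    def similarity_between_title_list(seed_titles, association_titles):
        # placeholder scorer: 1.0 when the two titles share a word, else 0.0
        return [
            [1.0 if set(s.split()) & set(a.split()) else 0.0 for a in association_titles]
            for s in seed_titles
        ]


    seed_title_list = ["cat video tips", "cooking at home"]
    association_title_list = ["cat video compilation", "home cooking basics"]

    similarity_score_array = np.array(
        similarity_between_title_list(seed_title_list, association_title_list)
    )
    # the main diagonal pairs seed_title_list[i] with association_title_list[i]
    score_list = similarity_score_array.diagonal().tolist()
    print(score_list)  # [1.0, 1.0] with the placeholder scorer above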