luojunhui преди 7 месеца
родител
ревизия
749923a96c
променени са 2 файла, в които са добавени 58 реда и са изтрити 30 реда
  1. 16 0
      account_crawler_task.py
  2. 42 30
      tasks/crawler_accounts_by_association.py

+ 16 - 0
account_crawler_task.py

@@ -0,0 +1,16 @@
+"""
+@author: luojunhui
+@description: entry script that crawls additional accounts from WeChat channels and Toutiao
+"""
+from tasks.crawler_accounts_by_association import ChannelAccountCrawler
+from tasks.crawler_accounts_by_association import ToutiaoAccountCrawler
+
+
+if __name__ == '__main__':
+    # crawl accounts from channels
+    channel_account_crawler = ChannelAccountCrawler()
+    channel_account_crawler.deal()
+
+    # crawl accounts from toutiao
+    toutiao_crawler = ToutiaoAccountCrawler()
+    toutiao_crawler.deal()

+ 42 - 30
tasks/crawler_accounts.py → tasks/crawler_accounts_by_association.py

@@ -34,6 +34,14 @@ class CrawlerAccounts:
         self.db_client = DatabaseConnector(db_config=long_articles_config)
         self.db_client.connect()
 
+    def get_seed_keys(self):
+        """
+        get search keys from database
+        """
+        fetch_query = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
+        result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        return result
+
     def insert_video_into_recommend_table(self, item):
         # whether account exists
         final_item = scrape_account_entities_process(item, self.db_client)
@@ -41,18 +49,20 @@ class CrawlerAccounts:
             return
         else:
             # save to db
-            insert_into_associated_recommendation_table(db_client=self.db_client, associated_recommendation_item=final_item)
+            insert_into_associated_recommendation_table(
+                db_client=self.db_client, associated_recommendation_item=final_item
+            )
 
-    def save_similarity_score_to_table(
-            self, association_list:list[dict]
-    ) -> int:
+    def save_similarity_score_to_table(self, association_list: list[dict]) -> int:
         """
         calculate similarity between seed_title_list and association_title_list
         """
-        association_id_list = [i['id'] for i in association_list]
-        association_title_list = [i['title'] for i in association_list]
-        seed_title_list = [i['seed_title'] for i in association_list]
-        similarity_score_list =  similarity_between_title_list(seed_title_list, association_title_list)
+        association_id_list = [i["id"] for i in association_list]
+        association_title_list = [i["title"] for i in association_list]
+        seed_title_list = [i["seed_title"] for i in association_list]
+        similarity_score_list = similarity_between_title_list(
+            seed_title_list, association_title_list
+        )
         similarity_score_array = np.array(similarity_score_list)
 
         # get main diagonal score
@@ -74,7 +84,7 @@ class CrawlerAccounts:
 
         params.append(tuple(association_id_list))
         case_statements = "\n".join(case_statement)
-        formatted_sql =  batch_update_query.format(case_statements)
+        formatted_sql = batch_update_query.format(case_statements)
         affected_rows = self.db_client.save(formatted_sql, params)
         return affected_rows
 
@@ -97,34 +107,30 @@ class ChannelAccountCrawler(CrawlerAccounts):
         2. use search api to get accounts
     """
 
-    def get_seed_keys(self):
-        """
-        get search keys from database
-        """
-        fetch_query = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
-        result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
-        return result
-
     def process_each_video(self, video: dict, seed_title: str):
         """
         process video item and save to database
         """
-        account_name = video['items'][0]['source']['title']
-        search_account_response = search_in_wechat_channel(search_key=account_name, search_type=2)
-        account_detail = search_account_response['data']['data'][0]['items'][0]
-        account_id = account_detail['jumpInfo']['userName']
+        account_name = video["items"][0]["source"]["title"]
+        search_account_response = search_in_wechat_channel(
+            search_key=account_name, search_type=2
+        )
+        account_detail = search_account_response["data"]["data"][0]["items"][0]
+        account_id = account_detail["jumpInfo"]["userName"]
         search_video_response = get_channel_account_videos(account_id)
-        video_list = search_video_response['data']['object']
+        video_list = search_video_response["data"]["object"]
         for video in video_list[:5]:
             video_item = Item()
             video_item.add("account_name", account_name)
             video_item.add("account_id", account_id)
-            video_item.add("recommend_video_id", video['id'])
-            video_item.add("title", video['objectDesc']['description'])
-            video_item.add("duration", video['objectDesc']['media'][0]['VideoPlayLen'])
+            video_item.add("recommend_video_id", video["id"])
+            video_item.add("title", video["objectDesc"]["description"])
+            video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
             video_item.add("seed_account", "SearchWithOutAccount")
             video_item.add("seed_title", seed_title)
-            video_item.add("recommend_date", datetime.datetime.today().strftime("%Y-%m-%d"))
+            video_item.add(
+                "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
+            )
             video_item.add("platform", "sph")
             # check item
             video_item.check(source="association")
@@ -138,8 +144,8 @@ class ChannelAccountCrawler(CrawlerAccounts):
         """
         search_response = search_in_wechat_channel(search_key=title, search_type=1)
         print(search_response)
-        video_list = search_response['data']['data'][0]['subBoxes']
-        for video in tqdm(video_list, desc='crawler each video'):
+        video_list = search_response["data"]["data"][0]["subBoxes"]
+        for video in tqdm(video_list, desc="crawler each video"):
             try:
                 self.process_each_video(video, seed_title=title)
             except Exception as e:
@@ -150,9 +156,9 @@ class ChannelAccountCrawler(CrawlerAccounts):
 
     def deal(self):
         seed_title_list = self.get_seed_keys()
-        for item in tqdm(seed_title_list, desc='crawler each title'):
+        for item in tqdm(seed_title_list, desc="crawler each title"):
             try:
-                self.search_by_title_from_database(title=item['title'])
+                self.search_by_title_from_database(title=item["title"])
             except Exception as e:
                 print(e)
 
@@ -161,6 +167,7 @@ class ChannelAccountCrawler(CrawlerAccounts):
         affected_rows = self.save_similarity_score_to_table(video_list)
         print(affected_rows)
 
+
 class ToutiaoAccountCrawler(CrawlerAccounts):
 
     def get_seed_videos(self):
@@ -233,3 +240,8 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
         video_list = self.get_video_list_without_score()
         affected_rows = self.save_similarity_score_to_table(video_list)
         print(affected_rows)
+
+
+class HaoKanAccountCrawler(CrawlerAccounts):
+    def deal(self):
+        return