|  | @@ -34,6 +34,14 @@ class CrawlerAccounts:
 | 
	
		
			
				|  |  |          self.db_client = DatabaseConnector(db_config=long_articles_config)
 | 
	
		
			
				|  |  |          self.db_client.connect()
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +    def get_seed_keys(self):
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  | +        get search keys from database
 | 
	
		
			
				|  |  | +        """
 | 
	
		
			
				|  |  | +        fetch_query = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
 | 
	
		
			
				|  |  | +        result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
 | 
	
		
			
				|  |  | +        return result
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |      def insert_video_into_recommend_table(self, item):
 | 
	
		
			
				|  |  |          # whether account exists
 | 
	
		
			
				|  |  |          final_item = scrape_account_entities_process(item, self.db_client)
 | 
	
	
		
			
				|  | @@ -41,18 +49,20 @@ class CrawlerAccounts:
 | 
	
		
			
				|  |  |              return
 | 
	
		
			
				|  |  |          else:
 | 
	
		
			
				|  |  |              # save to db
 | 
	
		
			
				|  |  | -            insert_into_associated_recommendation_table(db_client=self.db_client, associated_recommendation_item=final_item)
 | 
	
		
			
				|  |  | +            insert_into_associated_recommendation_table(
 | 
	
		
			
				|  |  | +                db_client=self.db_client, associated_recommendation_item=final_item
 | 
	
		
			
				|  |  | +            )
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def save_similarity_score_to_table(
 | 
	
		
			
				|  |  | -            self, association_list:list[dict]
 | 
	
		
			
				|  |  | -    ) -> int:
 | 
	
		
			
				|  |  | +    def save_similarity_score_to_table(self, association_list: list[dict]) -> int:
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          calculate similarity between seed_title_list and association_title_list
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  | -        association_id_list = [i['id'] for i in association_list]
 | 
	
		
			
				|  |  | -        association_title_list = [i['title'] for i in association_list]
 | 
	
		
			
				|  |  | -        seed_title_list = [i['seed_title'] for i in association_list]
 | 
	
		
			
				|  |  | -        similarity_score_list =  similarity_between_title_list(seed_title_list, association_title_list)
 | 
	
		
			
				|  |  | +        association_id_list = [i["id"] for i in association_list]
 | 
	
		
			
				|  |  | +        association_title_list = [i["title"] for i in association_list]
 | 
	
		
			
				|  |  | +        seed_title_list = [i["seed_title"] for i in association_list]
 | 
	
		
			
				|  |  | +        similarity_score_list = similarity_between_title_list(
 | 
	
		
			
				|  |  | +            seed_title_list, association_title_list
 | 
	
		
			
				|  |  | +        )
 | 
	
		
			
				|  |  |          similarity_score_array = np.array(similarity_score_list)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          # get main diagonal score
 | 
	
	
		
			
				|  | @@ -74,7 +84,7 @@ class CrawlerAccounts:
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |          params.append(tuple(association_id_list))
 | 
	
		
			
				|  |  |          case_statements = "\n".join(case_statement)
 | 
	
		
			
				|  |  | -        formatted_sql =  batch_update_query.format(case_statements)
 | 
	
		
			
				|  |  | +        formatted_sql = batch_update_query.format(case_statements)
 | 
	
		
			
				|  |  |          affected_rows = self.db_client.save(formatted_sql, params)
 | 
	
		
			
				|  |  |          return affected_rows
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -97,34 +107,30 @@ class ChannelAccountCrawler(CrawlerAccounts):
 | 
	
		
			
				|  |  |          2. use search api to get accounts
 | 
	
		
			
				|  |  |      """
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    def get_seed_keys(self):
 | 
	
		
			
				|  |  | -        """
 | 
	
		
			
				|  |  | -        get search keys from database
 | 
	
		
			
				|  |  | -        """
 | 
	
		
			
				|  |  | -        fetch_query = "select title from article_pool_promotion_source where status = 1 and deleted = 0 order by level limit 100;"
 | 
	
		
			
				|  |  | -        result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
 | 
	
		
			
				|  |  | -        return result
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  |      def process_each_video(self, video: dict, seed_title: str):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          process video item and save to database
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  | -        account_name = video['items'][0]['source']['title']
 | 
	
		
			
				|  |  | -        search_account_response = search_in_wechat_channel(search_key=account_name, search_type=2)
 | 
	
		
			
				|  |  | -        account_detail = search_account_response['data']['data'][0]['items'][0]
 | 
	
		
			
				|  |  | -        account_id = account_detail['jumpInfo']['userName']
 | 
	
		
			
				|  |  | +        account_name = video["items"][0]["source"]["title"]
 | 
	
		
			
				|  |  | +        search_account_response = search_in_wechat_channel(
 | 
	
		
			
				|  |  | +            search_key=account_name, search_type=2
 | 
	
		
			
				|  |  | +        )
 | 
	
		
			
				|  |  | +        account_detail = search_account_response["data"]["data"][0]["items"][0]
 | 
	
		
			
				|  |  | +        account_id = account_detail["jumpInfo"]["userName"]
 | 
	
		
			
				|  |  |          search_video_response = get_channel_account_videos(account_id)
 | 
	
		
			
				|  |  | -        video_list = search_video_response['data']['object']
 | 
	
		
			
				|  |  | +        video_list = search_video_response["data"]["object"]
 | 
	
		
			
				|  |  |          for video in video_list[:5]:
 | 
	
		
			
				|  |  |              video_item = Item()
 | 
	
		
			
				|  |  |              video_item.add("account_name", account_name)
 | 
	
		
			
				|  |  |              video_item.add("account_id", account_id)
 | 
	
		
			
				|  |  | -            video_item.add("recommend_video_id", video['id'])
 | 
	
		
			
				|  |  | -            video_item.add("title", video['objectDesc']['description'])
 | 
	
		
			
				|  |  | -            video_item.add("duration", video['objectDesc']['media'][0]['VideoPlayLen'])
 | 
	
		
			
				|  |  | +            video_item.add("recommend_video_id", video["id"])
 | 
	
		
			
				|  |  | +            video_item.add("title", video["objectDesc"]["description"])
 | 
	
		
			
				|  |  | +            video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
 | 
	
		
			
				|  |  |              video_item.add("seed_account", "SearchWithOutAccount")
 | 
	
		
			
				|  |  |              video_item.add("seed_title", seed_title)
 | 
	
		
			
				|  |  | -            video_item.add("recommend_date", datetime.datetime.today().strftime("%Y-%m-%d"))
 | 
	
		
			
				|  |  | +            video_item.add(
 | 
	
		
			
				|  |  | +                "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
 | 
	
		
			
				|  |  | +            )
 | 
	
		
			
				|  |  |              video_item.add("platform", "sph")
 | 
	
		
			
				|  |  |              # check item
 | 
	
		
			
				|  |  |              video_item.check(source="association")
 | 
	
	
		
			
				|  | @@ -138,8 +144,8 @@ class ChannelAccountCrawler(CrawlerAccounts):
 | 
	
		
			
				|  |  |          """
 | 
	
		
			
				|  |  |          search_response = search_in_wechat_channel(search_key=title, search_type=1)
 | 
	
		
			
				|  |  |          print(search_response)
 | 
	
		
			
				|  |  | -        video_list = search_response['data']['data'][0]['subBoxes']
 | 
	
		
			
				|  |  | -        for video in tqdm(video_list, desc='crawler each video'):
 | 
	
		
			
				|  |  | +        video_list = search_response["data"]["data"][0]["subBoxes"]
 | 
	
		
			
				|  |  | +        for video in tqdm(video_list, desc="crawler each video"):
 | 
	
		
			
				|  |  |              try:
 | 
	
		
			
				|  |  |                  self.process_each_video(video, seed_title=title)
 | 
	
		
			
				|  |  |              except Exception as e:
 | 
	
	
		
			
				|  | @@ -150,9 +156,9 @@ class ChannelAccountCrawler(CrawlerAccounts):
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def deal(self):
 | 
	
		
			
				|  |  |          seed_title_list = self.get_seed_keys()
 | 
	
		
			
				|  |  | -        for item in tqdm(seed_title_list, desc='crawler each title'):
 | 
	
		
			
				|  |  | +        for item in tqdm(seed_title_list, desc="crawler each title"):
 | 
	
		
			
				|  |  |              try:
 | 
	
		
			
				|  |  | -                self.search_by_title_from_database(title=item['title'])
 | 
	
		
			
				|  |  | +                self.search_by_title_from_database(title=item["title"])
 | 
	
		
			
				|  |  |              except Exception as e:
 | 
	
		
			
				|  |  |                  print(e)
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -161,6 +167,7 @@ class ChannelAccountCrawler(CrawlerAccounts):
 | 
	
		
			
				|  |  |          affected_rows = self.save_similarity_score_to_table(video_list)
 | 
	
		
			
				|  |  |          print(affected_rows)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |  class ToutiaoAccountCrawler(CrawlerAccounts):
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      def get_seed_videos(self):
 | 
	
	
		
			
				|  | @@ -233,3 +240,8 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
 | 
	
		
			
				|  |  |          video_list = self.get_video_list_without_score()
 | 
	
		
			
				|  |  |          affected_rows = self.save_similarity_score_to_table(video_list)
 | 
	
		
			
				|  |  |          print(affected_rows)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +class HaoKanAccountCrawler(CrawlerAccounts):
 | 
	
		
			
				|  |  | +    def deal(self):
 | 
	
		
			
				|  |  | +        return
 |