Bladeren bron

Merge branch '2025-05-07-account-crawler' of luojunhui/LongArticlesJob into master

luojunhui 5 maanden geleden
bovenliggende commit 014bbdd283

+ 5 - 6
account_explore_task.py

@@ -3,11 +3,11 @@
 @description: try to get some more accounts
 """
 
-from tasks.crawler_accounts_by_association import ChannelsAccountCrawler
-from tasks.crawler_accounts_by_association import ToutiaoAccountCrawler
-from tasks.crawler_accounts_by_association import HaoKanAccountCrawler
-from tasks.crawler_accounts_by_association import GzhAccountCrawler
-from tasks.generate_search_keys import get_association_title_list_in_multi_threads
+from tasks.crawler_tasks.crawler_account.crawler_accounts_by_association import ChannelsAccountCrawler
+from tasks.crawler_tasks.crawler_account.crawler_accounts_by_association import ToutiaoAccountCrawler
+from tasks.crawler_tasks.crawler_account.crawler_accounts_by_association import HaoKanAccountCrawler
+from tasks.crawler_tasks.crawler_account.crawler_accounts_by_association import GzhAccountCrawler
+from tasks.ai_tasks.generate_search_keys import get_association_title_list_in_multi_threads
 
 
 def deal_each_platform(platform: str) -> None:
@@ -33,7 +33,6 @@ def deal_each_platform(platform: str) -> None:
 
 if __name__ == "__main__":
     # get_association_title_list_in_multi_threads()
-    get_association_title_list_in_multi_threads()
 
     # get each platform
     platform_list = ["sph", "hksp", "toutiao", "gzh"]

+ 1 - 1
account_quality_analysis.py

@@ -1,4 +1,4 @@
-from tasks.account_recognize_by_llm import AccountRecognizer
+from tasks.ai_tasks.account_recognize_by_llm import AccountRecognizer
 
 
 def main():

+ 8 - 4
applications/pipeline/account_pipeline.py

@@ -14,10 +14,14 @@ def whether_duplicate_account_id(account_id: str, platform: str, db_client: Data
         select id, status from video_meta_accounts
         where account_id = %s and platform = %s;
     """
-    duplicate_id, status = db_client.fetch(query=sql, params=(account_id, platform))[0]
-    if duplicate_id and status:
-        return True
-    return False
+    fetch_response = db_client.fetch(query=sql, params=(account_id, platform))
+    if fetch_response:
+        duplicate_id, status = fetch_response[0]
+        if duplicate_id and status:
+            return True
+        return False
+    else:
+        return False
 
 def scrape_account_entities_process(account_item: dict, db_client: DatabaseConnector) -> dict:
     """

+ 1 - 1
crawler_sph_video.py

@@ -2,7 +2,7 @@
 @author: luojunhui
 """
 
-from tasks.crawler_channel_account_videos import CrawlerChannelAccountVideos
+from tasks.crawler_tasks.crawler_video.crawler_sph_videos import CrawlerChannelAccountVideos
 
 if __name__ == "__main__":
     crawler_channel_account_videos = CrawlerChannelAccountVideos()

+ 0 - 0
tasks/account_recognize_by_llm.py → tasks/ai_tasks/account_recognize_by_llm.py


+ 0 - 0
tasks/generate_search_keys.py → tasks/ai_tasks/generate_search_keys.py


+ 0 - 0
tasks/title_rewrite_task.py → tasks/ai_tasks/title_rewrite_task.py


+ 3 - 1
tasks/crawler_accounts_by_association.py → tasks/crawler_tasks/crawler_account/crawler_accounts_by_association.py

@@ -227,6 +227,7 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
 
         # check item
         account_item.check(source="candidate_account")
+        print(account_item.item)
 
         # insert into database
         self.insert_video_into_recommend_table(account_item.item)
@@ -353,7 +354,7 @@ class GzhAccountCrawler(CrawlerAccounts):
         fetch_query = f"""
             select id, article_url
             from publish_single_video_source
-            where source_account = 1 and platform = 'gzh' limit 10;
+            where source_account = 1 and bad_status = 0 and platform = 'gzh' limit 100;
         """
         task_list = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
         return task_list
@@ -401,6 +402,7 @@ class GzhAccountCrawler(CrawlerAccounts):
         # extract source account
         source_account = get_source_account_from_article(article_link)
         if not source_account:
+            print("No source account found \t", article_link)
             return
         else:
             account_name = source_account['name']

+ 0 - 0
tasks/crawler_channel_account_videos.py → tasks/crawler_tasks/crawler_video/crawler_sph_videos.py


+ 0 - 0
tasks/crawler_toutiao_account_videos.py → tasks/crawler_tasks/crawler_video/crawler_toutiao_videos.py


+ 1 - 1
title_process_task.py

@@ -2,7 +2,7 @@
 @author: luojunhui
 """
 from tasks.ai_tasks.category_generation_task import CategoryGenerationTask
-from tasks.title_rewrite_task import TitleRewriteTask
+from tasks.ai_tasks.title_rewrite_task import TitleRewriteTask
 
 
 if __name__ == '__main__':

+ 1 - 1
toutiao_video_crawler.py

@@ -2,7 +2,7 @@
 @author: luojunhui
 """
 
-from tasks.crawler_toutiao_account_videos import CrawlerToutiaoAccountVideos
+from tasks.crawler_tasks.crawler_video.crawler_toutiao_videos import CrawlerToutiaoAccountVideos
 
 
 if __name__ == '__main__':