瀏覽代碼

gzh抓取优化

luojunhui 3 周之前
父節點
當前提交
48e8c4b06a
共有 1 個文件被更改,包括 3 次插入 和 2 次删除
  1. 3 2
      applications/tasks/crawler_tasks/crawler_gzh.py

+ 3 - 2
applications/tasks/crawler_tasks/crawler_gzh.py

@@ -134,7 +134,7 @@ class CrawlerGzhBaseStrategy(CrawlerPipeline, CrawlerGzhConst):
         await self.save_item_to_database(
             media_type="article", item=new_item, trace_id=self.trace_id
         )
-        await asyncio.sleep(self.SLEEP_SECONDS)
+        # await asyncio.sleep(self.SLEEP_SECONDS)
 
     async def update_account_read_avg_info(self, gh_id, account_name):
         """update account read avg info"""
@@ -288,7 +288,8 @@ class CrawlerGzhSearchArticles(CrawlerGzhBaseStrategy):
         """
         @description: 对于搜索到的文章list,获取文章详情, 并且存储到meta表中
         """
-        for article in article_list:
+        for article in tqdm(article_list, desc="获取搜索结果详情"):
+            print(f"{datetime.now()}: start crawling article: {article}")
             url = article["url"]
             detail_response = await get_article_detail(url, is_count=True, is_cache=False)
             if not detail_response: