Ver código fonte

测试账号抓取

luojunhui 6 meses atrás
pai
commit
c6cb765942

+ 2 - 1
applications/functions.py

@@ -172,7 +172,7 @@ class Functions(object):
         response = requests.get(
             url=article_url,
             headers={'User-Agent': FakeUserAgent().random},
-
+            proxies=cls.proxy()
         )
         html_text = response.text
         w = re.search(
@@ -195,6 +195,7 @@ class Functions(object):
         response = requests.get(
             url=article_url,
             headers={'User-Agent': FakeUserAgent().random},
+            proxies=cls.proxy()
         )
         html_text = response.text
         # 正则表达式用于提取 hit_nickname 和 hit_username

+ 6 - 3
coldStartTasks/crawler/weixin_video_crawler.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 抓取视频
 """
+import json
 import time
 import traceback
 from typing import List, Dict
@@ -74,8 +75,10 @@ class WeixinVideoCrawler(object):
         latest_crawler_timestamp = account_obj["latest_crawler_timestamp"]
         if latest_crawler_timestamp is None:
             latest_crawler_timestamp = const.DEFAULT_TIMESTAMP
+        print(gh_id, account_name, latest_crawler_timestamp)
         # 调用爬虫接口
         response = spider.update_msg_list(gh_id, index=cursor)
+        print(json.dumps(response, ensure_ascii=False, indent=4))
         if response['code'] == const.REQUEST_SUCCESS:
             # 一般返回最近10天的msg_list
             msg_list = response.get('data', {}).get("data", [])
@@ -214,7 +217,7 @@ class WeixinVideoCrawler(object):
         :return:
         """
         account_list = self.get_crawler_accounts()
-        for account_obj in tqdm(account_list, desc="crawler_video_for_each_account"):
+        for account_obj in tqdm(account_list[3:4], desc="crawler_video_for_each_account"):
             self.crawler_article_video_list(account_obj)
             self.update_account_latest_crawler_timestamp(gh_id=account_obj["gh_id"])
             time.sleep(const.SLEEP_SECONDS)
@@ -240,9 +243,9 @@ class WeixinVideoCrawler(object):
         执行任务
         :return:
         """
-        start_timestamp = int(time.time())
+        # start_timestamp = int(time.time())
         self.crawler_task()
-        self.mention(start_timestamp)
+        # self.mention(start_timestamp)