浏览代码

1. 好看视频:开发自动翻页功能
2. 视频号:修复 bug(爬虫在翻页过程中不会自动获取更新后的 token 和 cookie;现改为在翻页循环内每次请求前重新获取 token 和 cookie 并重建请求头)

罗俊辉 1 年之前
父节点
当前提交
6ca1301eca

文件差异内容过多而无法显示
+ 13 - 0
haokanshipin/haokanshipin_author/fanye_test.py


+ 3 - 3
haokanshipin/haokanshipin_author/hksp_test.py

@@ -145,7 +145,7 @@ class HaoKanVideoAccount(object):
         )
         )
         if pipeline.process_item():
         if pipeline.process_item():
             # self.mq.send_msg(mq_obj)
             # self.mq.send_msg(mq_obj)
-            print(json.dumps(mq_obj))
+            print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
             print("成功发送至 ETL")
             print("成功发送至 ETL")
 
 
     def schedule(self):
     def schedule(self):
@@ -169,12 +169,12 @@ class HaoKanVideoAccount(object):
 if __name__ == "__main__":
 if __name__ == "__main__":
     select_user_sql = f"""select * from crawler_user_v3 where source = 'haokanshipin';"""
     select_user_sql = f"""select * from crawler_user_v3 where source = 'haokanshipin';"""
     user_list = MysqlHelper.get_values("author", "haokanshipin", select_user_sql, "prod", action="")
     user_list = MysqlHelper.get_values("author", "haokanshipin", select_user_sql, "prod", action="")
-    print(json.dumps(user_list[0], ensure_ascii=False, indent=4))
+    print(json.dumps(user_list[1], ensure_ascii=False, indent=4))
     T = HaoKanVideoAccount(
     T = HaoKanVideoAccount(
         platform="haokanshipin",
         platform="haokanshipin",
         mode="author",
         mode="author",
         rule_dict={},
         rule_dict={},
-        user_dict=user_list[0],
+        user_dict=user_list[1],
         env="prod",
         env="prod",
     )
     )
     T.schedule()
     T.schedule()

+ 12 - 12
shipinhao/shipinhao_author/shipinhao_scheduling.py

@@ -135,27 +135,27 @@ class ShiPinHaoAccount:
 
 
     def get_account_videos(self):
     def get_account_videos(self):
         # 一个账号最多抓取 30 条数据
         # 一个账号最多抓取 30 条数据
-        self.get_token_from_mysql()
         user_id = self.get_account_id()
         user_id = self.get_account_id()
         if user_id:
         if user_id:
             url = "https://mp.weixin.qq.com/cgi-bin/videosnap"
             url = "https://mp.weixin.qq.com/cgi-bin/videosnap"
-            headers = {
-                "authority": "mp.weixin.qq.com",
-                "accept": "*/*",
-                "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
-                "cookie": self.cookie,
-                "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN".format(
-                    self.token
-                ),
-                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
-                "x-requested-with": "XMLHttpRequest",
-            }
             buffer = ""  # 翻页指示器
             buffer = ""  # 翻页指示器
             while True:
             while True:
                 if self.download_cnt >= int(
                 if self.download_cnt >= int(
                     self.rule_dict.get("videos_cnt", {}).get("min", 30)
                     self.rule_dict.get("videos_cnt", {}).get("min", 30)
                 ):
                 ):
                     return
                     return
+                self.get_token_from_mysql()
+                headers = {
+                    "authority": "mp.weixin.qq.com",
+                    "accept": "*/*",
+                    "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
+                    "cookie": self.cookie,
+                    "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN".format(
+                        self.token
+                    ),
+                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+                    "x-requested-with": "XMLHttpRequest",
+                }
                 params = {
                 params = {
                     "action": "get_feed_list",
                     "action": "get_feed_list",
                     "username": user_id,
                     "username": user_id,

部分文件因为文件数量过多而无法显示