wangkun 2 роки тому
батько
коміт
447428eeea

+ 13 - 12
common/publish.py

@@ -214,22 +214,27 @@ class Publish:
     def crawlersrccode(cls, crawler):
     def crawlersrccode(cls, crawler):
         if crawler == 'youtube':
         if crawler == 'youtube':
             return 'YOUTUBE'
             return 'YOUTUBE'
-        elif crawler == 'kanyikan':
-            return 'KANYIKAN'
         elif crawler == "kuaishou":
         elif crawler == "kuaishou":
             return "KUAISHOU_XCX"
             return "KUAISHOU_XCX"
-        elif crawler == "weishi":
-            return "WEISHI"
         elif crawler == "xiaoniangao":
         elif crawler == "xiaoniangao":
             return "XIAONIANGAO_XCX"
             return "XIAONIANGAO_XCX"
+        elif crawler == "gongzhonghao":
+            return "GONGZHONGHAO_XINXIN"
+        elif crawler == 'xigua':
+            return 'XIGUA'
+        elif crawler == 'weixinzhishu':
+            return 'WEIXINZHISHU'
+        elif crawler == "douyin":
+            return "DOUYIN"
+
+        elif crawler == 'kanyikan':
+            return 'KANYIKAN'
+        elif crawler == "weishi":
+            return "WEISHI"
         elif crawler == "benshanzhufu":
         elif crawler == "benshanzhufu":
             return "BENSHANZHUFU"
             return "BENSHANZHUFU"
-        elif crawler == "gongzhonghao_xinxin":
-            return "GONGZHONGHAO_XINXIN"
         elif crawler == 'shipinhao':
         elif crawler == 'shipinhao':
             return 'SHIPINHAO_XCX'
             return 'SHIPINHAO_XCX'
-        elif crawler == 'xigua':
-            return 'XIGUA'
         elif crawler == 'zhihu':
         elif crawler == 'zhihu':
             return 'ZHIHU'
             return 'ZHIHU'
         elif crawler == 'jixiangxingfu':
         elif crawler == 'jixiangxingfu':
@@ -250,10 +255,6 @@ class Publish:
             return 'SHENGSHENGYINGYIN'
             return 'SHENGSHENGYINGYIN'
         elif crawler == 'ganggangdouchuan':
         elif crawler == 'ganggangdouchuan':
             return 'GANGGANGDOUCHUAN'
             return 'GANGGANGDOUCHUAN'
-        elif crawler == 'gongzhonghao_xinxin':
-            return 'GONGZHONGHAO_XINXIN'
-        elif crawler == 'weixinzhishu':
-            return 'WEIXINZHISHU'
         else:
         else:
             return "CRAWLER"
             return "CRAWLER"
 
 

+ 4 - 4
common/scheduling_db.py

@@ -23,7 +23,7 @@ class MysqlHelper:
                     passwd="crawler123456@",  # mysql用户登录密码
                     passwd="crawler123456@",  # mysql用户登录密码
                     db="piaoquan-crawler",  # 数据库名
                     db="piaoquan-crawler",  # 数据库名
                     # 如果数据库里面的文本是utf8编码的,charset指定是utf8
                     # 如果数据库里面的文本是utf8编码的,charset指定是utf8
-                    charset="utf8")
+                    charset="utf8mb4")
             else:
             else:
                 # 创建一个 Connection 对象,代表了一个数据库连接
                 # 创建一个 Connection 对象,代表了一个数据库连接
                 connection = pymysql.connect(
                 connection = pymysql.connect(
@@ -34,7 +34,7 @@ class MysqlHelper:
                     passwd="crawler123456@",  # mysql用户登录密码
                     passwd="crawler123456@",  # mysql用户登录密码
                     db="piaoquan-crawler",  # 数据库名
                     db="piaoquan-crawler",  # 数据库名
                     # 如果数据库里面的文本是utf8编码的,charset指定是utf8
                     # 如果数据库里面的文本是utf8编码的,charset指定是utf8
-                    charset="utf8")
+                    charset="utf8mb4")
         elif env == 'prod':
         elif env == 'prod':
             # 创建一个 Connection 对象,代表了一个数据库连接
             # 创建一个 Connection 对象,代表了一个数据库连接
             connection = pymysql.connect(
             connection = pymysql.connect(
@@ -45,7 +45,7 @@ class MysqlHelper:
                 passwd="crawler123456@",  # mysql用户登录密码
                 passwd="crawler123456@",  # mysql用户登录密码
                 db="piaoquan-crawler",  # 数据库名
                 db="piaoquan-crawler",  # 数据库名
                 # 如果数据库里面的文本是utf8编码的,charset指定是utf8
                 # 如果数据库里面的文本是utf8编码的,charset指定是utf8
-                charset="utf8")
+                charset="utf8mb4")
         else:
         else:
             # 创建一个 Connection 对象,代表了一个数据库连接
             # 创建一个 Connection 对象,代表了一个数据库连接
             connection = pymysql.connect(
             connection = pymysql.connect(
@@ -56,7 +56,7 @@ class MysqlHelper:
                 passwd="crawler123456@",  # mysql用户登录密码
                 passwd="crawler123456@",  # mysql用户登录密码
                 db="piaoquan-crawler",  # 数据库名
                 db="piaoquan-crawler",  # 数据库名
                 # 如果数据库里面的文本是utf8编码的,charset指定是utf8
                 # 如果数据库里面的文本是utf8编码的,charset指定是utf8
-                charset="utf8")
+                charset="utf8mb4")
 
 
         return connection
         return connection
 
 

+ 1 - 1
gongzhonghao/gongzhonghao_follow/gongzhonghao_follow.py

@@ -532,7 +532,7 @@ class GongzhonghaoFollow:
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
     GongzhonghaoFollow.get_token(log_type="follow", crawler="gongzhonghao")
     GongzhonghaoFollow.get_token(log_type="follow", crawler="gongzhonghao")
-    # GongzhonghaoFollow.get_users()
+    GongzhonghaoFollow.get_users()
     # GongzhonghaoFollow.get_videoList(log_type="follow",
     # GongzhonghaoFollow.get_videoList(log_type="follow",
     #                                  crawler="gongzhonghao",
     #                                  crawler="gongzhonghao",
     #                                  user="香音难忘",
     #                                  user="香音难忘",

+ 27 - 28
xigua/xigua_recommend/demo.py

@@ -1,28 +1,27 @@
-logs = {'level': 'INFO',
-        'message': '{"message":{'
-                   '"method":"Network.requestWillBeSent",'
-                   '"params":{'
-                   '"documentURL":"https://www.ixigua.com/?wid_try=1",'
-                   '"frameId":"698D690DBD747CCE87288D66C4A8A45C",'
-                   '"hasUserGesture":false,'
-                   '"initiator":{"stack":{"callFrames":[{"columnNumber":48473,"functionName":"",'
-                   '"lineNumber":4,'
-                   '"scriptId":"21",'
-                   '"url":"https://lf-cdn-tos.bytescm.com/obj/static/secsdk/secsdk-lastest.umd.js"}]},'
-                   '"type":"script"},'
-                   '"loaderId":"51BC9A78F355D630AB0CFCAEC204D50A",'
-                   '"redirectHasExtraInfo":false,'
-                   '"request":{'
-                   '"hasPostData":true,'
-                   '"headers":{'
-                   '"Accept":"application/json,'
-                   ' text/plain, */*",'
-                   '"Content-Type":"application/json","Referer":"https://www.ixigua.com/?wid_try=1",'
-                   '"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36",'
-                   '"sec-ch-ua":"\\"Chromium\\";v=\\"112\\",'
-                   ' \\"Google Chrome\\";v=\\"112\\",'
-                   ' \\"Not:A-Brand\\";v=\\"99\\"",'
-                   '"sec-ch-ua-mobile":"?0","sec-ch-ua-platform":"\\"macOS\\"","x-secsdk-csrf-token":"00010000000129b3d18037305ef04702bb32df2f7f97e19d50b9c5129deb08d9bee0fd9dc68c1754743c2ea56cf0"},"initialPriority":"High","isSameSite":true,"method":"POST","mixedContentType":"none","postData":"{\\"r\\":true,\\"d\\":[{\\"k\\":\\"page_sub_channel\\",\\"v\\":\\"{\\\\\\"datetime\\\\\\":1681096362436,\\\\\\"channel_name\\\\\\":\\\\\\"home\\\\\\"}\\"}]}","postDataEntries":[{"bytes":"eyJyIjp0cnVlLCJkIjpbeyJrIjoicGFnZV9zdWJfY2hhbm5lbCIsInYiOiJ7XCJkYXRldGltZVwiOjE2ODEwOTYzNjI0MzYsXCJjaGFubmVsX25hbWVcIjpcImhvbWVcIn0ifV19"}],"referrerPolicy":"strict-origin-when-cross-origin",'
-                   '"url":"https://www.ixigua.com/at/log/c?aid=1768&msToken=&X-Bogus=DFSzswSOGSy6Sh-htVAcJR/F6q98&_signature=_02B4Z6wo00001O38UmAAAIDBlTK5ZUm9hMDt7HbAAF9Se5"},"requestId":"1969.180",'
-                   '"timestamp":3139.795561,"type":"XHR","wallTime":1681096362.632288}},"webview":"698D690DBD747CCE87288D66C4A8A45C"}',
-        'timestamp': 1681096362632}
+import os
+import sys
+
+sys.path.append(os.getcwd())
+from common.scheduling_db import MysqlHelper
+
+
+class Demo:
+    @classmethod
+    def get_config(cls, log_type, crawler, env):
+        select_sql = f"""select * from crawler_config where source="xigua" """
+        contents = MysqlHelper.get_values(log_type, crawler, select_sql, env, action='')
+        for content in contents:
+            config = content['config']
+            print(type(config))
+            print(config)
+            print(type(eval(config)))
+            token = eval(config['token'])
+            # emoji = config['emoji']
+            # filter = config['filter']
+            print(f"token:{token}")
+            # print(f"emoji:{emoji}")
+            # print(f"filter:{filter}")
+
+
+if __name__ == "__main__":
+    Demo.get_config("demo", "xiaoniangao", "dev")

+ 25 - 0
xigua/xigua_recommend/xigua_recommend.py

@@ -548,6 +548,27 @@ class XiguaRecommend:
         except Exception as e:
         except Exception as e:
             Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
             Common.logger(log_type, crawler).error(f'get_video_url:{e}\n')
 
 
+    # 过滤词库
+    @classmethod
+    def filter_words(cls, log_type, crawler):
+        try:
+            while True:
+                filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
+                if filter_words_sheet is None:
+                    Common.logger(log_type, crawler).warning(
+                        f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
+                    continue
+                filter_words_list = []
+                for x in filter_words_sheet:
+                    for y in x:
+                        if y is None:
+                            pass
+                        else:
+                            filter_words_list.append(y)
+                return filter_words_list
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
+
     @classmethod
     @classmethod
     def get_videolist(cls, log_type, crawler, env):
     def get_videolist(cls, log_type, crawler, env):
         while True:
         while True:
@@ -687,6 +708,9 @@ class XiguaRecommend:
     def download_publish(cls, log_type, crawler, video_dict):
     def download_publish(cls, log_type, crawler, video_dict):
         if video_dict['video_id'] in [y for x in Feishu.get_values_batch(log_type, crawler, "1iKGF1") for y in x]:
         if video_dict['video_id'] in [y for x in Feishu.get_values_batch(log_type, crawler, "1iKGF1") for y in x]:
             Common.logger(log_type, crawler).info("视频已存在\n")
             Common.logger(log_type, crawler).info("视频已存在\n")
+        elif any(word if word in video_dict['video_title'] else False for word in
+                 cls.filter_words(log_type, crawler)) is True:
+            Common.logger(log_type, crawler).info('标题已中过滤词\n')
         else:
         else:
             Feishu.insert_columns(log_type, crawler, "1iKGF1", "ROWS", 1, 2)
             Feishu.insert_columns(log_type, crawler, "1iKGF1", "ROWS", 1, 2)
             values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
             values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
@@ -719,4 +743,5 @@ if __name__ == "__main__":
     # XiguaRecommend.get_signature("recommend", "xigua", "dev")
     # XiguaRecommend.get_signature("recommend", "xigua", "dev")
     XiguaRecommend.get_videolist("recommend", "xigua", "dev")
     XiguaRecommend.get_videolist("recommend", "xigua", "dev")
     # print(XiguaRecommend.get_video_url("recommend", "xigua", "7218171653242094139"))
     # print(XiguaRecommend.get_video_url("recommend", "xigua", "7218171653242094139"))
+    # print(XiguaRecommend.filter_words("recommend", "xigua"))
     pass
     pass