wangkun vor 2 Jahren
Ursprung
Commit
7bf2b06d74
4 geänderte Dateien mit 52 neuen und 16 gelöschten Zeilen
  1. 1 0
      README.MD
  2. 13 12
      common/common.py
  3. 25 0
      xigua/xigua_main/run_xg_search_dev.py
  4. 13 4
      xigua/xigua_search/xigua_search_scheduling.py

+ 1 - 0
README.MD

@@ -200,5 +200,6 @@ jieba==0.42.1
 # pip3 install workalendar
 workalendar==17.0.0
 # pip3 install aliyun_python_sdk
+# pip3 install -U aliyun-log-python-sdk
 aliyun_python_sdk==2.2.0
 ```

+ 13 - 12
common/common.py

@@ -42,7 +42,9 @@ class Common:
 
         # 日志文件名
         # log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + f'-{crawler}-{log_type}.log'
-        log_name = str(date.today()) + f"-{crawler}-{log_type}.log"
+        # log_name = datetime.datetime.now().strftime('%Y-%m-%d') + f'-{crawler}-{log_type}.log'
+        log_name = datetime.datetime.now().strftime('%Y-%m-%d') + '-' + crawler + '-' + log_type + '.log'
+        # log_name = str(date.today()) + f"-{crawler}-{log_type}.log"
 
         # 日志不打印到控制台
         logger.remove(handler_id=None)
@@ -82,7 +84,6 @@ class Common:
             endpoint = 'cn-hangzhou-intranet.log.aliyuncs.com'
 
         # 创建 LogClient 实例
-        # print("创建 LogClient 实例")
         client = LogClient(endpoint, accessKeyId, accessKey)
 
 
@@ -90,28 +91,28 @@ class Common:
             message = message.replace('\r', ' ')
         if '\n' in message:
             message = message.replace('\n', ' ')
-        # print(f"message:{message}")
         log_group = []
         log_item = LogItem()
-        # print(f"log_item:{type(log_item), log_item}")
-        contents = [(f"{crawler}-{log_type}", message)]
-        # print(f"contents:{type(contents), contents}")
+
+        """
+        生成日志消息体格式,例如
+        crawler:xigua
+        message:不满足抓取规则 
+        mode:search
+        timestamp:1686656143
+        """
+        contents = [(f"crawler", str(crawler)), (f"mode", str(log_type)), (f"message", str(message)), ("timestamp", str(int(time.time())))]
         log_item.set_contents(contents)
         log_group.append(log_item)
-        # print(f"log_group:{type(log_group), log_group}")
 
         # 写入日志
-        # print("开始PutLogsRequest")
         request = PutLogsRequest(project=project,
                                  logstore=logstore,
                                  topic="",
                                  source="",
                                  logitems=log_group,
                                  compress=False)
-        # print(f"request:{request}")
-        # print("put_logs...")
         client.put_logs(request)
-        # print("put_logs...done")
 
     # 清除日志,保留最近 10 个文件
     @classmethod
@@ -398,5 +399,5 @@ class Common:
 
 
 if __name__ == "__main__":
-    Common.tunnel_proxies()
+
     pass

+ 25 - 0
xigua/xigua_main/run_xg_search_dev.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/13
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from xigua.xigua_search.xigua_search_scheduling import XiguasearchScheduling
+
+
+def xigua_search_main(log_type, crawler, env):  # Run one Xigua keyword-search crawl round using fixed dev rules and a single test account.
+    Common.logger(log_type, crawler).info("开始抓取西瓜搜索\n")  # start-of-round marker at info level on the logger returned by Common.logger
+    Common.logging(log_type, crawler, env, "开始抓取西瓜搜索\n")  # same message through Common.logging, which additionally takes env (presumably the remote log sink; TODO confirm)
+    XiguasearchScheduling.get_search_videos(log_type=log_type,
+                                            crawler=crawler,
+                                            rule_dict={"play_cnt":{"min":8000,"max":0},"duration":{"min":60,"max":600},"period":{"min":365,"max":365},"videos_cnt":{"min":30,"max":0}},  # hard-coded min/max thresholds; units and the meaning of max=0 are decided by the callee (not visible here)
+                                            user_list=[{"uid": 6267140, "source": "xigua", "link": "退休补贴", "nick_name": "西瓜搜索测试账号", "avatar_url": "http://rescdn.yishihui.com/user/default/avatar/live/1616555578819_u=1922778943,2660693611&fm=26&gp=0.jpg", "mode": "search"}],  # one hard-coded test account; "link" presumably carries the search keyword (verify against get_search_videos)
+                                            env=env)
+    Common.del_logs(log_type, crawler)  # log cleanup after the crawl; presumably prunes old log files (retention is defined in Common.del_logs)
+    Common.logger(log_type, crawler).info("抓取一轮结束\n")  # end-of-round marker, mirroring the start marker
+    Common.logging(log_type, crawler, env, "抓取一轮结束\n")  # end-of-round marker through Common.logging as well
+
+
+if __name__ == "__main__":  # script entry point: only runs when this file is executed directly
+    xigua_search_main("search", "xigua", "dev")  # log_type="search", crawler="xigua", env="dev"

+ 13 - 4
xigua/xigua_search/xigua_search_scheduling.py

@@ -22,7 +22,7 @@ from common.scheduling_db import MysqlHelper
 from common.common import Common
 from common.feishu import Feishu
 from common.publish import Publish
-from common.public import get_config_from_mysql, download_rule
+from common.public import get_config_from_mysql, download_rule, get_title_score
 from common.userAgent import get_random_user_agent
 
 
@@ -645,11 +645,19 @@ class XiguasearchScheduling:
                         Common.logger(log_type, crawler).info('视频已下载\n')
                         Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
+                        title_score = get_title_score(log_type, "kuaishou", "16QspO", "0usaDk", video_dict["video_title"])
+                        if title_score <= 0.3:
+                            Common.logger(log_type, crawler).info(f"权重分:{title_score}<=0.3\n")
+                            Common.logging(log_type, crawler, env, f"权重分:{title_score}<=0.3\n")
+                            continue
+                        Common.logger(log_type, crawler).info(f"权重分:{title_score}>0.3\n")
+                        Common.logging(log_type, crawler, env, f"权重分:{title_score}>0.3\n")
                         cls.download_publish(log_type=log_type,
                                              crawler=crawler,
                                              user_dict=user_dict,
                                              video_dict=video_dict,
                                              rule_dict=rule_dict,
+                                             title_score=title_score,
                                              env=env)
                 except Exception as e:
                     Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
@@ -668,7 +676,7 @@ class XiguasearchScheduling:
 
     # 下载 / 上传
     @classmethod
-    def download_publish(cls, log_type, crawler, user_dict, video_dict, rule_dict, env):
+    def download_publish(cls, log_type, crawler, user_dict, video_dict, rule_dict, title_score, env):
 
         Common.download_method(log_type=log_type, crawler=crawler, text='xigua_video',
                                title=video_dict['video_title'], url=video_dict['video_url'])
@@ -769,7 +777,8 @@ class XiguasearchScheduling:
 
         # 视频信息写入飞书
         Feishu.insert_columns(log_type, crawler, "BUNvGC", "ROWS", 1, 2)
-        values = [[user_dict["link"],
+        values = [[title_score,
+            user_dict["link"],
             time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))),
             "关键词搜索",
             video_dict['video_title'],
@@ -790,7 +799,7 @@ class XiguasearchScheduling:
             video_dict['video_url'],
             video_dict['audio_url']]]
         time.sleep(0.5)
-        Feishu.update_values(log_type, crawler, "BUNvGC", "E2:Z2", values)
+        Feishu.update_values(log_type, crawler, "BUNvGC", "D2:Z2", values)
         Common.logger(log_type, crawler).info('视频信息写入飞书完成\n')
         Common.logging(log_type, crawler, env, '视频信息写入飞书完成\n')