Bläddra i källkod

祝福圈子线下 代码优化

piaoquan 1 år sedan
förälder
incheckning
84278f23f1

+ 2 - 115
main/process_offline.sh

@@ -29,100 +29,10 @@ else
   echo "$(date "+%Y-%m-%d %H:%M:%S") Appium 进程状态正常" >> ${log_path}
 fi
 
-# 吉祥幸福
-if [[ "$time" > "00:00:59"  &&  "$time" < "01:59:59" ]];then
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 开始启动 吉祥幸福 爬虫脚本任务" >> ${log_path}
-  ps aux | grep run_ppqsift | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_zfqz | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_zmyx | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngplus | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngrule | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps -ef | grep "run_jixiangxingfu_recommend.py" | grep -v "grep"
-  if [ "$?" -eq 1 ];then
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 吉祥幸福爬虫, 异常停止, 正在重启!" >> ${log_path}
-    adb forward --remove-all
-    cd ${piaoquan_crawler_dir}
-    nohup python3 -u jixiangxingfu/jixiangxingfu_main/run_jixiangxingfu_recommend.py --log_type="recommend" --crawler="jixiangxingfu" --env=${env} >>jixiangxingfu/logs/nohup-recommend.log 2>&1 &
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-  else
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 吉祥幸福 进程状态正常" >> ${log_path}
-  fi
-else
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 吉祥幸福 爬虫脚本任务结束" >> ${log_path}
-fi
 
 
-# 众妙音信-new
-if [[ "$time" > "02:00:00"  &&  "$time" < "03:59:59" ]];then
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 开始启动 众妙音信-new 爬虫脚本任务" >> ${log_path}
-  ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_zfqz | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngplus | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngrule | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps -ef | grep "run_zmyx_recommend.py" | grep -v "grep"
-  if [ "$?" -eq 1 ];then
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 众妙音信-new小程序爬虫, 异常停止, 正在重启!" >> ${log_path}
-    adb forward --remove-all
-    cd ${piaoquan_crawler_dir}
-    nohup python3 -u zhongmiaoyinxin/zhongmiaoyinxin_main/run_zmyx_recommend.py --log_type="recommend" --crawler="zhongmiaoyinxin" --env=${env} >>zhongmiaoyinxin/logs/nohup-recommend.log 2>&1 &
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-  else
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 众妙音信-new小程序爬虫, 进程状态正常" >> ${log_path}
-  fi
-
-else
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 众妙音信 爬虫脚本任务结束" >> ${log_path}
-fi
-
-# 漂漂圈
-if [[ "$time" > "04:00:00"  &&  "$time" < "06:59:59" ]];then
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 开始启动 漂漂圈 爬虫脚本任务" >> ${log_path}
-  ps aux | grep run_ppqsift | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngplus | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngrule | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_zmyx | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_zfqz | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps -ef | grep "run_ppq_recommend.py" | grep -v "grep"
-  if [ "$?" -eq 1 ];then
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 漂漂圈 小程序爬虫, 异常停止, 正在重启!" >> ${log_path}
-    adb forward --remove-all
-    cd ${piaoquan_crawler_dir}
-    nohup python3 -u piaopiaoquan/piaopiaoquan_main/run_ppq_recommend.py --log_type="recommend" --crawler="piaopiaoquan" --env=${env} >>piaopiaoquan/logs/nohup-recommend.log 2>&1 &
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-  else
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 漂漂圈 程序爬虫, 进程状态正常" >> ${log_path}
-  fi
-
-else
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 漂漂圈 爬虫脚本任务结束" >> ${log_path}
-fi
-
-# 漂漂圈-精选
-if [[ "$time" > "07:00:00"  &&  "$time" < "09:59:59" ]];then
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 开始启动 漂漂圈精选 爬虫脚本任务" >> ${log_path}
-  ps aux | grep run_ppq | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngplus | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngrule | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_jixiangxingfu | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_zmyx | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_zfqz | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps -ef | grep "run_ppqsift_recommend.py" | grep -v "grep"
-  if [ "$?" -eq 1 ];then
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 漂漂圈精选 小程序爬虫, 异常停止, 正在重启!" >> ${log_path}
-    adb forward --remove-all
-    cd ${piaoquan_crawler_dir}
-    nohup python3 -u piaopiaoquan/piaopiaoquan_main/run_ppqsift_recommend.py --log_type="recommend" --crawler="piaopiaoquan" --env=${env} >>piaopiaoquan/logs/nohup-recommend.log 2>&1 &
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-  else
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 漂漂圈精选 程序爬虫, 进程状态正常" >> ${log_path}
-  fi
-else
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 漂漂圈精选 爬虫脚本任务结束" >> ${log_path}
-fi
-
 # 小年糕+
-if [[ "$time" > "10:00:00"  &&  "$time" < "14:59:59" || "$time" > "18:00:00"  &&  "$time" < "20:59:59" ]];then
+if [[ "$time" > "09:00:00"  &&  "$time" < "16:59:59" || "$time" > "20:00:00"  &&  "$time" < "23:59:59" ]];then
   echo "$(date "+%Y-%m-%d %H:%M:%S") 开始启动 小年糕+ 爬虫脚本任务" >> ${log_path}
   ps aux | grep run_ppqsift | grep -v grep | awk '{print $2}' | xargs kill -9
   ps aux | grep run_xngrule | grep -v grep | awk '{print $2}' | xargs kill -9
@@ -146,7 +56,7 @@ else
 fi
 
 # 祝福圈子
-if [[ "$time" > "22:00:00"  &&  "$time" < "23:59:59" || "$time" > "15:00:00"  &&  "$time" < "17:59:59" ]];then
+if [[ "$time" > "00:05:00"  &&  "$time" < "08:59:59" || "$time" > "17:00:00"  &&  "$time" < "19:59:59" ]];then
   echo "$(date "+%Y-%m-%d %H:%M:%S") 开始启动 祝福圈子 爬虫脚本任务" >> ${log_path}
   ps aux | grep run_xngplus | grep -v grep | awk '{print $2}' | xargs kill -9
   ps aux | grep run_xngrule | grep -v grep | awk '{print $2}' | xargs kill -9
@@ -167,29 +77,6 @@ else
   echo "$(date "+%Y-%m-%d %H:%M:%S") 祝福圈子 爬虫脚本任务结束" >> ${log_path}
 fi
 
-
-
-# 小年糕-rule
-if [[ "$time" > "21:00:00"  &&  "$time" < "21:59:59" ]];then
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 开始启动 小年糕-rule 爬虫脚本任务" >> ${log_path}
-  ps aux | grep run_zmyx | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps aux | grep run_xngplus | grep -v grep | awk '{print $2}' | xargs kill -9
-  ps -ef | grep "run_xngrule_recommend.py" | grep -v "grep"
-  if [ "$?" -eq 1 ];then
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 小年糕-rule 小程序爬虫, 异常停止, 正在重启!" >> ${log_path}
-    adb forward --remove-all
-    cd ${piaoquan_crawler_dir}
-    nohup python3 -u xiaoniangaoplus/xiaoniangaoplus_main/run_xngrule_recommend.py --log_type="recommend" --crawler="xiaoniangaoplus" --env=${env} >>xiaoniangaoplus/logs/nohup-recommend.log 2>&1 &
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 重启完成!" >> ${log_path}
-  else
-    echo "$(date "+%Y-%m-%d %H:%M:%S") 小年糕-rule 程序爬虫, 进程状态正常" >> ${log_path}
-  fi
-
-else
-  echo "$(date "+%Y-%m-%d %H:%M:%S") 小年糕-rule 爬虫脚本任务结束" >> ${log_path}
-fi
-
-
 # 删除日志
 echo "$(date "+%Y-%m-%d %H:%M:%S") 开始清理 10 天前的日志文件" >> ${log_path}
 find ${piaoquan_crawler_dir}main/main_logs/ -mtime +10 -name "*.log" -exec rm -rf {} \;

+ 41 - 6
xiaoniangaoplus/xiaoniangaoplus/xiaoniangao_plus_scheduling2.py

@@ -7,6 +7,7 @@ import random
 import sys
 import time
 import uuid
+from datetime import datetime, timedelta
 from hashlib import md5
 
 import requests
@@ -188,10 +189,19 @@ class XiaoNianGaoPlusRecommend:
             except NoSuchElementException:
                 time.sleep(1)
 
-    def repeat_video(self, video_id):
-        sql = f""" select * from crawler_video where platform in ("众妙音信", "刚刚都传", "吉祥幸福", "知青天天看", "zhufuquanzi", "祝福圈子", "haitunzhufu", "海豚祝福", "小年糕") and out_video_id="{video_id}"; """
-        repeat_video = MysqlHelper.get_values(self.log_type, self.crawler, sql, self.env)
-        return len(repeat_video)
+    def repeat_video(self, out_video_id):
+        current_time = datetime.now()
+        previous_day = current_time - timedelta(days=7)
+        formatted_time = previous_day.strftime("%Y-%m-%d")
+        sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_video_id}" and create_time <= '{formatted_time}'; """
+        Common.logger(self.log_type, self.crawler).info(
+            f"sql{sql}")
+        repeat_video = MysqlHelper.get_values(
+            log_type=self.log_type, crawler=self.platform, env=self.env, sql=sql, action=""
+        )
+        if repeat_video:
+            return False
+        return True
 
     def swipe_up(self):
         self.search_elements('//*[@class="list-list--list"]')
@@ -224,7 +234,6 @@ class XiaoNianGaoPlusRecommend:
         soup = BeautifulSoup(page_source, 'html.parser')
         soup.prettify()
         video_list = soup.findAll(name="wx-view", attrs={"class": "expose--adapt-parent"})
-        index = index + 1
         element_list = [i for i in video_list][index:]
         return element_list[0]
 
@@ -280,7 +289,15 @@ class XiaoNianGaoPlusRecommend:
             comment_cnt = int(comment_str)
         out_video_id = md5(video_title.encode('utf8')).hexdigest()
         out_user_id = md5(user_name.encode('utf8')).hexdigest()
-
+        repeat_id = self.repeat_video(out_video_id)
+        Common.logger(self.log_type, self.crawler).info(
+            f"查询{repeat_id}")
+        if False == repeat_id:
+            num = time.time()
+            out_video_id = out_video_id + str(num)
+            Common.logger(self.log_type, self.crawler).info(
+                f"新id{out_video_id}")
+        Common.logger(self.log_type, self.crawler).info(f"数据统计-----标题:{video_title},播放量:{play_cnt},点赞:{like_cnt},评论:{comment_cnt}")
         video_dict = {
             "video_title": video_title,
             "video_id": out_video_id,
@@ -302,6 +319,15 @@ class XiaoNianGaoPlusRecommend:
             "cover_url": cover_url,
             "session": f"xiaoniangao-{int(time.time())}"
         }
+        AliyunLogger.logging(
+            code="1001",
+            platform=self.platform,
+            mode=self.log_type,
+            env=self.env,
+            trace_id=trace_id,
+            message="扫描到一条视频",
+            data=video_dict
+        )
         pipeline = PiaoQuanPipeline(
             platform=self.crawler,
             mode=self.log_type,
@@ -338,6 +364,15 @@ class XiaoNianGaoPlusRecommend:
             video_dict["user_id"] = self.our_uid
             video_dict["publish_time"] = video_dict["publish_time_str"]
             self.mq.send_msg(video_dict)
+            AliyunLogger.logging(
+                code="1002",
+                platform=self.platform,
+                mode=self.log_type,
+                env=self.env,
+                trace_id=trace_id,
+                message="发送到ETL成功",
+                data=video_dict
+            )
             # print(video_dict)
             self.download_cnt += 1
             self.driver.press_keycode(AndroidKey.BACK)

+ 63 - 34
zhufuquanzi/zhufuquanzi_recommend/zhufuquanzi_recommend_new2.py

@@ -6,6 +6,7 @@ import random
 import sys
 import time
 import uuid
+from datetime import datetime, timedelta
 from hashlib import md5
 
 from appium import webdriver
@@ -176,7 +177,6 @@ class ZFQZRecommendNew:
         soup = BeautifulSoup(page_source, 'html.parser')
         soup.prettify()
         video_list = soup.findAll(name="wx-view", attrs={"class": "expose--adapt-parent"})
-        index = index + 1
         element_list = [i for i in video_list][index:]
         return element_list[0]
 
@@ -186,7 +186,7 @@ class ZFQZRecommendNew:
 
         self.check_to_applet(xpath='//*[@class="tags--tag tags--tag-0 tags--checked"]')
         time.sleep(1)
-        name = ["推荐", "搞笑", "大雪", "亲子"]
+        name = ["推荐", "春节"]
         selected_text = random.choice(name)
         try:
             self.driver.find_element(By.XPATH, f"//wx-button[contains(., '{selected_text}')]").click()
@@ -265,6 +265,19 @@ class ZFQZRecommendNew:
             Common.logger(self.log_type, self.crawler).info(f"{video_url_elements[0].get_attribute('src')}")
             return video_url_elements[0].get_attribute('src')
 
+
+    def repeat_video(self,out_video_id):
+        current_time = datetime.now()
+        previous_day = current_time - timedelta(days=7)
+        formatted_time = previous_day.strftime("%Y-%m-%d")
+        sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_video_id}" and create_time <= '{formatted_time}'; """
+        repeat_video = MysqlHelper.get_values(
+            log_type=self.log_type, crawler=self.platform, env=self.env, sql=sql, action=""
+        )
+        if repeat_video:
+            return False
+        return True
+
     def get_video_info_2(self, video_element):
         Common.logger(self.log_type, self.crawler).info(f"本轮已抓取{self.download_cnt}条视频\n")
         # Common.logging(self.log_type, self.crawler, self.env, f"本轮已抓取{self.download_cnt}条视频\n")
@@ -277,39 +290,34 @@ class ZFQZRecommendNew:
         Common.logger(self.log_type, self.crawler).info(f"第{self.count}条视频")
         # 获取 trace_id, 并且把该 id 当做视频生命周期唯一索引
         trace_id = self.crawler + str(uuid.uuid1())
-        AliyunLogger.logging(
-            code="1001",
-            platform=self.platform,
-            mode=self.log_type,
-            env=self.env,
-            trace_id=trace_id,
-            message="扫描到一条视频",
-        )
         video_title = video_element.find("wx-view", class_="dynamic--title").text
         play_str = video_element.find("wx-view", class_="dynamic--views").text
-        like_str = video_element.findAll("wx-view", class_="dynamic--commerce-btn-text")[0].text
-        comment_str = video_element.findAll("wx-view", class_="dynamic--commerce-btn-text")[1].text
+        # like_str = video_element.findAll("wx-view", class_="dynamic--commerce-btn-text")[0].text
+        # comment_str = video_element.findAll("wx-view", class_="dynamic--commerce-btn-text")[1].text
         duration_str = video_element.find("wx-view", class_="dynamic--duration").text
         user_name = video_element.find("wx-view", class_="dynamic--nick-top").text
         avatar_url = video_element.find("wx-image", class_="avatar--avatar")["src"]
         cover_url = video_element.find("wx-image", class_="dynamic--bg-image")["src"]
         play_cnt = int(play_str.replace("+", "").replace("次播放", ""))
         duration = int(duration_str.split(":")[0].strip()) * 60 + int(duration_str.split(":")[-1].strip())
-        if "点赞" in like_str:
-            like_cnt = 0
-        elif "万" in like_str:
-            like_cnt = int(like_str.split("万")[0]) * 10000
-        else:
-            like_cnt = int(like_str)
-        if "评论" in comment_str:
-            comment_cnt = 0
-        elif "万" in comment_str:
-            comment_cnt = int(comment_str.split("万")[0]) * 10000
-        else:
-            comment_cnt = int(comment_str)
+        # if "点赞" in like_str:
+        #     like_cnt = 0
+        # elif "万" in like_str:
+        #     like_cnt = int(like_str.split("万")[0]) * 10000
+        # else:
+        #     like_cnt = int(like_str)
+        # if "评论" in comment_str:
+        #     comment_cnt = 0
+        # elif "万" in comment_str:
+        #     comment_cnt = int(comment_str.split("万")[0]) * 10000
+        # else:
+        #     comment_cnt = int(comment_str)
         out_video_id = md5(video_title.encode('utf8')).hexdigest()
         out_user_id = md5(user_name.encode('utf8')).hexdigest()
-
+        repeat_id = self.repeat_video(out_video_id)
+        if False == repeat_id:
+            num = time.time()
+            out_video_id = out_video_id+str(num)
         video_dict = {
             "video_title": video_title,
             "video_id": out_video_id,
@@ -318,10 +326,10 @@ class ZFQZRecommendNew:
             "duration": duration,
             "play_str": play_str,
             "play_cnt": play_cnt,
-            "like_str": like_str,
-            "like_cnt": like_cnt,
-            "comment_cnt": comment_cnt,
-            "share_cnt": 0,
+            "like_str": "",
+            "like_cnt": 50,
+            "comment_cnt": 0,
+            "share_cnt": 50,
             "user_name": user_name,
             "user_id": out_user_id,
             'publish_time_stamp': int(time.time()),
@@ -331,6 +339,15 @@ class ZFQZRecommendNew:
             "cover_url": cover_url,
             "session": f"zhufuquanzi-{int(time.time())}"
         }
+        AliyunLogger.logging(
+            code="1001",
+            platform=self.platform,
+            mode=self.log_type,
+            env=self.env,
+            trace_id=trace_id,
+            message="扫描到一条视频",
+            data=video_dict
+        )
         pipeline = PiaoQuanPipeline(
             platform=self.crawler,
             mode=self.log_type,
@@ -359,6 +376,9 @@ class ZFQZRecommendNew:
                 time.sleep(5)
                 return
             video_dict['video_url'] = video_url
+            video_dict['like_cnt'] = 0
+            video_dict['share_cnt'] = 0
+
             video_dict["platform"] = self.crawler
             video_dict["strategy"] = self.log_type
             video_dict["out_video_id"] = video_dict["video_id"]
@@ -366,6 +386,15 @@ class ZFQZRecommendNew:
             video_dict["user_id"] = self.our_uid
             video_dict["publish_time"] = video_dict["publish_time_str"]
             self.mq.send_msg(video_dict)
+            AliyunLogger.logging(
+                code="1002",
+                platform=self.platform,
+                mode=self.log_type,
+                env=self.env,
+                trace_id=trace_id,
+                message="发送到ETL成功",
+                data=video_dict
+            )
             self.download_cnt += 1
             self.driver.press_keycode(AndroidKey.BACK)
             time.sleep(5)
@@ -373,9 +402,9 @@ class ZFQZRecommendNew:
 
 
 if __name__ == "__main__":
-    rule_dict1 = {"period": {"min": 1, "max": 365},
-                  "duration": {"min": 1, "max": 1800},
-                  "favorite_cnt": {"min": 1, "max": 0},
-                  "videos_cnt": {"min": 1, "max": 20},
-                  "share_cnt": {"min": 1, "max": 0}}
+    rule_dict1 = {"period": {"min": 0, "max": 0},
+                  "duration": {"min": 1, "max": 0},
+                  "favorite_cnt": {"min": 0, "max": 0},
+                  "videos_cnt": {"min": 0, "max": 0},
+                  "share_cnt": {"min": 0, "max": 0}}
     ZFQZRecommendNew("recommend", "zhufuquanzi", "dev", rule_dict1, 6267141)