wangkun 1 سال پیش
والد
کامیت
160623ae5c
3 فایلهای تغییر یافته به همراه 187 افزوده شده و 186 حذف شده
  1. 1 0
      README.MD
  2. 88 88
      shipinhao/shipinhao_search/shipinhao_search.py
  3. 98 98
      shipinhao/shipinhao_search/shipinhao_search_scheduling.py

+ 1 - 0
README.MD

@@ -247,4 +247,5 @@ ps aux | grep run_shipinhao | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep Appium.app | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep xigua_search | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep kanyikan | grep -v grep | awk '{print $2}' | xargs kill -9
 ```

+ 88 - 88
shipinhao/shipinhao_search/shipinhao_search.py

@@ -297,102 +297,102 @@ class ShipinhaoSearch:
 
         index = 0
         while True:
-            # try:
-            if cls.search_elements(driver, '//*[@class="double-rich double-rich_vertical"]') is None:
-                Common.logger(log_type, crawler).info('窗口已销毁\n')
-                return
-
-            Common.logger(log_type, crawler).info('获取视频列表\n')
-            video_elements = cls.search_elements(driver, '//div[@class="vc active__mask"]')
-            if video_elements is None:
-                Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
-                return
-
-            video_element_temp = video_elements[index:]
-            if len(video_element_temp) == 0:
-                Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
-                return
+            try:
+                if cls.search_elements(driver, '//*[@class="double-rich double-rich_vertical"]') is None:
+                    Common.logger(log_type, crawler).info('窗口已销毁\n')
+                    return
 
-            for i, video_element in enumerate(video_element_temp):
-                Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
-                if cls.download_cnt >= cls.videos_cnt(log_type, crawler):
-                    Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
-                    cls.download_cnt = 0
+                Common.logger(log_type, crawler).info('获取视频列表\n')
+                video_elements = cls.search_elements(driver, '//div[@class="vc active__mask"]')
+                if video_elements is None:
+                    Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
                     return
 
-                if video_element is None:
-                    Common.logger(log_type, crawler).info('到底啦~\n')
+                video_element_temp = video_elements[index:]
+                if len(video_element_temp) == 0:
+                    Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
                     return
 
-                cls.i += 1
-                cls.search_elements(driver, '//div[@class="vc active__mask"]')
+                for i, video_element in enumerate(video_element_temp):
+                    Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
+                    if cls.download_cnt >= cls.videos_cnt(log_type, crawler):
+                        Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
+                        cls.download_cnt = 0
+                        return
 
-                Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
-                time.sleep(3)
-                driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
-                                      video_element)
-                if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
-                    Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
-                    return
-                video_title = video_element.find_elements(By.XPATH, '//div[@class="title ellipsis_2"]/*[2]')[index + i].text[:40]
-                video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
-                cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
-                cover_url = cover_url.split('url("')[-1].split('")')[0]
-                duration = video_element.find_elements(By.XPATH, '//div[@class="play-mask__text"]/*[2]')[index+i].text
-                duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
-                user_name = video_element.find_elements(By.XPATH, '//p[@class="vc-source__text"]')[index+i].text
-                avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image vc-source__thumb"]')[index+i].get_attribute('style')
-                avatar_url = avatar_url.split('url("')[-1].split('")')[0]
-                out_video_id = md5(video_title.encode('utf8')).hexdigest()
-                out_user_id = md5(user_name.encode('utf8')).hexdigest()
-
-                video_dict = {
-                    "video_title": video_title,
-                    "video_id": out_video_id,
-                    "play_cnt": 0,
-                    "duration": duration,
-                    "user_name": user_name,
-                    "user_id": out_user_id,
-                    "avatar_url": avatar_url,
-                    "cover_url": cover_url,
-                    "video_url": video_url,
-                    "session": f"shipinhao-search-{int(time.time())}"
-                }
-                for k, v in video_dict.items():
-                    Common.logger(log_type, crawler).info(f"{k}:{v}")
-                if video_title is None or video_url is None:
-                    Common.logger(log_type, crawler).info("无效视频\n")
-                elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
-                    Common.logger(log_type, crawler).info('视频已下载\n')
-                elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
-                    Common.logger(log_type, crawler).info('视频已下载\n')
-                else:
-                    video_element.click()
+                    if video_element is None:
+                        Common.logger(log_type, crawler).info('到底啦~\n')
+                        return
+
+                    cls.i += 1
+                    cls.search_elements(driver, '//div[@class="vc active__mask"]')
+
+                    Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
                     time.sleep(3)
-                    video_info_dict = cls.get_video_info(driver)
-                    video_dict["like_cnt"] = video_info_dict["like_cnt"]
-                    video_dict["share_cnt"] = video_info_dict["share_cnt"]
-                    video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
-                    video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
-                    video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
-                    video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
-                    Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
-                    if cls.download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict) is False:
-                        Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                    driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
+                                          video_element)
+                    if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
+                        Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
+                        return
+                    video_title = video_element.find_elements(By.XPATH, '//div[@class="title ellipsis_2"]/*[2]')[index + i].text[:40]
+                    video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
+                    cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
+                    cover_url = cover_url.split('url("')[-1].split('")')[0]
+                    duration = video_element.find_elements(By.XPATH, '//div[@class="play-mask__text"]/*[2]')[index+i].text
+                    duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
+                    user_name = video_element.find_elements(By.XPATH, '//p[@class="vc-source__text"]')[index+i].text
+                    avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image vc-source__thumb"]')[index+i].get_attribute('style')
+                    avatar_url = avatar_url.split('url("')[-1].split('")')[0]
+                    out_video_id = md5(video_title.encode('utf8')).hexdigest()
+                    out_user_id = md5(user_name.encode('utf8')).hexdigest()
+
+                    video_dict = {
+                        "video_title": video_title,
+                        "video_id": out_video_id,
+                        "play_cnt": 0,
+                        "duration": duration,
+                        "user_name": user_name,
+                        "user_id": out_user_id,
+                        "avatar_url": avatar_url,
+                        "cover_url": cover_url,
+                        "video_url": video_url,
+                        "session": f"shipinhao-search-{int(time.time())}"
+                    }
+                    for k, v in video_dict.items():
+                        Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    if video_title is None or video_url is None:
+                        Common.logger(log_type, crawler).info("无效视频\n")
+                    elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
+                        Common.logger(log_type, crawler).info('视频已下载\n')
+                    elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
+                        Common.logger(log_type, crawler).info('视频已下载\n')
                     else:
-                        cls.download_publish(log_type=log_type,
-                                             crawler=crawler,
-                                             word=word,
-                                             video_dict=video_dict,
-                                             our_uid=our_uid,
-                                             env=env)
-
-            Common.logger(log_type, crawler).info('已抓取完一组视频,休眠1秒\n')
-            time.sleep(1)
-            index = index + len(video_element_temp)
-            # except Exception as e:
-            #     Common.logger(log_type, crawler).info(f"get_videoList:{e}\n")
-            #     cls.i = 0
+                        video_element.click()
+                        time.sleep(3)
+                        video_info_dict = cls.get_video_info(driver)
+                        video_dict["like_cnt"] = video_info_dict["like_cnt"]
+                        video_dict["share_cnt"] = video_info_dict["share_cnt"]
+                        video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
+                        video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
+                        video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
+                        video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
+                        Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
+                        if cls.download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict) is False:
+                            Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                        else:
+                            cls.download_publish(log_type=log_type,
+                                                 crawler=crawler,
+                                                 word=word,
+                                                 video_dict=video_dict,
+                                                 our_uid=our_uid,
+                                                 env=env)
+
+                Common.logger(log_type, crawler).info('已抓取完一组视频,休眠1秒\n')
+                time.sleep(1)
+                index = index + len(video_element_temp)
+            except Exception as e:
+                Common.logger(log_type, crawler).info(f"get_videoList:{e}\n")
+                cls.i = 0
 
     @classmethod
     def download_publish(cls, log_type, crawler, word, video_dict, our_uid, env):

+ 98 - 98
shipinhao/shipinhao_search/shipinhao_search_scheduling.py

@@ -262,101 +262,101 @@ class ShipinhaoSearchScheduling:
         videos_cnt = rule_dict.get('videos_cnt', {}).get('min', 0)
         index = 0
         while True:
-            # try:
-            if cls.search_elements(driver, '//*[@class="double-rich double-rich_vertical"]') is None:
-                Common.logger(log_type, crawler).info('窗口已销毁\n')
-                return
-
-            Common.logger(log_type, crawler).info('获取视频列表\n')
-            video_elements = cls.search_elements(driver, '//div[@class="vc active__mask"]')
-            if video_elements is None:
-                Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
-                return
-
-            video_element_temp = video_elements[index:]
-            if len(video_element_temp) == 0:
-                Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
-                return
+            try:
+                if cls.search_elements(driver, '//*[@class="double-rich double-rich_vertical"]') is None:
+                    Common.logger(log_type, crawler).info('窗口已销毁\n')
+                    return
 
-            for i, video_element in enumerate(video_element_temp):
-                Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
-                if cls.download_cnt >= int(videos_cnt):
-                    Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
-                    cls.download_cnt = 0
+                Common.logger(log_type, crawler).info('获取视频列表\n')
+                video_elements = cls.search_elements(driver, '//div[@class="vc active__mask"]')
+                if video_elements is None:
+                    Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
                     return
 
-                if video_element is None:
-                    Common.logger(log_type, crawler).info('到底啦~\n')
+                video_element_temp = video_elements[index:]
+                if len(video_element_temp) == 0:
+                    Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
                     return
 
-                cls.i += 1
-                cls.search_elements(driver, '//div[@class="vc active__mask"]')
+                for i, video_element in enumerate(video_element_temp):
+                    Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
+                    if cls.download_cnt >= int(videos_cnt):
+                        Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
+                        cls.download_cnt = 0
+                        return
 
-                Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
-                time.sleep(3)
-                driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
-                                      video_element)
-                if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
-                    Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
-                    return
-                video_title = video_element.find_elements(By.XPATH, '//div[@class="title ellipsis_2"]/*[2]')[index + i].text
-                video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
-                cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
-                cover_url = cover_url.split('url("')[-1].split('")')[0]
-                duration = video_element.find_elements(By.XPATH, '//div[@class="play-mask__text"]/*[2]')[index+i].text
-                duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
-                user_name = video_element.find_elements(By.XPATH, '//p[@class="vc-source__text"]')[index+i].text
-                avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image vc-source__thumb"]')[index+i].get_attribute('style')
-                avatar_url = avatar_url.split('url("')[-1].split('")')[0]
-                out_video_id = md5(video_title.encode('utf8')).hexdigest()
-                out_user_id = md5(user_name.encode('utf8')).hexdigest()
-
-                video_dict = {
-                    "video_title": video_title,
-                    "video_id": out_video_id,
-                    "play_cnt": 0,
-                    "duration": duration,
-                    "user_name": user_name,
-                    "user_id": out_user_id,
-                    "avatar_url": avatar_url,
-                    "cover_url": cover_url,
-                    "video_url": video_url,
-                    "session": f"shipinhao-search-{int(time.time())}"
-                }
-                for k, v in video_dict.items():
-                    Common.logger(log_type, crawler).info(f"{k}:{v}")
-                if video_title is None or video_url is None:
-                    Common.logger(log_type, crawler).info("无效视频\n")
-                elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
-                    Common.logger(log_type, crawler).info('视频已下载\n')
-                elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
-                    Common.logger(log_type, crawler).info('视频已下载\n')
-                else:
-                    video_element.click()
+                    if video_element is None:
+                        Common.logger(log_type, crawler).info('到底啦~\n')
+                        return
+
+                    cls.i += 1
+                    cls.search_elements(driver, '//div[@class="vc active__mask"]')
+
+                    Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
                     time.sleep(3)
-                    video_info_dict = cls.get_video_info(driver)
-                    video_dict["like_cnt"] = video_info_dict["like_cnt"]
-                    video_dict["share_cnt"] = video_info_dict["share_cnt"]
-                    video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
-                    video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
-                    video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
-                    video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
-
-                    cls.download_publish(log_type=log_type,
-                                         crawler=crawler,
-                                         word=word,
-                                         rule_dict=rule_dict,
-                                         video_dict=video_dict,
-                                         our_uid=our_uid,
-                                         oss_endpoint=oss_endpoint,
-                                         env=env)
-
-            Common.logger(log_type, crawler).info('已抓取完一组视频,休眠1秒\n')
-            time.sleep(1)
-            index = index + len(video_element_temp)
-            # except Exception as e:
-            #     Common.logger(log_type, crawler).info(f"get_videoList:{e}\n")
-            #     cls.i = 0
+                    driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
+                                          video_element)
+                    if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
+                        Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
+                        return
+                    video_title = video_element.find_elements(By.XPATH, '//div[@class="title ellipsis_2"]/*[2]')[index + i].text
+                    video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
+                    cover_url = video_element.find_elements(By.XPATH, '//div[@class="video-player__bd"]')[index+i].get_attribute('style')
+                    cover_url = cover_url.split('url("')[-1].split('")')[0]
+                    duration = video_element.find_elements(By.XPATH, '//div[@class="play-mask__text"]/*[2]')[index+i].text
+                    duration = int(duration.split(':')[0]) * 60 + int(duration.split(':')[-1])
+                    user_name = video_element.find_elements(By.XPATH, '//p[@class="vc-source__text"]')[index+i].text
+                    avatar_url = video_element.find_elements(By.XPATH, '//div[@class="ui-image-image ui-image vc-source__thumb"]')[index+i].get_attribute('style')
+                    avatar_url = avatar_url.split('url("')[-1].split('")')[0]
+                    out_video_id = md5(video_title.encode('utf8')).hexdigest()
+                    out_user_id = md5(user_name.encode('utf8')).hexdigest()
+
+                    video_dict = {
+                        "video_title": video_title,
+                        "video_id": out_video_id,
+                        "play_cnt": 0,
+                        "duration": duration,
+                        "user_name": user_name,
+                        "user_id": out_user_id,
+                        "avatar_url": avatar_url,
+                        "cover_url": cover_url,
+                        "video_url": video_url,
+                        "session": f"shipinhao-search-{int(time.time())}"
+                    }
+                    for k, v in video_dict.items():
+                        Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    if video_title is None or video_url is None:
+                        Common.logger(log_type, crawler).info("无效视频\n")
+                    elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
+                        Common.logger(log_type, crawler).info('视频已下载\n')
+                    elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
+                        Common.logger(log_type, crawler).info('视频已下载\n')
+                    else:
+                        video_element.click()
+                        time.sleep(3)
+                        video_info_dict = cls.get_video_info(driver)
+                        video_dict["like_cnt"] = video_info_dict["like_cnt"]
+                        video_dict["share_cnt"] = video_info_dict["share_cnt"]
+                        video_dict["favorite_cnt"] = video_info_dict["favorite_cnt"]
+                        video_dict["comment_cnt"] = video_info_dict["comment_cnt"]
+                        video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
+                        video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
+
+                        cls.download_publish(log_type=log_type,
+                                             crawler=crawler,
+                                             word=word,
+                                             rule_dict=rule_dict,
+                                             video_dict=video_dict,
+                                             our_uid=our_uid,
+                                             oss_endpoint=oss_endpoint,
+                                             env=env)
+
+                Common.logger(log_type, crawler).info('已抓取完一组视频,休眠1秒\n')
+                time.sleep(1)
+                index = index + len(video_element_temp)
+            except Exception as e:
+                Common.logger(log_type, crawler).info(f"get_videoList:{e}\n")
+                cls.i = 0
 
     @classmethod
     def download_publish(cls, log_type, crawler, word, rule_dict, video_dict, our_uid, oss_endpoint, env):
@@ -633,16 +633,16 @@ class ShipinhaoSearchScheduling:
             search_word = user["search_word"]
             our_uid = user["our_uid"]
             Common.logger(log_type, crawler).info(f"开始抓取搜索词:{search_word}")
-            # try:
-            cls.start_wechat(log_type=log_type,
-                             crawler=crawler,
-                             word=search_word,
-                             rule_dict=rule_dict,
-                             our_uid=our_uid,
-                             oss_endpoint=oss_endpoint,
-                             env=env)
-            # except Exception as e:
-            #     Common.logger(log_type, crawler).error(f"search_video:{e}\n")
+            try:
+                cls.start_wechat(log_type=log_type,
+                                 crawler=crawler,
+                                 word=search_word,
+                                 rule_dict=rule_dict,
+                                 our_uid=our_uid,
+                                 oss_endpoint=oss_endpoint,
+                                 env=env)
+            except Exception as e:
+                Common.logger(log_type, crawler).error(f"search_video:{e}\n")
 
 
 if __name__ == '__main__':