@@ -164,6 +164,7 @@ class ShipinhaoSearch:
     @classmethod
     def start_wechat(cls, log_type, crawler, word, our_uid, env):
         Common.logger(log_type, crawler).info('启动微信')
+        Common.logging(log_type, crawler, env, '启动微信')
         if env == "dev":
             chromedriverExecutable = "/Users/wangkun/Downloads/chromedriver/chromedriver_v107/chromedriver"
         else:
@@ -207,12 +208,14 @@ class ShipinhaoSearch:
                          env=env)
         cls.close_wechat(log_type=log_type,
                          crawler=crawler,
+                         env=env,
                          driver=driver)
 
     @classmethod
-    def close_wechat(cls, log_type, crawler, driver: WebDriver):
+    def close_wechat(cls, log_type, crawler, env, driver: WebDriver):
         driver.quit()
         Common.logger(log_type, crawler).info(f"微信退出成功\n")
+        Common.logging(log_type, crawler, env, f"微信退出成功\n")
 
     @classmethod
     def is_contain_chinese(cls, strword):
@@ -274,6 +277,7 @@ class ShipinhaoSearch:
             md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
             shutil.rmtree(f"./{crawler}/videos/{md_title}/")
             Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+            Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
             return
         video_dict["video_width"] = ffmpeg_dict["width"]
         video_dict["video_height"] = ffmpeg_dict["height"]
@@ -283,6 +287,7 @@ class ShipinhaoSearch:
             md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
             shutil.rmtree(f"./{crawler}/videos/{md_title}/")
             Common.logger(log_type, crawler).info("宽高不满足抓取规则,删除成功\n")
+            Common.logging(log_type, crawler, env, "宽高不满足抓取规则,删除成功\n")
             return
 
         # 下载封面
@@ -292,6 +297,7 @@ class ShipinhaoSearch:
 
         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         our_video_id = Publish.upload_and_publish(log_type=log_type,
                                                   crawler=crawler,
                                                   strategy="搜索爬虫策略",
@@ -303,6 +309,7 @@ class ShipinhaoSearch:
         else:
             our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
             Common.logger(log_type, crawler).info("视频上传完成")
+            Common.logging(log_type, crawler, env, "视频上传完成")
 
         if our_video_id is None:
             try:
@@ -310,6 +317,7 @@ class ShipinhaoSearch:
                 md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).warning(f"our_video_id:{our_video_id}, 删除成功\n")
+                Common.logging(log_type, crawler, env, f"our_video_id:{our_video_id}, 删除成功\n")
                 return
             except FileNotFoundError:
                 return
@@ -344,8 +352,10 @@ class ShipinhaoSearch:
                                                 {int(video_dict['video_width'])},
                                                 {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env)
         Common.logger(log_type, crawler).info('视频信息插入数据库成功!')
+        Common.logging(log_type, crawler, env, '视频信息插入数据库成功!')
 
         # 写飞书
         Feishu.insert_columns(log_type, crawler, "xYWCzf", "ROWS", 1, 2)
@@ -368,6 +378,7 @@ class ShipinhaoSearch:
                   video_dict["video_url"]]]
         Feishu.update_values(log_type, crawler, "xYWCzf", "F2:Z2", values)
         Common.logger(log_type, crawler).info("写入飞书成功\n")
+        Common.logging(log_type, crawler, env, "写入飞书成功\n")
         cls.download_cnt += 1
 
     @classmethod
@@ -483,6 +494,7 @@ class ShipinhaoSearch:
             user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
             if user_sheet is None:
                 Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet}, 3秒钟后重试")
+                Common.logging(log_type, crawler, env, f"user_sheet:{user_sheet}, 3秒钟后重试")
                 time.sleep(3)
                 continue
             our_user_list = []
@@ -496,6 +508,7 @@ class ShipinhaoSearch:
                 tag4 = user_sheet[i][11]
                 tag5 = user_sheet[i][12]
                 Common.logger(log_type, crawler).info(f"正在更新 {search_word} 搜索词信息")
+                Common.logging(log_type, crawler, env, f"正在更新 {search_word} 搜索词信息")
                 if our_uid is None:
                     default_user = getUser.get_default_user()
                     # 用来创建our_id的信息
@@ -514,6 +527,7 @@ class ShipinhaoSearch:
                     Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}',
                                          [[our_uid, our_user_link]])
                     Common.logger(log_type, crawler).info(f'站内用户主页创建成功:{our_user_link}\n')
+                    Common.logging(log_type, crawler, env, f'站内用户主页创建成功:{our_user_link}\n')
                 our_user_dict = {
                     'out_uid': '',
                     'search_word': search_word,
@@ -530,11 +544,13 @@ class ShipinhaoSearch:
         # 点击微信搜索框,并输入搜索词
         driver.implicitly_wait(10)
         Common.logger(log_type, crawler).info("点击搜索框")
+        Common.logging(log_type, crawler, env, "点击搜索框")
         driver.find_element(By.ID, 'com.tencent.mm:id/j5t').click()
         time.sleep(0.5)
         driver.find_element(By.ID, 'com.tencent.mm:id/cd7').clear().send_keys(word)
         driver.press_keycode(AndroidKey.ENTER)
         Common.logger(log_type, crawler).info("进入搜索词页面")
+        Common.logging(log_type, crawler, env, "进入搜索词页面")
         driver.find_elements(By.ID, 'com.tencent.mm:id/br8')[0].click()
         time.sleep(5)
 
@@ -542,12 +558,14 @@ class ShipinhaoSearch:
         check_to_webview = cls.check_to_webview(log_type, crawler, driver)
         if check_to_webview is None:
             Common.logger(log_type, crawler).info("切换到视频号 webview 失败\n")
+            Common.logging(log_type, crawler, env, "切换到视频号 webview 失败\n")
             return
         time.sleep(1)
 
         # 切换到"视频号"分类
         shipinhao_tags = cls.search_elements(driver, '//div[@class="unit"]/*[2]')
         Common.logger(log_type, crawler).info('点击"视频号"分类')
+        Common.logging(log_type, crawler, env, '点击"视频号"分类')
         shipinhao_tags[0].click()
         time.sleep(5)
 
@@ -555,40 +573,49 @@ class ShipinhaoSearch:
         while True:
             if cls.search_elements(driver, '//*[@class="mixed-box__bd"]') is None:
                 Common.logger(log_type, crawler).info('窗口已销毁\n')
+                Common.logging(log_type, crawler, env, '窗口已销毁\n')
                 return
 
             Common.logger(log_type, crawler).info('获取视频列表\n')
+            Common.logging(log_type, crawler, env, '获取视频列表\n')
             video_elements = cls.search_elements(driver, '//div[@class="rich-media active__absolute"]')
             if video_elements is None:
                 Common.logger(log_type, crawler).warning(f'video_elements:{video_elements}')
+                Common.logging(log_type, crawler, env, f'video_elements:{video_elements}')
                 return
 
             video_element_temp = video_elements[index:]
             if len(video_element_temp) == 0:
                 Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
+                Common.logging(log_type, crawler, env, '到底啦~~~~~~~~~~~~~\n')
                 return
 
             for i, video_element in enumerate(video_element_temp):
                 try:
                     Common.logger(log_type, crawler).info(f"download_cnt:{cls.download_cnt}")
+                    Common.logging(log_type, crawler, env, f"download_cnt:{cls.download_cnt}")
                     if cls.download_cnt >= cls.videos_cnt(log_type, crawler):
                         Common.logger(log_type, crawler).info(f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
+                        Common.logging(log_type, crawler, env, f'搜索词:"{word}",已抓取视频数:{cls.download_cnt}')
                         cls.download_cnt = 0
                         return
 
                     if video_element is None:
                         Common.logger(log_type, crawler).info('到底啦~\n')
+                        Common.logging(log_type, crawler, env, '到底啦~\n')
                         return
 
                     cls.i += 1
                     cls.search_elements(driver, '//*[@class="rich-media active__absolute"]')
 
                     Common.logger(log_type, crawler).info(f'拖动"视频"列表第{cls.i}个至屏幕中间')
+                    Common.logging(log_type, crawler, env, f'拖动"视频"列表第{cls.i}个至屏幕中间')
                     time.sleep(3)
                     driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})",
                                           video_element)
                     if len(video_element.find_elements(By.XPATH, "//*[@text='没有更多的搜索结果']")) != 0:
                         Common.logger(log_type, crawler).info("没有更多的搜索结果\n")
+                        Common.logging(log_type, crawler, env, "没有更多的搜索结果\n")
                         return
                     video_title = video_element.find_elements(By.XPATH, '//div[@class="rich-media__title ellipsis_2"]/span')[index + i].text[:40]
                     video_url = video_element.find_elements(By.XPATH, '//div[@class="video-player"]')[index+i].get_attribute('src')
@@ -616,12 +643,16 @@ class ShipinhaoSearch:
                     }
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    Common.logging(log_type, crawler, env, f"{video_dict}")
                     if video_title is None or video_url is None:
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                     elif cls.repeat_out_video_id(log_type, crawler, out_video_id, env) != 0:
                         Common.logger(log_type, crawler).info('视频已下载\n')
+                        Common.logging(log_type, crawler, env, '视频已下载\n')
                     elif cls.repeat_video_url(log_type, crawler, video_url, env) != 0:
                         Common.logger(log_type, crawler).info('视频已下载\n')
+                        Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
                         video_element.click()
                         time.sleep(3)
@@ -633,8 +664,10 @@ class ShipinhaoSearch:
                         video_dict["publish_time_str"] = video_info_dict["publish_time_str"]
                         video_dict["publish_time_stamp"] = video_info_dict["publish_time_stamp"]
                         Common.logger(log_type, crawler).info(f'publish_time:{video_dict["publish_time_str"]}')
+                        Common.logging(log_type, crawler, env, f'publish_time:{video_dict["publish_time_str"]}')
                         if cls.download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict) is False:
                             Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                            Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                         else:
                             cls.download_publish(log_type=log_type,
                                                  crawler=crawler,
@@ -644,8 +677,10 @@ class ShipinhaoSearch:
                                                  env=env)
                 except Exception as e:
                     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
+                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
 
             Common.logger(log_type, crawler).info('已抓取完一组视频,休眠1秒\n')
+            Common.logging(log_type, crawler, env, '已抓取完一组视频,休眠1秒\n')
             time.sleep(1)
             index = index + len(video_element_temp)
 
@@ -660,6 +695,7 @@ class ShipinhaoSearch:
                 search_word = user["search_word"]
                 our_uid = user["our_uid"]
                 Common.logger(log_type, crawler).info(f"开始抓取:{search_word}")
+                Common.logging(log_type, crawler, env, f"开始抓取:{search_word}")
 
                 cls.start_wechat(log_type=log_type,
                                  crawler=crawler,
@@ -668,6 +704,7 @@ class ShipinhaoSearch:
                                  env=env)
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取{user['search_word']}时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取{user['search_word']}时异常:{e}\n")
 
 
 if __name__ == '__main__':
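
The pattern above is mechanical: each existing Common.logger(log_type, crawler).info/.warning/.error(...) call is mirrored by a Common.logging(log_type, crawler, env, message) call, and close_wechat gains an env parameter so the value can be threaded through from start_wechat. The helper itself is not shown in this diff; what follows is a minimal sketch of what a dual-channel Common could look like, assuming Common.logging only forwards each message to an env-tagged second sink (the handler wiring and the print() stand-in below are assumptions, not the repository's actual implementation):

import logging
import sys


class Common:
    @classmethod
    def logger(cls, log_type, crawler):
        # Pre-existing channel: a local console/file logger.
        log = logging.getLogger(f"{crawler}-{log_type}")
        if not log.handlers:
            log.addHandler(logging.StreamHandler(sys.stdout))
            log.setLevel(logging.INFO)
        return log

    @classmethod
    def logging(cls, log_type, crawler, env, message):
        # Channel added by this patch: the same message, tagged with
        # env so dev/prod records can be separated downstream.
        # Assumption: a real implementation would ship this record to
        # a remote log store; print() stands in for that here.
        print(f"[{env}] {crawler}/{log_type}: {str(message).strip()}")

With a shim like this, the paired calls keep the local logs unchanged while every message also reaches the env-aware channel, which is why env now has to reach close_wechat and the other helpers.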