# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/1/31
"""
Common helpers, including: log creation / log cleanup / download helpers / cleanup of weixinzhishu_chlsfiles / filter-word list / saving video info to a local txt / translation / ffmpeg
"""
from aliyun.log import LogClient, PutLogsRequest, LogItem
from datetime import date, timedelta
from datetime import datetime
from loguru import logger
from hashlib import md5
# import datetime
import os
import json
import time
import requests
import ffmpeg
import urllib3
import subprocess

proxies = {"http": None, "https": None}


class Common:
    # Unified current time: <class 'datetime.datetime'> 2022-04-14 20:13:51.244472
    now = datetime.now()
    # Yesterday: <class 'str'> 2022-04-13
    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
    # Today: <class 'datetime.date'> 2022-04-14
    today = date.today()
    # Tomorrow: <class 'str'> 2022-04-15
    tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")

    # Create log files with the loguru logger
    @staticmethod
    def logger(log_type, crawler):
        """
        Create log files with the loguru logger
        """
        # Log directory
        log_dir = f"./{crawler}/logs/"
        log_path = os.getcwd() + os.sep + log_dir
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        # Log file name
        # log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + f'-{crawler}-{log_type}.log'
        # log_name = datetime.datetime.now().strftime('%Y-%m-%d') + f'-{crawler}-{log_type}.log'
        # log_name = f"{date.today():%Y-%m-%d}-{crawler}-{log_type}.log"
        log_name = f"{crawler}-{log_type}-{datetime.now().date().strftime('%Y-%m-%d')}.log"
        # Do not print logs to the console
        logger.remove(handler_id=None)
        # rotation="500 MB": start a new log file every 500 MB
        # rotation="12:00": start a new log file every day at 12:00
        # rotation="1 week": start a new log file every week
        # retention="10 days": delete log files older than 10 days
        # Initialize the logger
        # logger.add(f"{log_dir}{log_name}", level="INFO", rotation="00:00", retention="10 days", enqueue=True)
        logger.add(os.path.join(log_dir, log_name), level="INFO", rotation="00:00", retention="10 days", enqueue=True)
        return logger
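
    # Usage sketch (the "demo" crawler name is an illustrative assumption, not a real spider module):
    # it creates ./demo/logs/demo-recommend-<date>.log and returns the configured loguru logger.
    #   log = Common.logger("recommend", "demo")
    #   log.info("crawler started")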

    # Write to Aliyun Log Service
    @staticmethod
    def logging(log_type, crawler, env, message):
        """
        Write to Aliyun Log Service
        Test project: https://sls.console.aliyun.com/lognext/project/crawler-log-dev/logsearch/crawler-log-dev
        Production project: https://sls.console.aliyun.com/lognext/project/crawler-log-prod/logsearch/crawler-log-prod
        :param log_type: crawler strategy
        :param crawler: which crawler
        :param env: environment
        :param message: log content
        :return: None
        """
        # Access credentials for Aliyun Log Service
        accessKeyId = 'LTAIWYUujJAm7CbH'
        accessKey = 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P'
        if env == "dev":
            project = 'crawler-log-dev'
            logstore = 'crawler-log-dev'
            endpoint = 'cn-hangzhou.log.aliyuncs.com'
        elif crawler == "xigua" and log_type == "recommend":
            project = 'crawler-log-prod'
            logstore = 'crawler-log-prod'
            endpoint = 'cn-hangzhou.log.aliyuncs.com'
        elif crawler in ("shipinhao", "kanyikan", "ganggangdouchuan", "zhiqingtiantiankan",
                         "jixiangxingfu", "zhufuquanzi", "xiaoniangaoplus", "zhongmiaoyinxin",
                         "huanhuanxixizhufudao"):
            project = 'crawler-log-prod'
            logstore = 'crawler-log-prod'
            endpoint = 'cn-hangzhou.log.aliyuncs.com'
        else:
            project = 'crawler-log-prod'
            logstore = 'crawler-log-prod'
            endpoint = 'cn-hangzhou-intranet.log.aliyuncs.com'
        # Create a LogClient instance
        client = LogClient(endpoint, accessKeyId, accessKey)
        if '\r' in message:
            message = message.replace('\r', ' ')
        if '\n' in message:
            message = message.replace('\n', ' ')
        log_group = []
        log_item = LogItem()
        """
        Log item body format, for example:
        crawler:xigua
        message:does not meet the crawl rules
        mode:search
        timestamp:1686656143
        """
        contents = [("crawler", str(crawler)), ("mode", str(log_type)), ("message", str(message)), ("timestamp", str(int(time.time())))]
        log_item.set_contents(contents)
        log_group.append(log_item)
        # Write the log
        request = PutLogsRequest(project=project,
                                 logstore=logstore,
                                 topic="",
                                 source="",
                                 logitems=log_group,
                                 compress=False)
        client.put_logs(request)
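
    # Usage sketch (values are illustrative; "xigua" / "recommend" / "dev" are taken from the comments above):
    #   Common.logging("recommend", "xigua", "dev", "does not meet the crawl rules")
    # sends one log item with the fields crawler / mode / message / timestamp to the matching SLS logstore.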

    # Clean up old log files, keeping the most recent 30
    @classmethod
    def del_logs(cls, log_type, crawler):
        """
        Delete redundant log files
        :return: keeps the most recent 30 logs
        """
        log_dir = f"./{crawler}/logs/"
        all_files = sorted(os.listdir(log_dir))
        all_logs = []
        for log in all_files:
            name = os.path.splitext(log)[-1]
            if name == ".log":
                all_logs.append(log)
        if len(all_logs) <= 30:
            pass
        else:
            for file in all_logs[:len(all_logs) - 30]:
                os.remove(log_dir + file)
        cls.logger(log_type, crawler).info("Logs cleaned up successfully\n")

    @classmethod
    def get_session(cls, log_type, crawler, env):
        while True:
            # Directory where Charles capture files are saved
            charles_file_dir = f"./{crawler}/chlsfiles/"
            if len(os.listdir(charles_file_dir)) == 1:
                Common.logger(log_type, crawler).info("chlsfile not found, waiting 60s")
                cls.logging(log_type, crawler, env, "chlsfile not found, waiting 60s")
                time.sleep(60)
                continue
            # All files in the target directory
            all_file = sorted(os.listdir(charles_file_dir))
            # The target file
            old_file = all_file[-2]
            # Split the file name and the extension
            new_file = os.path.splitext(old_file)
            # Rename the file extension
            os.rename(os.path.join(charles_file_dir, old_file),
                      os.path.join(charles_file_dir, new_file[0] + ".txt"))
            with open(charles_file_dir + new_file[0] + ".txt", encoding='utf-8-sig', errors='ignore') as f:
                contents = json.load(f, strict=False)
            if "search.weixin.qq.com" in [text['host'] for text in contents]:
                for text in contents:
                    if text["host"] == "search.weixin.qq.com" \
                            and text["path"] == "/cgi-bin/recwxa/recwxagetunreadmessagecnt":
                        sessions = text["query"].split("session=")[-1].split("&wxaVersion=")[0]
                        if "&vid" in sessions:
                            session = sessions.split("&vid")[0]
                            return session
                        elif "&offset" in sessions:
                            session = sessions.split("&offset")[0]
                            return session
                        elif "&wxaVersion" in sessions:
                            session = sessions.split("&wxaVersion")[0]
                            return session
                        elif "&limit" in sessions:
                            session = sessions.split("&limit")[0]
                            return session
                        elif "&scene" in sessions:
                            session = sessions.split("&scene")[0]
                            return session
                        elif "&count" in sessions:
                            session = sessions.split("&count")[0]
                            return session
                        elif "&channelid" in sessions:
                            session = sessions.split("&channelid")[0]
                            return session
                        elif "&subscene" in sessions:
                            session = sessions.split("&subscene")[0]
                            return session
                        elif "&clientVersion" in sessions:
                            session = sessions.split("&clientVersion")[0]
                            return session
                        elif "&sharesearchid" in sessions:
                            session = sessions.split("&sharesearchid")[0]
                            return session
                        elif "&nettype" in sessions:
                            session = sessions.split("&nettype")[0]
                            return session
                        elif "&switchprofile" in sessions:
                            session = sessions.split("&switchprofile")[0]
                            return session
                        elif "&switchnewuser" in sessions:
                            session = sessions.split("&switchnewuser")[0]
                            return session
                        else:
                            return sessions
            else:
                cls.logger(log_type, crawler).info("session not found, retrying in 10s")
                cls.logging(log_type, crawler, env, "session not found, retrying in 10s")
                time.sleep(10)

    # Delete Charles cache files, keeping only the three most recent
    @classmethod
    def del_charles_files(cls, log_type, crawler):
        # All files in the target directory
        all_file = sorted(os.listdir(f"./{crawler}/chlsfiles/"))
        for file in all_file[0:-3]:
            os.remove(f"./{crawler}/chlsfiles/{file}")
        cls.logger(log_type, crawler).info("Charles cache files deleted\n")

    # Save video info to "./{crawler}/videos/{md_title}/info.txt"
    @classmethod
    def save_video_info(cls, log_type, crawler, video_dict):
        md_title = md5(video_dict['video_title'].encode('utf8')).hexdigest()
        save_dict = {
            "video_title": "video_title",
            "video_id": "video_id",
            "duration": 0,
            "play_cnt": 0,
            "comment_cnt": 0,
            "like_cnt": 0,
            "share_cnt": 0,
            "video_width": 1920,
            "video_height": 1080,
            "publish_time_stamp": 946656000,  # 2000-01-01 00:00:00
            "user_name": "crawler",
            "avatar_url": "http://weapppiccdn.yishihui.com/resources/images/pic_normal.png",
            "video_url": "video_url",
            "cover_url": "cover_url",
            "session": f"session-{int(time.time())}",
        }
        # Overwrite the defaults with the values present in video_dict
        for video_key, video_value in video_dict.items():
            for save_key, save_value in save_dict.items():
                if save_key == video_key:
                    save_dict[save_key] = video_value
        # Write from save_dict so the defaults apply when a field is missing from video_dict
        with open(f"./{crawler}/videos/{md_title}/info.txt", "w", encoding="UTF-8") as f_w:
            f_w.write(str(save_dict['video_id']) + "\n" +
                      str(save_dict['video_title']) + "\n" +
                      str(save_dict['duration']) + "\n" +
                      str(save_dict['play_cnt']) + "\n" +
                      str(save_dict['comment_cnt']) + "\n" +
                      str(save_dict['like_cnt']) + "\n" +
                      str(save_dict['share_cnt']) + "\n" +
                      f"{save_dict['video_width']}*{save_dict['video_height']}" + "\n" +
                      str(save_dict['publish_time_stamp']) + "\n" +
                      str(save_dict['user_name']) + "\n" +
                      str(save_dict['avatar_url']) + "\n" +
                      str(save_dict['video_url']) + "\n" +
                      str(save_dict['cover_url']) + "\n" +
                      str(save_dict['session']))
        Common.logger(log_type, crawler).info("========== Video info saved to info.txt ==========")
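
    # Sketch of the expected video_dict (field names follow save_dict above; the values here are made up):
    #   video_dict = {"video_title": "demo", "video_id": "v123", "duration": 60, "play_cnt": 1000,
    #                 "comment_cnt": 10, "like_cnt": 100, "share_cnt": 5, "video_width": 1280,
    #                 "video_height": 720, "publish_time_stamp": 1686656143, "user_name": "tester",
    #                 "avatar_url": "...", "video_url": "...", "cover_url": "...", "session": "..."}
    #   Common.save_video_info("recommend", "demo", video_dict)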

    # Download a video or a cover image
    @classmethod
    def download_method(cls, log_type, crawler, text, title, url):
        """
        Download the cover when text == "cover"; download the video when text == "video"
        title: title of the video to download
        url: cover image URL or video playback URL
        Files are saved under "./{crawler}/videos/{md_title}/"
        """
        videos_dir = f"./{crawler}/videos/"
        if not os.path.exists(videos_dir):
            os.mkdir(videos_dir)
        # First create a folder that holds everything related to this video
        md_title = md5(title.encode('utf8')).hexdigest()
        video_path = f"./{crawler}/videos/{md_title}/"
        if not os.path.exists(video_path):
            os.mkdir(video_path)
        # Download the video
        if text == "video":
            # URL of the video to download
            video_url = str(url).replace('http://', 'https://')
            # Video file name
            video_name = "video.mp4"
            for i in range(3):
                try:
                    # Download the video, retrying up to three times
                    urllib3.disable_warnings()
                    # response = requests.get(video_url, stream=True, proxies=cls.tunnel_proxies(), verify=False)
                    response = requests.get(video_url, stream=True, proxies=proxies, verify=False)
                    with open(video_path + video_name, "wb") as f:
                        for chunk in response.iter_content(chunk_size=10240):
                            f.write(chunk)
                    cls.logger(log_type, crawler).info("========== Video download finished ==========")
                    return True
                except Exception as e:
                    cls.logger(log_type, crawler).error(f"Video download failed: {e}\n")
                    time.sleep(1)
            return False
        # Download the audio
        elif text == "audio":
            # URL of the audio to download
            audio_url = str(url).replace('http://', 'https://')
            # Audio file name
            audio_name = "audio.mp4"
            # Download the audio
            urllib3.disable_warnings()
            # response = requests.get(audio_url, stream=True, proxies=cls.tunnel_proxies(), verify=False)
            response = requests.get(audio_url, stream=True, proxies=proxies, verify=False)
            try:
                with open(video_path + audio_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=10240):
                        f.write(chunk)
                cls.logger(log_type, crawler).info("========== Audio download finished ==========")
            except Exception as e:
                cls.logger(log_type, crawler).error(f"Audio download failed: {e}\n")
        # Download the cover image
        elif text == "cover":
            # URL of the cover to download
            cover_url = str(url)
            # Cover file name
            cover_name = "image.jpg"
            # Download the cover
            urllib3.disable_warnings()
            # response = requests.get(cover_url, proxies=cls.tunnel_proxies(), verify=False)
            response = requests.get(cover_url, verify=False)
            try:
                with open(video_path + cover_name, "wb") as f:
                    f.write(response.content)
                cls.logger(log_type, crawler).info("========== Cover download finished ==========")
            except Exception as e:
                cls.logger(log_type, crawler).error(f"Cover download failed: {e}\n")
        # YouTube video download
        elif text == "youtube_video":
            # URL of the video to download
            video_url = url
            # Video file name
            video_name = "video.mp4"
            try:
                download_cmd = f'yt-dlp -f "bv[height<=720][ext=mp4]+ba[ext=m4a]" --merge-output-format mp4 "{video_url}" -o "{video_path}{video_name}"'
                Common.logger(log_type, crawler).info(f"download_cmd:{download_cmd}")
                os.system(download_cmd)
                # move_cmd = f"mv {video_name} {video_path}"
                # os.system(move_cmd)
                cls.logger(log_type, crawler).info("========== Video download finished ==========")
            except Exception as e:
                Common.logger(log_type, crawler).error(f"Video download failed: {e}\n")
        # Xigua video / audio download
        elif text == "xigua_video":
            # URL of the video to download
            video_url = str(url).replace('http://', 'https://')
            # Video file name
            video_name = "video1.mp4"
            # Download the video
            urllib3.disable_warnings()
            # response = requests.get(video_url, stream=True, proxies=cls.tunnel_proxies(), verify=False)
            response = requests.get(video_url, stream=True, proxies=proxies, verify=False)
            try:
                with open(video_path + video_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=10240):
                        f.write(chunk)
                cls.logger(log_type, crawler).info("========== Video download finished ==========")
            except Exception as e:
                cls.logger(log_type, crawler).error(f"Video download failed: {e}\n")
        elif text == "xigua_audio":
            # URL of the audio to download
            audio_url = str(url).replace('http://', 'https://')
            # Audio file name
            audio_name = "audio1.mp4"
            # Download the audio
            urllib3.disable_warnings()
            # response = requests.get(audio_url, stream=True, proxies=cls.tunnel_proxies(), verify=False)
            response = requests.get(audio_url, stream=True, proxies=proxies, verify=False)
            try:
                with open(video_path + audio_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=10240):
                        f.write(chunk)
                cls.logger(log_type, crawler).info("========== Audio download finished ==========")
            except Exception as e:
                cls.logger(log_type, crawler).error(f"Audio download failed: {e}\n")
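
    # Usage sketch (the crawler name "demo" and the URL are placeholders, not real values):
    #   ok = Common.download_method("recommend", "demo", "video", "some title", "https://example.com/v.mp4")
    # For text == "video" the method returns True/False; the other branches return None and only log the result.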

    @classmethod
    def ffmpeg(cls, log_type, crawler, video_path):
        Common.logger(log_type, crawler).info(f"video_path:{video_path}")
        video_title = video_path.replace(f"./{crawler}/videos/", "").replace("/video.mp4", "")
        Common.logger(log_type, crawler).info(f"video_title:{video_title}")
        md_title = md5(video_title.encode('utf8')).hexdigest()
        Common.logger(log_type, crawler).info(f"crawler:{crawler}")
        # if crawler == "zhiqingtiantiankan" \
        #         or crawler == "ganggangdouchuan" \
        #         or crawler == "jixiangxingfu" \
        #         or crawler == "zhongmiaoyinxin":
        #     # video_path = os.path.join("C:\\", "crawler", "piaoquan_crawler", f"{crawler}", "videos", f"{md_title}", "video.mp4")
        #     video_path = os.path.join(".\\", f"{crawler}", "videos", f"{md_title}", "video.mp4")
        # else:
        video_path = f"./{crawler}/videos/{md_title}/video.mp4"
        Common.logger(log_type, crawler).info(f"video_path:{video_path}")
        if os.path.getsize(video_path) == 0:
            Common.logger(log_type, crawler).info(f'video_size:{os.path.getsize(video_path)}')
            return
        probe = ffmpeg.probe(video_path)
        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
        if video_stream is None:
            Common.logger(log_type, crawler).info('No video Stream found!')
            return
        format1 = probe['format']
        size = int(int(format1['size']) / 1024 / 1024)
        width = int(video_stream['width'])
        height = int(video_stream['height'])
        duration = int(float(video_stream['duration']))
        ffmpeg_dict = {
            'width': width,
            'height': height,
            'duration': duration,
            'size': size
        }
        return ffmpeg_dict
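
    # Usage sketch (the path below is illustrative; ffmpeg() re-derives it from the md5 of the title):
    #   info = Common.ffmpeg("recommend", "demo", "./demo/videos/some title/video.mp4")
    #   # info -> {'width': 1280, 'height': 720, 'duration': 60, 'size': 12} (size in MB), or None on failure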

    # Merge video and audio tracks
    @classmethod
    def video_compose(cls, log_type, crawler, video_dir):
        video_title = video_dir.replace(f"./{crawler}/videos/", "")
        md_title = md5(video_title.encode('utf8')).hexdigest()
        video_dir = f"./{crawler}/videos/{md_title}"
        try:
            video_path = f'{video_dir}/video1.mp4'
            audio_path = f'{video_dir}/audio1.mp4'
            out_path = f'{video_dir}/video.mp4'
            cmd = f'ffmpeg -i {video_path} -i {audio_path} -c:v copy -c:a aac -strict experimental -map 0:v:0 -map 1:a:0 {out_path}'
            # print(cmd)
            subprocess.call(cmd, shell=True)
            for file in os.listdir(video_dir):
                if file.split('.mp4')[0] == 'video1' or file.split('.mp4')[0] == 'audio1':
                    os.remove(f'{video_dir}/{file}')
            Common.logger(log_type, crawler).info('Video/audio merge finished\n')
        except Exception as e:
            Common.logger(log_type, crawler).error(f'video_compose exception: {e}\n')

    # KDL tunnel proxy
    @classmethod
    def tunnel_proxies(cls):
        # Tunnel host:port
        tunnel = "q796.kdltps.com:15818"
        # Username / password authentication
        username = "t17772369458618"
        password = "5zqcjkmy"
        tunnel_proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
            "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
        }
        # Whitelist authentication (the whitelist has to be configured beforehand)
        # proxies = {
        #     "http": "http://%(proxy)s/" % {"proxy": tunnel},
        #     "https": "http://%(proxy)s/" % {"proxy": tunnel}
        # }
        # Target page to request
        # target_url = "https://www.kuaishou.com/profile/3xk9tkk6kkwkf7g"
        # target_url = "https://dev.kdlapi.com/testproxy"
        # # Send the request through the tunnel
        # response = requests.get(target_url, proxies=proxies)
        # print(response.text)
        return tunnel_proxies  # {'http': 'http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/', 'https': 'http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/'}


if __name__ == "__main__":
    # print(datetime.time(hour=0, minute=0))
    # print(f'{date.today():%Y-%m-%d}')
    print(datetime.now().date().strftime('%Y-%m-%d'))
    pass
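
    # Additional usage sketch (commented out because "demo" is a made-up crawler name; the directories
    # ./demo/logs/ and ./demo/chlsfiles/ would have to exist before running these calls):
    #   Common.logger("recommend", "demo").info("hello")
    #   Common.del_logs("recommend", "demo")
    #   print(Common.tunnel_proxies())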