# common.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/1/31
"""
Shared helpers, including: log creation / log cleanup / download helpers /
chlsfiles cleanup / word filtering / saving video info to a local txt /
translation / ffmpeg
"""
from datetime import date, timedelta
from loguru import logger
import datetime
import os
import time
import requests
import json
import ffmpeg
from urllib import parse, request
import urllib3

# Bypass any system-wide HTTP(S) proxy for requests
proxies = {"http": None, "https": None}

class Common:
    # Current time, e.g. <class 'datetime.datetime'> 2022-04-14 20:13:51.244472
    now = datetime.datetime.now()
    # Yesterday, e.g. <class 'str'> 2022/04/13
    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y/%m/%d")
    # Today, e.g. <class 'datetime.date'> 2022-04-14
    today = date.today()
    # Tomorrow, e.g. <class 'str'> 2022/04/15
    tomorrow = (date.today() + timedelta(days=1)).strftime("%Y/%m/%d")

    # Build a logger with the loguru module
    @staticmethod
    def logger(log_type, crawler):
        """
        Build a logger with the loguru module
        """
        # Log directory
        log_dir = f"./{crawler}/logs/"
        log_path = os.getcwd() + os.sep + log_dir
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        # Log file name
        log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + f'-{crawler}-{log_type}.log'
        # Do not echo logs to the console
        logger.remove(handler_id=None)
        # rotation="500 MB": start a new file every 500 MB
        # rotation="12:00": start a new file every day at 12:00
        # rotation="1 week": start a new file every week
        # retention="10 days": purge logs older than 10 days
        # Initialize the logger; rotate at midnight
        logger.add(log_dir + log_name, level="INFO", rotation='00:00')
        return logger
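
    # Usage sketch (the crawler name "demo" is hypothetical): the returned
    # logger writes to ./demo/logs/<YYYY-MM-DD>-demo-search.log, e.g.
    #     Common.logger("search", "demo").info("crawl started")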

    # Clean up logs, keeping the 10 most recent files
    @classmethod
    def del_logs(cls, log_type, crawler):
        """
        Remove redundant log files
        :return: keeps the 10 most recent logs
        """
        log_dir = f"./{crawler}/logs/"
        all_files = sorted(os.listdir(log_dir))
        all_logs = []
        for log in all_files:
            name = os.path.splitext(log)[-1]
            if name == ".log":
                all_logs.append(log)
        # Delete everything except the 10 most recent log files
        if len(all_logs) > 10:
            for file in all_logs[:len(all_logs) - 10]:
                os.remove(log_dir + file)
        cls.logger(log_type, crawler).info("Logs cleaned up")

    # Delete cached Charles files; the slice below keeps the three most
    # recent files (the original comment said two, which did not match the code)
    @classmethod
    def del_charles_files(cls, log_type, crawler):
        # All files in the target directory, sorted oldest first
        all_file = sorted(os.listdir(f"./{crawler}/chlsfiles/"))
        for file in all_file[0:-3]:
            os.remove(f"./{crawler}/chlsfiles/{file}")
        cls.logger(log_type, crawler).info("Charles cache files deleted")
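
    # Usage sketch (crawler name "demo" is hypothetical): both cleanup helpers
    # take the same arguments, e.g.
    #     Common.del_logs("search", "demo")
    #     Common.del_charles_files("search", "demo")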

    # Save video info to "./{crawler}/videos/{video_dict['video_title']}/info.txt"
    @classmethod
    def save_video_info(cls, log_type, crawler, video_dict):
        with open(f"./{crawler}/videos/{video_dict['video_title']}/info.txt",
                  "a", encoding="UTF-8") as f_a:
            f_a.write(str(video_dict['video_id']) + "\n" +
                      str(video_dict['video_title']) + "\n" +
                      str(video_dict['duration']) + "\n" +
                      str(video_dict['play_cnt']) + "\n" +
                      str(video_dict['comment_cnt']) + "\n" +
                      str(video_dict['like_cnt']) + "\n" +
                      str(video_dict['share_cnt']) + "\n" +
                      f"{video_dict['video_width']}*{video_dict['video_height']}" + "\n" +
                      str(video_dict['publish_time']) + "\n" +
                      str(video_dict['user_name']) + "\n" +
                      str(video_dict['avatar_url']) + "\n" +
                      str(video_dict['video_url']) + "\n" +
                      str(video_dict['cover_url']) + "\n" +
                      str(video_dict['session']))
        cls.logger(log_type, crawler).info("========== Video info saved to info.txt ==========")
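
    # Sketch of the dict save_video_info reads; the keys match the writes
    # above, the values are illustrative placeholders only:
    #     video_dict = {"video_id": "1", "video_title": "demo", "duration": 60,
    #                   "play_cnt": 0, "comment_cnt": 0, "like_cnt": 0,
    #                   "share_cnt": 0, "video_width": 720, "video_height": 1280,
    #                   "publish_time": "2023-01-31", "user_name": "tester",
    #                   "avatar_url": "...", "video_url": "...",
    #                   "cover_url": "...", "session": "..."}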

    # Download a video, audio track, or cover image
    @classmethod
    def download_method(cls, log_type, crawler, text, title, url):
        """
        Download the cover: text == "cover"; download the video: text == "video"
        title: title of the item to download
        url: video playback address or cover address
        Files are saved under "./{crawler}/videos/{title}/"
        """
        videos_dir = f"./{crawler}/videos/"
        if not os.path.exists(videos_dir):
            os.mkdir(videos_dir)
        # First create a folder holding everything related to this video
        video_dir = f"./{crawler}/videos/{title}/"
        if not os.path.exists(video_dir):
            os.mkdir(video_dir)
        # Download the video
        if text == "video":
            # Address of the video to download
            video_url = str(url).replace('http://', 'https://')
            # Video file name
            video_name = "video.mp4"
            # Download the video; the request sits inside the try so that
            # network errors are logged as well
            urllib3.disable_warnings()
            try:
                response = requests.get(video_url, stream=True, proxies=proxies, verify=False)
                with open(video_dir + video_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=10240):
                        f.write(chunk)
                cls.logger(log_type, crawler).info("========== Video download finished ==========")
            except Exception as e:
                cls.logger(log_type, crawler).error(f"Video download failed: {e}\n")
        # Download the audio
        elif text == "audio":
            # Address of the audio to download
            audio_url = str(url).replace('http://', 'https://')
            # Audio file name
            audio_name = "audio.mp4"
            # Download the audio
            urllib3.disable_warnings()
            try:
                response = requests.get(audio_url, stream=True, proxies=proxies, verify=False)
                with open(video_dir + audio_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=10240):
                        f.write(chunk)
                cls.logger(log_type, crawler).info("========== Audio download finished ==========")
            except Exception as e:
                cls.logger(log_type, crawler).error(f"Audio download failed: {e}\n")
        # Download the cover
        elif text == "cover":
            # Address of the cover to download
            cover_url = str(url)
            # Cover file name
            cover_name = "image.jpg"
            # Download the cover
            urllib3.disable_warnings()
            try:
                response = requests.get(cover_url, proxies=proxies, verify=False)
                with open(video_dir + cover_name, "wb") as f:
                    f.write(response.content)
                cls.logger(log_type, crawler).info("========== Cover download finished ==========")
            except Exception as e:
                cls.logger(log_type, crawler).error(f"Cover download failed: {e}\n")
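
    # Usage sketch (crawler name and URL are hypothetical): saves the file to
    # ./demo/videos/some_title/video.mp4
    #     Common.download_method("search", "demo", "video", "some_title",
    #                            "https://example.com/v.mp4")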

    # Youdao translation: English → Chinese
    @classmethod
    def fanyi(cls, query):
        req_url = 'http://fanyi.youdao.com/translate'  # API endpoint
        # Form data to submit. The source-language field is 'from' and the
        # referrer field 'keyfrom' (the original had the typos 'form'/'keyform')
        form_data = {'i': query,
                     'doctype': 'json',
                     'from': 'AUTO',
                     'to': 'AUTO',
                     # 'to': 'Chinese',
                     'smartresult': 'dict',
                     'client': 'fanyideskweb',
                     'salt': '1526995097962',
                     'sign': '8e4c4765b52229e1f3ad2e633af89c76',
                     'version': '2.1',
                     'keyfrom': 'fanyi.web',
                     'action': 'FY_BY_REALTIME',
                     'typoResult': 'false'}
        data = parse.urlencode(form_data).encode('utf-8')  # Encode the form data
        response = request.urlopen(req_url, data)  # Submit the request
        html = response.read().decode('utf-8')  # Read the response body
        # print(html)
        # The body is a JSON document
        translate_results = json.loads(html)  # Parse the JSON
        translate_results = translate_results['translateResult'][0][0]['tgt']  # Pull out the translation
        # print(translate_results)
        return translate_results  # Return the translated text

    # Probe a video file with ffmpeg and return its basic properties
    @classmethod
    def ffmpeg(cls, log_type, crawler, video_path):
        probe = ffmpeg.probe(video_path)
        video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
        if video_stream is None:
            cls.logger(log_type, crawler).info('No video stream found!')
            return
        format1 = probe['format']
        size = int(format1['size']) / 1024 / 1024  # File size in MB
        width = int(video_stream['width'])
        height = int(video_stream['height'])
        duration = int(float(video_stream['duration']))
        ffmpeg_dict = {
            'width': width,
            'height': height,
            'duration': duration,
            'size': size
        }
        return ffmpeg_dict
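
    # Usage sketch (the path is hypothetical): returns {'width', 'height',
    # 'duration' (seconds), 'size' (MB)}, e.g.
    #     info = Common.ffmpeg("search", "demo", "./demo/videos/some_title/video.mp4")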


if __name__ == "__main__":
    res = Common.fanyi("10 MOST UNIQUE Dance Groups EVER On Britain's Got Talent!")
    print(res)