common.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/4/18
"""
Common helper methods: create logs / delete logs / get session / download helpers / read files / count downloads
"""
import json
from datetime import date, timedelta
import datetime
import logging
import os
import time
import requests
import urllib3

proxies = {"http": None, "https": None}


class Common:
    # Current time, <class 'datetime.datetime'>, e.g. 2022-04-14 20:13:51.244472
    now = datetime.datetime.now()
    # Yesterday, <class 'str'>, e.g. 2022-04-13
    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
    # Today, <class 'datetime.date'>, e.g. 2022-04-14
    today = date.today()
    # Tomorrow, <class 'str'>, e.g. 2022-04-15
    tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")

    @staticmethod
    def crawler_log():
        """
        Create and return the crawler logger.
        """
        # Log directory
        log_dir = r"./logs/"
        log_path = os.getcwd() + os.sep + log_dir
        if not os.path.isdir(log_path):
            os.makedirs(log_path)

        # Log parameters
        log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        date_format = "%Y-%m-%d %p %H:%M:%S"
        log_name = time.strftime("%Y-%m-%d", time.localtime(time.time())) + ".log"

        # Initialise logging (basicConfig only takes effect the first time it is called)
        logging.basicConfig(filename=log_path + log_name, level=logging.INFO, format=log_format, datefmt=date_format)
        crawler_logger = logging.getLogger("crawler-log")
        return crawler_logger
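
    # Minimal usage sketch (assumes the process can create ./logs/ in the working
    # directory; the log file is named after the current date):
    #     logger = Common.crawler_log()
    #     logger.info("hello from the crawler")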

    @classmethod
    def del_logs(cls):
        """
        Remove redundant log files.
        :return: keeps only the 7 most recent logs
        """
        log_dir = r"./logs/"
        all_files = sorted(os.listdir(log_dir))
        all_logs = []
        for log in all_files:
            name = os.path.splitext(log)[-1]
            if name == ".log":
                all_logs.append(log)

        # Delete everything except the newest 7 log files
        if len(all_logs) > 7:
            for file in all_logs[:len(all_logs) - 7]:
                os.remove(log_dir + file)
        cls.crawler_log().info("Redundant logs removed")

    # Delete cached Charles capture files, keeping only the two most recent
    @classmethod
    def del_charles_files(cls):
        # All files in the target folder
        all_file = sorted(os.listdir(r"./chlsfiles/"))
        for file in all_file[0:-2]:
            os.remove(r"./chlsfiles/" + file)

    @classmethod
    def download_method(cls, text, d_name, d_url):
        """
        Download a cover: text == "cover"; download a video: text == "video"
        :param d_name: title of the video being downloaded
        :param d_url: cover image URL, or video playback URL
        Files are saved under "./videos/{d_name}/"
        """
        # First create a folder for everything related to this video
        video_dir = "./videos/" + d_name + "/"
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)

        # Download the video
        if text == "video":
            # Video URL to download
            video_url = d_url
            # File name for the video
            video_name = "video.mp4"

            # Stream the video to disk
            urllib3.disable_warnings()
            response = requests.get(video_url, stream=True, proxies=proxies, verify=False)
            try:
                with open(video_dir + video_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=10240):
                        f.write(chunk)
                cls.crawler_log().info("==========Video download complete==========")
            except Exception as e:
                cls.crawler_log().info("Video download failed: {}".format(e))

        # Download the cover
        elif text == "cover":
            # Cover URL to download
            cover_url = d_url
            # File name for the cover
            cover_name = "image.jpg"

            # Fetch the cover image
            urllib3.disable_warnings()
            response = requests.get(cover_url, proxies=proxies, verify=False)
            try:
                with open(video_dir + cover_name, "wb") as f:
                    f.write(response.content)
                cls.crawler_log().info("==========Cover download complete==========")
            except Exception as e:
                cls.crawler_log().info("Cover download failed: {}".format(e))
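
    # Usage sketch (the title and URLs here are hypothetical placeholders):
    #     Common.download_method("video", "some_video_title", "https://example.com/video.mp4")
    #     Common.download_method("cover", "some_video_title", "https://example.com/cover.jpg")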

    @staticmethod
    def read_txt(t_name):
        """
        Read a txt file.
        :param t_name: file name
        :return: file contents as a list of lines
        """
        with open(r"./txt/" + t_name, "r", encoding="utf8") as f:
            return f.readlines()
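
    # Usage sketch ("kanyikan_videoid.txt" is the file used elsewhere in this
    # module; any file under ./txt/ works):
    #     lines = Common.read_txt("kanyikan_videoid.txt")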

    @classmethod
    def get_session(cls):
        """
        Extract the session token from the most recent Charles capture file.
        :return: the session string, or None if it could not be found
        """
        # Directory where Charles capture files are saved
        charles_file_dir = r"./chlsfiles/"

        if len(os.listdir(charles_file_dir)) == 1:
            Common.crawler_log().info("chlsfile not found, waiting 60s")
            time.sleep(60)
        else:
            # All files in the target folder
            all_file = sorted(os.listdir(charles_file_dir))
            # Most recent capture file
            old_file = all_file[-1]
            # Split the file name from its extension
            new_file = os.path.splitext(old_file)

            # Rename the capture file with a .txt extension
            os.rename(os.path.join(charles_file_dir, old_file),
                      os.path.join(charles_file_dir, new_file[0] + ".txt"))

            try:
                with open(charles_file_dir + new_file[0] + ".txt", encoding="utf-8-sig", errors="ignore") as f:
                    contents = json.load(f, strict=False)

                if "search.weixin.qq.com" in [text["host"] for text in contents]:
                    for text in contents:
                        if text["host"] == "search.weixin.qq.com" \
                                and text["path"] == "/cgi-bin/recwxa/recwxagetunreadmessagecnt":
                            sessions = text["query"].split("session=")[-1].split("&wxaVersion=")[0]
                            # Strip any trailing query parameter that may follow the session value
                            for param in ("&vid", "&offset", "&wxaVersion", "&limit", "&scene",
                                          "&count", "&channelid", "&subscene", "&clientVersion",
                                          "&sharesearchid", "&nettype", "&switchprofile",
                                          "&switchnewuser"):
                                if param in sessions:
                                    return sessions.split(param)[0]
                            return sessions
                else:
                    cls.crawler_log().info("Session not found, retrying in 10s")
                    time.sleep(10)
                    return cls.get_session()
            except Exception as e:
                cls.crawler_log().info("Exception while getting session, retrying in 30s: {}".format(e))
                time.sleep(30)
                return cls.get_session()
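
    # Usage sketch (assumes Charles is writing capture files into ./chlsfiles/):
    #     session = Common.get_session()
    #     if session:
    #         Common.crawler_log().info("got session: {}".format(session))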

    @classmethod
    def kanyikan_download_count(cls):
        """
        Log the total number of downloaded videos (one video id per line).
        """
        videoid_path = r"./txt/kanyikan_videoid.txt"
        with open(videoid_path, "rb") as f:
            count = len(f.readlines())
        cls.crawler_log().info("Total videos downloaded: {}\n".format(count))

    @classmethod
    def kanyikan_today_download_count(cls):
        """
        Count the videos downloaded today from the Kanyikan channel.
        :return: today's download count
        """
        # Create the file if it does not exist yet
        videoid_path = r"./txt/" + str(cls.today) + "_kanyikan_videoid.txt"
        with open(videoid_path, "a") as f:
            f.write("")
        with open(videoid_path, "rb") as f:
            return len(f.readlines())

    @classmethod
    def del_yesterday_kanyikan_videoid_txt(cls):
        """
        Delete yesterday's Kanyikan download-count txt file.
        :return:
        """
        yesterday_kanyikan_videoid_txt_dir = r"./txt/"
        all_files = sorted(os.listdir(yesterday_kanyikan_videoid_txt_dir))
        for file in all_files:
            name = os.path.splitext(file)[0]
            if name == cls.yesterday + "_kanyikan_videoid":
                os.remove(yesterday_kanyikan_videoid_txt_dir + file)
        Common.crawler_log().info("Deleted yesterday's Kanyikan download-count file")


if __name__ == "__main__":
    common = Common()
    common.del_yesterday_kanyikan_videoid_txt()
    print(common.kanyikan_today_download_count())
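
    # Further usage sketch (commented out; illustrative only):
    #     common.del_logs()            # keep only the 7 most recent log files
    #     common.del_charles_files()   # keep only the 2 most recent Charles captures
    #     common.kanyikan_download_count()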