common.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/6/27
"""
Common helpers, including: log creation / log cleanup / session retrieval / download helper / file reading / download counting
"""
import datetime
import json
import os
import time
from datetime import date, timedelta

import requests
import urllib3
from loguru import logger

# Disable HTTP(S) proxies for outgoing requests
proxies = {"http": None, "https": None}


class Common:
    # Current time, e.g. <class 'datetime.datetime'> 2022-04-14 20:13:51.244472
    now = datetime.datetime.now()
    # Yesterday, e.g. <class 'str'> 2022-04-13
    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
    # Today, e.g. <class 'datetime.date'> 2022-04-14
    today = date.today()
    # Tomorrow, e.g. <class 'str'> 2022-04-15
    tomorrow = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")
    # Note: these class attributes are evaluated once, at import time

    @staticmethod
    def logger(log_type):
        """
        Build a log writer with the loguru module.
        """
        # Log directory, created on first use if missing
        log_dir = "./crawler_monitor/logs/"
        log_path = os.path.join(os.getcwd(), log_dir)
        if not os.path.isdir(log_path):
            os.makedirs(log_path)
        # Log file name: one file per day, suffixed by crawler type
        date_str = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        if log_type == "kanyikan":
            log_name = date_str + "-monitor-kanyikan.log"
        elif log_type == "xiaoniangao":
            log_name = date_str + "-monitor-xiaoniangao.log"
        else:
            log_name = date_str + ".log"
        # Drop the default handler so logs are not printed to the console
        logger.remove(handler_id=None)
        # loguru rotation options:
        #   rotation="500 MB"   -> new file every 500 MB
        #   rotation="12:00"    -> new file every day at 12:00
        #   rotation="1 week"   -> new file every week
        #   retention="10 days" -> delete logs older than 10 days
        # Initialize the log file, rotating at midnight
        logger.add(log_dir + log_name, level="INFO", rotation="00:00")
        return logger
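    # Usage note: Common.logger("kanyikan").info("msg") writes to
    # ./crawler_monitor/logs/<date>-monitor-kanyikan.log and starts a new
    # file at midnight ("<date>" stands for the current YYYY-MM-DD).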

    @classmethod
    def del_logs(cls, log_type):
        """
        Remove redundant log files, keeping only the 6 most recent ones.
        """
        log_dir = "./crawler_monitor/logs/"
        all_files = sorted(os.listdir(log_dir))
        all_logs = [f for f in all_files if os.path.splitext(f)[-1] == ".log"]
        # File names start with the date, so the sorted list ends with the
        # newest logs; delete everything except the last 6 (an empty slice
        # when there are 6 or fewer)
        for file in all_logs[:-6]:
            os.remove(log_dir + file)
        cls.logger(log_type).info("Redundant logs removed")

    # Delete Charles capture files, keeping only the 2 most recent ones
    @classmethod
    def del_charles_files(cls):
        # All files in the target folder, oldest first
        all_file = sorted(os.listdir("./crawler-kanyikan-recommend/chlsfiles/"))
        for file in all_file[0:-2]:
            os.remove("./crawler-kanyikan-recommend/chlsfiles/" + file)
        cls.logger("kanyikan").info("Charles cache files deleted")

    @classmethod
    def download_method(cls, log_type, text, d_name, d_url):
        """
        Download a cover image (text == "cover") or a video (text == "video").
        d_name: title of the video being downloaded
        d_url:  URL of the cover image or of the video stream
        Files are saved under "./videos/{d_name}/"
        """
        # First create a folder holding everything related to this video
        video_dir = "./videos/" + d_name + "/"
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        # Download the video
        if text == "video":
            video_url = d_url
            video_name = "video.mp4"
            # verify=False is needed behind the capture proxy; silence the
            # resulting InsecureRequestWarning
            urllib3.disable_warnings()
            response = requests.get(video_url, stream=True, proxies=proxies, verify=False)
            try:
                with open(video_dir + video_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=10240):
                        f.write(chunk)
                cls.logger(log_type).info("==========Video download finished==========")
            except Exception as e:
                cls.logger(log_type).exception("Video download failed: {}", e)
        # Download the cover image
        elif text == "cover":
            cover_url = d_url
            cover_name = "image.jpg"
            urllib3.disable_warnings()
            response = requests.get(cover_url, proxies=proxies, verify=False)
            try:
                with open(video_dir + cover_name, "wb") as f:
                    f.write(response.content)
                cls.logger(log_type).info("==========Cover download finished==========")
            except Exception as e:
                cls.logger(log_type).exception("Cover download failed: {}", e)
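    # Resulting layout, e.g. download_method("kanyikan", "video", "some-title", url)
    # saves ./videos/some-title/video.mp4, and text == "cover" saves
    # ./videos/some-title/image.jpg ("some-title" is an illustrative name).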

    @classmethod
    def get_session(cls):
        # Directory where Charles saves its capture files
        charles_file_dir = "./crawler-kanyikan-recommend/chlsfiles/"
        if len(os.listdir(charles_file_dir)) == 1:
            cls.logger("kanyikan").info("chlsfile not found, waiting 60s")
            time.sleep(60)
        else:
            try:
                # All files in the target folder, oldest first
                all_file = sorted(os.listdir(charles_file_dir))
                # Pick the target capture file
                old_file = all_file[-3]
                # Split the file name from its extension, then rename to .txt
                new_file = os.path.splitext(old_file)
                os.rename(os.path.join(charles_file_dir, old_file),
                          os.path.join(charles_file_dir, new_file[0] + ".txt"))
                with open(charles_file_dir + new_file[0] + ".txt", encoding="utf-8-sig", errors="ignore") as f:
                    contents = json.load(f, strict=False)
                if "search.weixin.qq.com" in [text["host"] for text in contents]:
                    for text in contents:
                        if text["host"] == "search.weixin.qq.com" \
                                and text["path"] == "/cgi-bin/recwxa/recwxagetunreadmessagecnt":
                            # Isolate the session value: cut it off at the first
                            # known query parameter that follows it
                            sessions = text["query"].split("session=")[-1].split("&wxaVersion=")[0]
                            for param in ("&vid", "&offset", "&wxaVersion", "&limit",
                                          "&scene", "&count", "&channelid", "&subscene",
                                          "&clientVersion", "&sharesearchid", "&nettype",
                                          "&switchprofile", "&switchnewuser"):
                                if param in sessions:
                                    return sessions.split(param)[0]
                            return sessions
                else:
                    cls.logger("kanyikan").info("session not found, retrying in 10s")
                    time.sleep(10)
                    return cls.get_session()
            except Exception as e:
                cls.logger("kanyikan").exception("Failed to get session, retrying in 30s: {}", e)
                time.sleep(30)
                return cls.get_session()
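    # Illustrative example (not from a real capture): a query of
    # "session=abc123&vid=456&wxaVersion=3.9" yields sessions == "abc123&vid=456",
    # and the "&vid" cut then returns "abc123".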


if __name__ == "__main__":
    common = Common()
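    # Minimal usage sketch (not in the original file): "kanyikan" is a log
    # type this module already handles; the URL is a placeholder, not a real
    # endpoint from the source.
    common.logger("kanyikan").info("logger initialized")
    # common.del_logs("kanyikan")  # would prune all but the 6 newest logs
    # common.download_method("kanyikan", "video", "demo-title",
    #                        "https://example.com/video.mp4")  # placeholder URL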