xiaoniangao_follow.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/3/13
  4. import json
  5. import os
  6. import random
  7. import shutil
  8. import sys
  9. import time
  10. import requests
  11. import urllib3
  12. sys.path.append(os.getcwd())
  13. from common.common import Common
  14. from common.scheduling_db import MysqlHelper
  15. from common.publish import Publish
  16. from common.feishu import Feishu
  17. proxies = {"http": None, "https": None}
  18. class XiaoniangaoFollow:
  19. platform = "小年糕"
  20. # 小程序个人主页视频列表翻页参数
  21. next_t = None
  22. # 配置微信
  23. # wechat_sheet = Feishu.get_values_batch("follow", "xiaoniangao", "dzcWHw")
  24. # follow_x_mid = wechat_sheet[2][3]
  25. # follow_x_token_id = wechat_sheet[3][3]
  26. # follow_referer = wechat_sheet[4][3]
  27. # follow_uid = wechat_sheet[5][3]
  28. # follow_token = wechat_sheet[6][3]
  29. # 过滤敏感词
  30. @classmethod
  31. def filter_words(cls, log_type):
  32. # 敏感词库列表
  33. word_list = []
  34. # 从云文档读取所有敏感词,添加到词库列表
  35. lists = Feishu.get_values_batch(log_type, "xiaoniangao", "DRAnZh")
  36. for i in lists:
  37. for j in i:
  38. # 过滤空的单元格内容
  39. if j is None:
  40. pass
  41. else:
  42. word_list.append(j)
  43. return word_list
  44. # 基础门槛规则
  45. @staticmethod
  46. def download_rule(video_dict):
  47. """
  48. 下载视频的基本规则
  49. :param video_dict: 视频信息,字典格式
  50. :return: 满足规则,返回 True;反之,返回 False
  51. """
  52. # 视频时长
  53. if int(float(video_dict['duration'])) >= 40:
  54. # 宽或高
  55. if int(video_dict['video_width']) >= 0 or int(video_dict['video_height']) >= 0:
  56. # 播放量
  57. if int(video_dict['play_cnt']) >= 500:
  58. # 分享量
  59. if int(video_dict['share_cnt']) >= 0:
  60. return True
  61. else:
  62. return False
  63. else:
  64. return False
  65. else:
  66. return False
  67. else:
  68. return False
  69. # 从云文档获取关注用户列表
  70. @classmethod
  71. def get_users(cls, log_type, crawler):
  72. try:
  73. while True:
  74. follow_sheet = Feishu.get_values_batch(log_type, "xiaoniangao", "oNpThi")
  75. if follow_sheet is None:
  76. time.sleep(1)
  77. continue
  78. if len(follow_sheet) == 1:
  79. Common.logger(log_type, crawler).info("暂无定向爬取账号")
  80. else:
  81. user_list = []
  82. for i in range(1, len(follow_sheet)):
  83. profile_id = follow_sheet[i][0]
  84. profile_mid = follow_sheet[i][1]
  85. user_name = follow_sheet[i][2]
  86. user_dict = {
  87. "profile_id": profile_id,
  88. "profile_mid": profile_mid,
  89. "user_name": user_name,
  90. }
  91. user_list.append(user_dict)
  92. return user_list
  93. except Exception as e:
  94. Common.logger(log_type, crawler).error("从云文档获取关注用户列表异常:{}", e)
  95. # 获取个人主页视频
  96. @classmethod
  97. def get_videoList(cls, log_type, crawler, strategy, p_mid, oss_endpoint, env, machine):
  98. try:
  99. while True:
  100. url = "https://api.xiaoniangao.cn/profile/list_album"
  101. headers = {
  102. # "X-Mid": str(cls.follow_x_mid),
  103. "X-Mid": '1fb47aa7a860d9',
  104. # "X-Token-Id": str(cls.follow_x_token_id),
  105. "X-Token-Id": '9f2cb91f9952c107ecb73642083e1dec-1145266232',
  106. "content-type": "application/json",
  107. # "uuid": str(cls.follow_uid),
  108. "uuid": 'f40c2e7c-3cfb-4804-b513-608c0280268c',
  109. "Accept-Encoding": "gzip,compress,br,deflate",
  110. "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)"
  111. " AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
  112. "MicroMessenger/8.0.20(0x18001435) NetType/WIFI Language/zh_CN",
  113. # "Referer": str(cls.follow_referer)
  114. "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/654/page-frame.html'
  115. }
  116. json_text = {
  117. "visited_mid": p_mid,
  118. "start_t": cls.next_t,
  119. "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!690x385r/crop/690x385/interlace/1/format/jpg",
  120. "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!120x120r/crop/120x120/interlace/1/format/jpg",
  121. "limit": 20,
  122. # "token": str(cls.follow_token),
  123. "token": '54e4c603f7bf3dc009c86b49ed91be36',
  124. # "uid": str(cls.follow_uid),
  125. "uid": 'f40c2e7c-3cfb-4804-b513-608c0280268c',
  126. "proj": "ma",
  127. "wx_ver": "8.0.23",
  128. "code_ver": "3.68.0",
  129. "log_common_params": {
  130. "e": [{
  131. "data": {
  132. "page": "profilePage",
  133. "topic": "public"
  134. }
  135. }],
  136. "ext": {
  137. "brand": "iPhone",
  138. "device": "iPhone 11",
  139. "os": "iOS 14.7.1",
  140. "weixinver": "8.0.23",
  141. "srcver": "2.24.7",
  142. "net": "wifi",
  143. "scene": "1089"
  144. },
  145. "pj": "1",
  146. "pf": "2",
  147. "session_id": "7468cf52-00ea-432e-8505-6ea3ad7ec164"
  148. }
  149. }
  150. urllib3.disable_warnings()
  151. r = requests.post(url=url, headers=headers, json=json_text, proxies=proxies, verify=False)
  152. if 'data' not in r.text or r.status_code != 200:
  153. Common.logger(log_type, crawler).info(f"get_videoList:{r.text}\n")
  154. cls.next_t = None
  155. return
  156. elif 'list' not in r.json()['data']:
  157. Common.logger(log_type, crawler).info(f"get_videoList:{r.json()}\n")
  158. cls.next_t = None
  159. return
  160. elif len(r.json()['data']['list']) == 0:
  161. Common.logger(log_type, crawler).info(f"没有更多数据啦~\n")
  162. cls.next_t = None
  163. return
  164. else:
  165. cls.next_t = r.json()["data"]["next_t"]
  166. feeds = r.json()["data"]["list"]
  167. for i in range(len(feeds)):
  168. # 标题,表情随机加在片头、片尾,或替代句子中间的标点符号
  169. char_sheet = Feishu.get_values_batch("hour", "xiaoniangao", "BhlbST")
  170. expression_list = []
  171. char_list = []
  172. for q in range(len(char_sheet)):
  173. if char_sheet[q][0] is not None:
  174. expression_list.append(char_sheet[q][0])
  175. if char_sheet[q][1] is not None:
  176. char_list.append(char_sheet[q][1])
  177. befor_video_title = feeds[i]["title"].strip().replace("\n", "") \
  178. .replace("/", "").replace("\r", "").replace("#", "") \
  179. .replace(".", "。").replace("\\", "").replace("&NBSP", "") \
  180. .replace(":", "").replace("*", "").replace("?", "") \
  181. .replace("?", "").replace('"', "").replace("<", "") \
  182. .replace(">", "").replace("|", "").replace(" ", "")
  183. expression = random.choice(expression_list)
  184. expression_title_list = [expression + befor_video_title, befor_video_title + expression]
  185. # 标题,表情随机加在片头
  186. title_list1 = random.choice(expression_title_list)
  187. # 标题,表情随机加在片尾
  188. title_list2 = befor_video_title + random.choice(char_list)
  189. # # 替代句子中间的标点符号
  190. # title_list3 = befor_video_title.replace(
  191. # ",", random.choice(expression_list)).replace(",", random.choice(expression_list))
  192. title_list4 = [title_list1, title_list2]
  193. video_title = random.choice(title_list4)
  194. # 用户名
  195. user_name = feeds[i]["album_user"]["nick"].strip().replace("\n", "") \
  196. .replace("/", "").replace("快手", "").replace(" ", "") \
  197. .replace(" ", "").replace("&NBSP", "").replace("\r", "")
  198. # 视频 ID
  199. if "vid" in feeds[i]:
  200. video_id = feeds[i]["vid"]
  201. else:
  202. video_id = 0
  203. # 播放量
  204. if "play_pv" in feeds[i]:
  205. video_play_cnt = feeds[i]["play_pv"]
  206. else:
  207. video_play_cnt = 0
  208. # 点赞
  209. if "total" in feeds[i]["favor"]:
  210. video_like_cnt = feeds[i]["favor"]["total"]
  211. else:
  212. video_like_cnt = 0
  213. # 评论数
  214. if "comment_count" in feeds[i]:
  215. video_comment_cnt = feeds[i]["comment_count"]
  216. else:
  217. video_comment_cnt = 0
  218. # 分享
  219. if "share" in feeds[i]:
  220. video_share_cnt = feeds[i]["share"]
  221. else:
  222. video_share_cnt = 0
  223. # 时长
  224. if "du" in feeds[i]:
  225. video_duration = int(feeds[i]["du"] / 1000)
  226. else:
  227. video_duration = 0
  228. # 发布时间
  229. if "t" in feeds[i]:
  230. publish_time_stamp = int(feeds[i]["t"] / 1000)
  231. else:
  232. publish_time_stamp = 0
  233. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  234. # 宽和高
  235. if "w" in feeds[i] or "h" in feeds[i]:
  236. video_width = feeds[i]["w"]
  237. video_height = feeds[i]["h"]
  238. else:
  239. video_width = 0
  240. video_height = 0
  241. # 头像
  242. if "hurl" in feeds[i]["album_user"]:
  243. head_url = feeds[i]["album_user"]["hurl"]
  244. else:
  245. head_url = 0
  246. # 用户 ID
  247. if "id" in feeds[i]:
  248. profile_id = feeds[i]["id"]
  249. else:
  250. profile_id = 0
  251. # 用户 mid
  252. if "mid" in feeds[i]:
  253. profile_mid = feeds[i]["mid"]
  254. else:
  255. profile_mid = 0
  256. # 封面
  257. if "url" in feeds[i]:
  258. cover_url = feeds[i]["url"]
  259. else:
  260. cover_url = 0
  261. # 视频播放地址
  262. if "v_url" in feeds[i]:
  263. video_url = feeds[i]["v_url"]
  264. else:
  265. video_url = 0
  266. # 过滤无效视频
  267. if video_id == 0 \
  268. or video_title == 0 \
  269. or publish_time_stamp == 0 \
  270. or video_duration == 0 \
  271. or video_url == 0:
  272. Common.logger(log_type, crawler).info("无效视频\n")
  273. elif int(time.time()) - publish_time_stamp > 3600*24*3:
  274. Common.logger(log_type, crawler).info(f"发布时间超过3天:{publish_time_str}\n")
  275. cls.next_t = None
  276. return
  277. else:
  278. video_dict = {
  279. "video_id": video_id,
  280. "video_title": video_title,
  281. "duration": video_duration,
  282. "play_cnt": video_play_cnt,
  283. "like_cnt": video_like_cnt,
  284. "comment_cnt": video_comment_cnt,
  285. "share_cnt": video_share_cnt,
  286. "user_name": user_name,
  287. "publish_time_stamp": publish_time_stamp,
  288. "publish_time_str": publish_time_str,
  289. "video_width": video_width,
  290. "video_height": video_height,
  291. "avatar_url": head_url,
  292. "profile_id": profile_id,
  293. "profile_mid": profile_mid,
  294. "cover_url": cover_url,
  295. "video_url": video_url,
  296. "session": f"xiaoniangao-follow-{int(time.time())}"
  297. }
  298. for k, v in video_dict.items():
  299. Common.logger(log_type, crawler).info(f"{k}:{v}")
  300. cls.download_publish(log_type=log_type,
  301. crawler=crawler,
  302. strategy=strategy,
  303. video_dict=video_dict,
  304. oss_endpoint=oss_endpoint,
  305. env=env,
  306. machine=machine)
  307. except Exception as error:
  308. Common.logger(log_type, crawler).error(f"获取个人主页视频异常:{error}\n")
  309. @classmethod
  310. def repeat_video(cls, log_type, crawler, video_id, env, machine):
  311. sql = f""" select * from crawler_video where platform="小年糕" and out_video_id="{video_id}"; """
  312. repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
  313. return len(repeat_video)
  314. # 下载/上传
  315. @classmethod
  316. def download_publish(cls, log_type, crawler, strategy, video_dict, oss_endpoint, env, machine):
  317. try:
  318. if cls.download_rule(video_dict) is False:
  319. Common.logger(log_type, crawler).info("不满足基础门槛\n")
  320. elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
  321. Common.logger(log_type, crawler).info('视频已下载\n')
  322. elif any(str(word) if str(word) in video_dict['video_title'] else False for word in cls.filter_words(log_type)) is True:
  323. Common.logger(log_type, crawler).info("视频已中过滤词\n")
  324. else:
  325. # 下载封面
  326. Common.download_method(log_type=log_type, crawler=crawler, text="cover", title=video_dict["video_title"], url=video_dict["cover_url"])
  327. # 下载视频
  328. Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_dict["video_title"], url=video_dict["video_url"])
  329. # 保存视频信息至 "./videos/{download_video_title}/info.txt"
  330. Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_dict)
  331. # 上传视频
  332. Common.logger(log_type, crawler).info("开始上传视频...")
  333. our_video_id = Publish.upload_and_publish(log_type=log_type,
  334. crawler=crawler,
  335. strategy=strategy,
  336. our_uid="follow",
  337. env=env,
  338. oss_endpoint=oss_endpoint)
  339. if env == "dev":
  340. our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  341. else:
  342. our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
  343. Common.logger(log_type, crawler).info("视频上传完成")
  344. if our_video_id is None:
  345. # 删除视频文件夹
  346. shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}")
  347. return
  348. # 视频信息保存数据库
  349. rule_dict = {
  350. "duration": {"min": 40, "max": 100000000},
  351. "play_cnt": {"min": 500}
  352. }
  353. insert_sql = f""" insert into crawler_video(video_id,
  354. out_user_id,
  355. platform,
  356. strategy,
  357. out_video_id,
  358. video_title,
  359. cover_url,
  360. video_url,
  361. duration,
  362. publish_time,
  363. play_cnt,
  364. crawler_rule,
  365. width,
  366. height)
  367. values({our_video_id},
  368. "{video_dict['profile_id']}",
  369. "{cls.platform}",
  370. "定向爬虫策略",
  371. "{video_dict['video_id']}",
  372. "{video_dict['video_title']}",
  373. "{video_dict['cover_url']}",
  374. "{video_dict['video_url']}",
  375. {int(video_dict['duration'])},
  376. "{video_dict['publish_time_str']}",
  377. {int(video_dict['play_cnt'])},
  378. '{json.dumps(rule_dict)}',
  379. {int(video_dict['video_width'])},
  380. {int(video_dict['video_height'])}) """
  381. Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
  382. MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
  383. Common.logger(log_type, crawler).info('视频信息插入数据库成功!')
  384. # 视频写入飞书
  385. Feishu.insert_columns(log_type, crawler, "Wu0CeL", "ROWS", 1, 2)
  386. # 视频ID工作表,首行写入数据
  387. upload_time = int(time.time())
  388. values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)),
  389. "用户主页",
  390. str(video_dict['video_id']),
  391. str(video_dict['video_title']),
  392. our_video_link,
  393. video_dict['play_cnt'],
  394. video_dict['comment_cnt'],
  395. video_dict['like_cnt'],
  396. video_dict['share_cnt'],
  397. video_dict['duration'],
  398. f"{video_dict['video_width']}*{video_dict['video_height']}",
  399. str(video_dict['publish_time_str']),
  400. str(video_dict['user_name']),
  401. str(video_dict['profile_id']),
  402. str(video_dict['profile_mid']),
  403. str(video_dict['avatar_url']),
  404. str(video_dict['cover_url']),
  405. str(video_dict['video_url'])]]
  406. time.sleep(1)
  407. Feishu.update_values(log_type, crawler, "Wu0CeL", "F2:Z2", values)
  408. Common.logger(log_type, crawler).info('视频信息写入飞书成功\n')
  409. except Exception as e:
  410. Common.logger(log_type, crawler).error("下载/上传异常:{}", e)
  411. # 获取所有关注列表的用户视频
  412. @classmethod
  413. def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
  414. try:
  415. # 已关注的用户列表 mids
  416. user_list = cls.get_users(log_type, crawler)
  417. for user in user_list:
  418. user_name = user['user_name']
  419. profile_mid = user['profile_mid']
  420. Common.logger(log_type, crawler).info(f"获取 {user_name} 主页视频")
  421. cls.get_videoList(log_type=log_type,
  422. crawler=crawler,
  423. strategy=strategy,
  424. p_mid=profile_mid,
  425. oss_endpoint=oss_endpoint,
  426. env=env,
  427. machine=machine)
  428. cls.next_t = None
  429. time.sleep(1)
  430. except Exception as e:
  431. Common.logger(log_type, crawler).error(f"get_follow_videos:{e}\n")
  432. if __name__ == "__main__":
  433. # print(XiaoniangaoFollow.repeat_video("follow", "xiaoniangao", "4919087666", "prod", "aliyun"))
  434. print(XiaoniangaoFollow.repeat_video("follow", "xiaoniangao", "4919087666", "dev", "local"))
  435. pass