# -*- coding: utf-8 -*-
# @Time: 2023/10/26
import json
import os
import random
import sys
import time
from datetime import datetime

import requests
import urllib3

sys.path.append(os.getcwd())
from common.mq import MQ
from common.common import Common
from common.scheduling_db import MysqlHelper
from common import AliyunLogger
from common.public import get_config_from_mysql, download_rule
from common.feishu import Feishu

proxies = {"http": None, "https": None}
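
# KanyikanRecommend crawls the WeChat 看一看 recommendation feed via the
# recwxavideolist API, filters each returned video (share/play ratio, download
# rules, filter words, duplicate check), mirrors every decision to a Feishu
# sheet, and pushes accepted videos to the ETL message queue.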
class KanyikanRecommend:
    platform = "看一看-plus"
    strategy = "随机数据抓取"

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and create_time>='2023-10-09' and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)
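    # Record the raw 看一看 video id into crawler_kyk_video_id with status 0.
    # What consumes the status flag is not shown in this file (assumption: a
    # downstream job updates it once the video has been processed).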
    @classmethod
    def insert_video_id(cls, log_type, crawler, video_id, env):
        insert_sql = f"""insert into crawler_kyk_video_id( kyk_video_id , status) values ("{video_id}",0)"""
        MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
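    # Main entry point: request up to 20 pages of recommendations with
    # randomized parameters, then validate, log and enqueue every returned video.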
    @classmethod
    def get_videoList(cls, log_type, crawler, our_uid, rule_dict, env):
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        try:
            session = Common.get_session(log_type, crawler, env)
            if session is None:
                time.sleep(1)
                cls.get_videoList(log_type, crawler, our_uid, rule_dict, env)
                return  # the recursive call already retried; don't continue with an empty session
            sharesearchid = 0
            for i in range(20):
                url = 'https://search.weixin.qq.com/cgi-bin/recwxa/recwxavideolist?'
                vid = random.choice(
                    ["wxv_3183841422983217154", "wxv_2930758110737334272", "wxv_2988109621326512134",
                     "wxv_2676332817823432706", "wxv_3176172124915433476", "wxv_2844480939899650049",
                     "wxv_2801905452978274308", "wxv_2946787506342117382", "wxv_2935943471797125120",
                     "wxv_2756464139115659264", "wxv_3174430452460453896", "wxv_3126758748858908674",
                     "wxv_3182262442043621385", "wxv_3058491263710314497", "wxv_2952726055449051140",
                     "wxv_3076106053748015108", "wxv_2074265064492040192", "wxv_2999570992006021122"])
                channelid = random.choice(["200201", "200", "208", "208201"])
                switchnewuser = random.choice(["0", "1"])
                isFromUgc = random.choice(["false", "true"])
                switchprofile = random.choice(["0", "1"])
                subscene = random.choice(["1089", "1074", "208", "1007", "1008"])
                params = random.choice([{
                    'session': session,
                    "offset": 0,
                    "wxaVersion": "3.17.12",
                    "count": "10",
                    "channelid": channelid,
                    "scene": '310',
                    "subscene": subscene,
                    "clientVersion": '3.8.6',
                    "sharesearchid": sharesearchid,
                    "nettype": 'wifi',
                    "switchprofile": switchprofile,
                    "switchnewuser": switchnewuser,
                }, {
                    "session": session,
                    "wxaVersion": "3.17.8",
                    "channelid": channelid,
                    "vid": vid,
                    "offset": 0,
                    "count": "15",
                    "scene": '310',
                    "subscene": subscene,
                    "model": "华为",
                    "nettype": '4g',
                    "clientVersion": '3.8.6',
                    "sharesearchid": sharesearchid,
                    "presearchid": "17530764723864413041",
                    "sharesource": "0",
                    "isFromUgc": isFromUgc,
                    "ad": 0,
                    "switchprofile": switchprofile,
                    "switchnewuser": switchnewuser,
                }])
                header = {
                    'Host': 'search.weixin.qq.com',
                    'Content-Type': 'application/json',
                    'X-WX-ClientVersion': '0x33050520',
                    'X-WECHAT-UIN': 'b2hfbTQ1WGNjSzQxemdfanpMSml1TEtfbEtsVQ==',
                    'Accept': '*/*',
                    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac',
                    'Referer': 'https://servicewechat.com/wxbb9a805eb4f9533c/268/page-frame.html',
                    'Accept-Language': 'zh-cn'
                }
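                # The request is sent with verify=False, so urllib3's InsecureRequestWarning is suppressed first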
                urllib3.disable_warnings()
                response = requests.get(url=url, headers=header, params=params, proxies=proxies, verify=False)
                # print(response)
                if "data" not in response.text:
                    Common.logger(log_type, crawler).info("获取视频list时,session过期,随机睡眠 31-40 秒")
                    Common.logging(log_type, crawler, env, "获取视频list时,session过期,随机睡眠 31-40 秒")
                    AliyunLogger.logging(
                        code="2000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"获取视频list时,session过期,随机睡眠 31-40 秒"
                    )
                    # Empty response: sleep 31-40 seconds at random, then retry from scratch
                    time.sleep(random.randint(31, 40))
                    cls.get_videoList(log_type, crawler, our_uid, rule_dict, env)
                    return  # don't fall through and process the stale response
                elif "items" not in response.json()["data"]:
                    Common.logger(log_type, crawler).info(f"get_feeds:{response.json()},随机睡眠 1-3 分钟")
                    Common.logging(log_type, crawler, env, f"get_feeds:{response.json()},随机睡眠 1-3 分钟")
                    AliyunLogger.logging(
                        code="2000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"get_feeds:{response.json()},随机睡眠 1-3 分钟"
                    )
                    # No items in the response: sleep 1-3 minutes at random, then retry from scratch
                    time.sleep(random.randint(60, 180))
                    cls.get_videoList(log_type, crawler, our_uid, rule_dict, env)
                    return  # don't fall through and process the stale response
                feeds = response.json().get("data", {}).get("items", "")
                sharesearchid = response.json().get("searchid", {})
                if feeds == "":
                    Common.logger(log_type, crawler).info(f"feeds:{feeds}")
                    Common.logging(log_type, crawler, env, f"feeds:{feeds}")
                    return
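                # Walk the returned items; each one is wrapped in its own try/except
                # so a single bad entry does not abort the whole batch.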
                for i in range(len(feeds)):
                    try:
                        AliyunLogger.logging(
                            code="1001",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message='扫描到一条视频\n'
                        )
                        video_title = feeds[i].get("title", "").strip().replace("\n", "") \
                            .replace("/", "").replace("\\", "").replace("\r", "") \
                            .replace(":", "").replace("*", "").replace("?", "") \
                            .replace("?", "").replace('"', "").replace("<", "") \
                            .replace(">", "").replace("|", "").replace(" ", "") \
                            .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
                            .replace("'", "").replace("#", "").replace("Merge", "")
                        publish_time_stamp = feeds[i].get("date", 0)
                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
                        # Resolve the playback URL from whichever CDN structure is present
                        if "videoInfo" not in feeds[i]:
                            video_url = ""
                        elif "mpInfo" in feeds[i]["videoInfo"]["videoCdnInfo"]:
                            if len(feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"]) > 2:
                                video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][2]["url"]
                            else:
                                video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["mpInfo"]["urlInfo"][0]["url"]
                        elif "ctnInfo" in feeds[i]["videoInfo"]["videoCdnInfo"]:
                            video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["ctnInfo"]["urlInfo"][0]["url"]
                        else:
                            video_url = feeds[i]["videoInfo"]["videoCdnInfo"]["urlInfo"][0]["url"]
                        video_id = feeds[i].get("videoId", "")
                        videoId = "{}kyk_plus".format(video_id)
                        playCount = int(feeds[i].get("playCount", 0))
                        shared_cnt = int(feeds[i].get("shared_cnt", 0))
                        video_dict = {
                            "video_title": video_title,
                            "video_id": videoId,
                            "play_cnt": feeds[i].get("playCount", 0),
                            "like_cnt": feeds[i].get("liked_cnt", 0),
                            "comment_cnt": feeds[i].get("comment_cnt", 0),
                            "share_cnt": feeds[i].get("shared_cnt", 0),
                            "duration": feeds[i].get("mediaDuration", 0),
                            "video_width": feeds[i].get("short_video_info", {}).get("width", 0),
                            "video_height": feeds[i].get("short_video_info", {}).get("height", 0),
                            "publish_time_stamp": publish_time_stamp,
                            "publish_time_str": publish_time_str,
                            "user_name": feeds[i].get("source", "").strip().replace("\n", ""),
                            "user_id": feeds[i].get("openid", ""),
                            "avatar_url": feeds[i].get("bizIcon", ""),
                            "cover_url": feeds[i].get("thumbUrl", ""),
                            "video_url": video_url,
                            "session": session,
                        }
                        # Current crawl time, written to the Feishu sheet alongside each video
                        current_time = datetime.now()
                        formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
                        for k, v in video_dict.items():
                            Common.logger(log_type, crawler).info(f"{k}:{v}")
                        Common.logging(log_type, crawler, env, f"video_dict:{video_dict}")
                        AliyunLogger.logging(
                            code="1000",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message=f"{video_dict}\n"
                        )
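                        # Skip videos whose share/play ratio is below 0.1; the decision is
                        # still written to the Feishu sheet for review.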
                        # Guard against playCount == 0 to avoid a ZeroDivisionError
                        video_percent = '%.2f' % (shared_cnt / playCount) if playCount else '0.00'
                        if float(video_percent) < 0.1:
                            Common.logger(log_type, crawler).info(f"分享/播放:{video_percent}\n")
                            Common.logging(log_type, crawler, env, f"分享/播放:{video_percent}\n")
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message=f"不符合抓取条件,分享/播放:{video_percent}\n"
                            )
                            values = [[
                                videoId,
                                video_title,
                                feeds[i].get("playCount", 0),
                                feeds[i].get("liked_cnt", 0),
                                feeds[i].get("comment_cnt", 0),
                                feeds[i].get("shared_cnt", 0),
                                feeds[i].get("mediaDuration", 0),
                                publish_time_str,
                                formatted_time,
                                feeds[i].get("thumbUrl", ""),
                                video_url,
                                f"channelid:{channelid},switchnewuser:{switchnewuser},sharesearchid:{sharesearchid},isFromUgc:{isFromUgc},switchprofile:{switchprofile},subscene:{subscene}",
                                "否",
                                f"不符合抓取条件,分享/播放:{video_percent}"
                            ]]
                            Feishu.insert_columns('kanyikan', 'kanyikan', "zS0vxs", "ROWS", 1, 2)
                            time.sleep(0.5)
                            Feishu.update_values('kanyikan', 'kanyikan', "zS0vxs", "A2:Z2", values)
                            continue
                        if video_dict["video_id"] == "" or video_dict["video_title"] == "" or video_dict["video_url"] == "":
                            Common.logger(log_type, crawler).info("无效视频\n")
                            Common.logging(log_type, crawler, env, "无效视频\n")
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message=f"无效视频"
                            )
                            values = [[
                                videoId,
                                video_title,
                                feeds[i].get("playCount", 0),
                                feeds[i].get("liked_cnt", 0),
                                feeds[i].get("comment_cnt", 0),
                                feeds[i].get("shared_cnt", 0),
                                feeds[i].get("mediaDuration", 0),
                                publish_time_str,
                                formatted_time,
                                feeds[i].get("thumbUrl", ""),
                                video_url,
                                f"channelid:{channelid},switchnewuser:{switchnewuser},sharesearchid:{sharesearchid},isFromUgc:{isFromUgc},switchprofile:{switchprofile},subscene:{subscene}",
                                "否",
                                f"无效视频"
                            ]]
                            Feishu.insert_columns('kanyikan', 'kanyikan', "zS0vxs", "ROWS", 1, 2)
                            time.sleep(0.5)
                            Feishu.update_values('kanyikan', 'kanyikan', "zS0vxs", "A2:Z2", values)
                        elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                            Common.logger(log_type, crawler).info("不满足抓取规则\n")
                            Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message='不满足抓取规则\n'
                            )
                            values = [[
                                videoId,
                                video_title,
                                feeds[i].get("playCount", 0),
                                feeds[i].get("liked_cnt", 0),
                                feeds[i].get("comment_cnt", 0),
                                feeds[i].get("shared_cnt", 0),
                                feeds[i].get("mediaDuration", 0),
                                publish_time_str,
                                formatted_time,
                                feeds[i].get("thumbUrl", ""),
                                video_url,
                                f"channelid:{channelid},switchnewuser:{switchnewuser},sharesearchid:{sharesearchid},isFromUgc:{isFromUgc},switchprofile:{switchprofile},subscene:{subscene}",
                                "否",
                                f"不满足抓取规则"
                            ]]
                            Feishu.insert_columns('kanyikan', 'kanyikan', "zS0vxs", "ROWS", 1, 2)
                            time.sleep(0.5)
                            Feishu.update_values('kanyikan', 'kanyikan', "zS0vxs", "A2:Z2", values)
                        elif any(str(word) in video_dict["video_title"]
                                 for word in get_config_from_mysql(log_type=log_type,
                                                                   source=crawler,
                                                                   env=env,
                                                                   text="filter",
                                                                   action="")):
                            Common.logger(log_type, crawler).info('已中过滤词\n')
                            Common.logging(log_type, crawler, env, '已中过滤词\n')
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message='已中过滤词\n'
                            )
                            values = [[
                                videoId,
                                video_title,
                                feeds[i].get("playCount", 0),
                                feeds[i].get("liked_cnt", 0),
                                feeds[i].get("comment_cnt", 0),
                                feeds[i].get("shared_cnt", 0),
                                feeds[i].get("mediaDuration", 0),
                                publish_time_str,
                                formatted_time,
                                feeds[i].get("thumbUrl", ""),
                                video_url,
                                f"channelid:{channelid},switchnewuser:{switchnewuser},sharesearchid:{sharesearchid},isFromUgc:{isFromUgc},switchprofile:{switchprofile},subscene:{subscene}",
                                "否",
                                f"已中过滤词"
                            ]]
                            Feishu.insert_columns('kanyikan', 'kanyikan', "zS0vxs", "ROWS", 1, 2)
                            time.sleep(0.5)
                            Feishu.update_values('kanyikan', 'kanyikan', "zS0vxs", "A2:Z2", values)
                        elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                            Common.logger(log_type, crawler).info('视频已下载\n')
                            Common.logging(log_type, crawler, env, '视频已下载\n')
                            AliyunLogger.logging(
                                code="2002",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message='视频已下载\n'
                            )
                            values = [[
                                videoId,
                                video_title,
                                feeds[i].get("playCount", 0),
                                feeds[i].get("liked_cnt", 0),
                                feeds[i].get("comment_cnt", 0),
                                feeds[i].get("shared_cnt", 0),
                                feeds[i].get("mediaDuration", 0),
                                publish_time_str,
                                formatted_time,
                                feeds[i].get("thumbUrl", ""),
                                video_url,
                                f"channelid:{channelid},switchnewuser:{switchnewuser},sharesearchid:{sharesearchid},isFromUgc:{isFromUgc},switchprofile:{switchprofile},subscene:{subscene}",
                                "否",
                                f"视频已下载"
                            ]]
                            Feishu.insert_columns('kanyikan', 'kanyikan', "zS0vxs", "ROWS", 1, 2)
                            time.sleep(0.5)
                            Feishu.update_values('kanyikan', 'kanyikan', "zS0vxs", "A2:Z2", values)
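                        # Passed every check: enrich the dict with ETL fields and hand it to MQ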
                        else:
                            video_dict["out_user_id"] = video_dict["user_id"]
                            video_dict["platform"] = crawler
                            video_dict["strategy"] = log_type
                            video_dict["strategy_type"] = "data"
                            video_dict["out_video_id"] = video_dict["video_id"]
                            video_dict["width"] = video_dict["video_width"]
                            video_dict["height"] = video_dict["video_height"]
                            video_dict["crawler_rule"] = json.dumps(rule_dict)
                            video_dict["user_id"] = our_uid
                            video_dict["publish_time"] = video_dict["publish_time_str"]
                            cls.insert_video_id(log_type, crawler, video_id, env)
                            AliyunLogger.logging(
                                code="1010",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message=f"看一看video_id:{video_id}入库",
                            )
                            values = [[
                                videoId,
                                video_title,
                                feeds[i].get("playCount", 0),
                                feeds[i].get("liked_cnt", 0),
                                feeds[i].get("comment_cnt", 0),
                                feeds[i].get("shared_cnt", 0),
                                feeds[i].get("mediaDuration", 0),
                                publish_time_str,
                                formatted_time,
                                feeds[i].get("thumbUrl", ""),
                                video_url,
                                f"channelid:{channelid},switchnewuser:{switchnewuser},sharesearchid:{sharesearchid},isFromUgc:{isFromUgc},switchprofile:{switchprofile},subscene:{subscene}",
                                "是",
                                ""
                            ]]
                            Feishu.insert_columns('kanyikan', 'kanyikan', "zS0vxs", "ROWS", 1, 2)
                            time.sleep(0.5)
                            Feishu.update_values('kanyikan', 'kanyikan', "zS0vxs", "A2:Z2", values)
                            mq.send_msg(video_dict)
                            time.sleep(random.randint(10, 15))
                    except Exception as e:
                        Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
                        Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
                        AliyunLogger.logging(
                            code="3000",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message=f"抓取单条视频异常:{e}\n"
                        )
        except Exception as e:
            Common.logger(log_type, crawler).error(f"抓取列表页时异常:{e}\n")
            Common.logging(log_type, crawler, env, f"抓取列表页时异常:{e}\n")
            AliyunLogger.logging(
                code="3000",
                platform=crawler,
                mode=log_type,
                env=env,
                message=f"抓取列表页时异常:{e}\n"
            )

if __name__ == "__main__":
    KanyikanRecommend.get_videoList(
        log_type="recommend",
        crawler="kanyikan",
        env="prod",
        rule_dict={'share_cnt': {'min': 300, 'max': 0}},
        our_uid=64080779
    )