# gzh_bug_review.py

import json
import time
import requests
import urllib3
from common.public import task_fun_mq, get_consumer, ack_message
from common.scheduling_db import MysqlHelper
from common.common import Common
import asyncio
from gongzhonghao.gongzhonghao_author.gongzhonghao_author import GongzhonghaoAuthor

token_d = {
    "token": "883406306",
    "cookie": "appmsglist_action_3524986952=card; ua_id=j6t2xNuC0mv6dLVbAAAAAMPRLKj1sVGSlMDwNFJKE3s=; wxuin=93278011749821; mm_lang=zh_CN; pgv_pvid=6815195556; noticeLoginFlag=1; remember_acct=2071735594%40qq.com; rewardsn=; wxtokenkey=777; _clck=3930572231|1|ff1|0; uuid=680bd7f128bf80058bc62dd82ff85c96; rand_info=CAESIBtaIUDyVXWwBRD33d7CafRp3rV5rXK7mcvYCy4Yvnn+; slave_bizuin=3236647229; data_bizuin=3236647229; bizuin=3236647229; data_ticket=Dx0Yxt5o9JJuMyndtyu3+JZBym0Dcjy6QqjPcfp+xwsLHf3Y+L9ZmP+kDX6o4t9r; slave_sid=WjV0MXhZZXlrcG9BTGVOZjBEOUlyUFptMWEyN2JNcXlpeU5kcGIyVm9IZUZOV3J1RElKb29KTDJIRHRYaGZtNnVSbklua1FOdUNsX3NoQWE4RFVKM0lKbDkzU25wblRGTDhDWFJteExtMHBjZGwyanZKOVVCWmE1UmNxT3FaZWNsd0VrVm52eEpLakFocGVz; slave_user=gh_d284c09295eb; xid=675798a4e148cb559bed6bb65681ebf9; _clsk=1a6iklq|1694746372692|2|1|mp.weixin.qq.com/weheat-agent/payload/record",
}
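# token_d above carries the logged-in mp.weixin.qq.com session credentials (token + cookie)
# used by both request helpers below; per the commented-out production alerts further down,
# they expire and have to be refreshed by re-scanning the QR code in the 公众号 backend.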
def get_user_info(token_dict):
    """Look up the target 公众号 via the mp.weixin.qq.com searchbiz API and return its basic profile."""
    url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"
    headers = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
        "t=media/appmsg_edit_v2&action=edit&isNew=1"
        "&type=77&createType=5&token=1011071554&lang=zh_CN",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
        "cookie": token_dict["cookie"],
    }
    params = {
        "action": "search_biz",
        "begin": "0",
        "count": "5",
        "query": "生活小妙招小助手",
        "token": token_dict["token"],
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1",
    }
    # proxies = Common.tunnel_proxies()
    # print(proxies)
    urllib3.disable_warnings()
    r = requests.get(url=url, headers=headers, params=params, verify=False)
    r.close()
    print(r.json())
    # if r.json()["base_resp"]["err_msg"] == "invalid session":
    #     Common.logger(log_type, crawler).warning(
    #         f"status_code:{r.status_code}, get_fakeid:{r.text}\n"
    #     )
    #     # Common.logging(
    #     #     log_type,
    #     #     crawler,
    #     #     env,
    #     #     f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
    #     # )
    #     cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
    #     if 20 >= datetime.datetime.now().hour >= 10:
    #         Feishu.bot(
    #             log_type,
    #             crawler,
    #             f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n过期啦,请扫码更换token\nhttps://mp.weixin.qq.com/",
    #         )
    #     time.sleep(60 * 15)
    #     continue
    # if r.json()["base_resp"]["err_msg"] == "freq control":
    #     Common.logger(log_type, crawler).warning(
    #         f"status_code:{r.status_code}, get_fakeid:{r.text}\n"
    #     )
    #     # Common.logging(
    #     #     log_type,
    #     #     crawler,
    #     #     env,
    #     #     f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
    #     # )
    #     cls.release_token(log_type, crawler, env, token_dict["token_id"], -2)
    #     if 20 >= datetime.datetime.now().hour >= 10:
    #         Feishu.bot(
    #             log_type,
    #             crawler,
    #             f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/",
    #         )
    #     time.sleep(60 * 15)
    #     continue
    # if r.json()["base_resp"]["err_msg"] == "ok" and len(r.json()["list"]) == 0:
    #     Common.logger(log_type, crawler).warning(
    #         f"status_code:{r.status_code}, get_fakeid:{r.text}\n"
    #     )
    #     # Common.logging(
    #     #     log_type,
    #     #     crawler,
    #     #     env,
    #     #     f"status_code:{r.status_code}, get_fakeid:{r.text}\n",
    #     # )
    #     unbind_msg = task_unbind(
    #         log_type=log_type,
    #         crawler=crawler,
    #         taskid=task_dict["id"],
    #         uids=str(user_dict["uid"]),
    #         env=env,
    #     )
    #     if unbind_msg == "success":
    #         if 20 >= datetime.datetime.now().hour >= 10:
    #             Feishu.bot(
    #                 log_type,
    #                 crawler,
    #                 f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n",
    #             )
    #         # Common.logging(
    #         #     log_type,
    #         #     crawler,
    #         #     env,
    #         #     f"公众号:{user_dict['link']}, 站内昵称:{user_dict['nick_name']}\n抓取异常, 已取消抓取该公众号\n",
    #         # )
    #     else:
    #         Common.logger(log_type, crawler).warning(f"unbind_msg:{unbind_msg}")
    #         # Common.logging(log_type, crawler, env, f"unbind_msg:{unbind_msg}")
    #     return None
    user_info_dict = {
        "user_name": r.json()["list"][0]["nickname"],
        "user_id": r.json()["list"][0]["fakeid"],
        "avatar_url": r.json()["list"][0]["round_head_img"],
    }
    return user_info_dict
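# get_user_info() returns the keys pulled from the first searchbiz hit:
#   {"user_name": <nickname>, "user_id": <fakeid>, "avatar_url": <round_head_img>}
# This stripped-down version indexes r.json()["list"][0] directly; the commented-out
# production branch above is what handled empty "list" / invalid-session responses.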
def get_videoList(token_dict, user_dict):
    """Fetch one page of the account's article list via the appmsg list_ex API and print each entry as a video_dict."""
    begin = 0
    url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"
    headers = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?"
        "t=media/appmsg_edit_v2&action=edit&isNew=1"
        "&type=77&createType=5&token="
        + str(token_dict["token"])
        + "&lang=zh_CN",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
        "cookie": token_dict["cookie"],
    }
    params = {
        "action": "list_ex",
        "begin": str(begin),
        "count": "5",
        "fakeid": user_dict["user_id"],
        "type": "9",
        "query": "",
        "token": str(token_dict["token"]),
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1",
    }
    urllib3.disable_warnings()
    r = requests.get(url=url, headers=headers, params=params, verify=False)
    print(r.url)
    r.close()
    print(r.json())
    if r.json()["base_resp"]["err_msg"] == "invalid session":
        time.sleep(60 * 15)
        print("invalid session")
    if r.json()["base_resp"]["err_msg"] == "freq control":
        print("freq control")
    if (
        r.json()["base_resp"]["err_msg"] == "invalid args"
        and r.json()["base_resp"]["ret"] == 200002
    ):
        print("invalid args")
    if "app_msg_list" not in r.json():
        print("no app_msg_list")
        return  # without the key, the accesses below would raise a KeyError
    if len(r.json()["app_msg_list"]) == 0:
        print("没有更多视频了\n")
        return
    else:
        begin += 5
    app_msg_list = r.json()["app_msg_list"]
    for article in app_msg_list:
        # try:
        create_time = article.get("create_time", 0)
        publish_time_stamp = int(create_time)
        publish_time_str = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
        )
        article_url = article.get("link", "")
        video_dict = {
            "video_id": article.get("aid", ""),
            "video_title": article.get("title", "")
            .replace(" ", "")
            .replace('"', "")
            .replace("'", ""),
            "publish_time_stamp": publish_time_stamp,
            "publish_time_str": publish_time_str,
            "user_name": user_dict["user_name"],
            "play_cnt": 0,
            "comment_cnt": 0,
            "like_cnt": 0,
            "share_cnt": 0,
            "user_id": user_dict["user_id"],
            "avatar_url": user_dict["avatar_url"],
            "cover_url": article.get("cover", ""),
            "article_url": article.get("link", ""),
            # "video_url": cls.get_video_url(article_url, env),
            "video_url": "url",
            "session": f"gongzhonghao-author1-{int(time.time())}",
        }
        print(video_dict)
        # for k, v in video_dict.items():
        #     Common.logger(log_type, crawler).info(f"{k}:{v}")
        # Common.logging(
        #     log_type, crawler, env, f"video_dict:{video_dict}"
        # )
        # if int(time.time()) - publish_time_stamp > 3600 * 24 * int(
        #     rule_dict.get("period", {}).get("max", 1000)
        # ):
        #     Common.logger(log_type, crawler).info(
        #         f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n"
        #     )
        #     # Common.logging(
        #     #     log_type,
        #     #     crawler,
        #     #     env,
        #     #     f"发布时间超过{int(rule_dict.get('period', {}).get('max', 1000))}天\n",
        #     # )
        #     return
        #
        # if (
        #     video_dict["article_url"] == 0
        #     or video_dict["video_url"] == 0
        # ):
        #     Common.logger(log_type, crawler).info("文章涉嫌违反相关法律法规和政策\n")
        #     # Common.logging(log_type, crawler, env, "文章涉嫌违反相关法律法规和政策\n")
        # # title sensitive-word filter
        # elif (
        #     any(
        #         str(word)
        #         if str(word) in video_dict["video_title"]
        #         else False
        #         for word in get_config_from_mysql(
        #             log_type=log_type,
        #             source=crawler,
        #             env=env,
        #             text="filter",
        #             action="",
        #         )
        #     )
        #     is True
        # ):
        #     Common.logger(log_type, crawler).info("标题已中过滤词\n")
        #     # Common.logging(log_type, crawler, env, "标题已中过滤词\n")
        # # already-downloaded check
        # elif (
        #     cls.repeat_video(
        #         log_type, crawler, video_dict["video_id"], env
        #     )
        #     != 0
        # ):
        #     Common.logger(log_type, crawler).info("视频已下载\n")
        #     # Common.logging(log_type, crawler, env, "视频已下载\n")
        # # title similarity check
        # elif (
        #     title_like(
        #         log_type,
        #         crawler,
        #         video_dict["video_title"],
        #         cls.platform,
        #         env,
        #     )
        #     is True
        # ):
        #     Common.logger(log_type, crawler).info(
        #         f'标题相似度>=80%:{video_dict["video_title"]}\n'
        #     )
        #     # Common.logging(
        #     #     log_type,
        #     #     crawler,
        #     #     env,
        #     #     f'标题相似度>=80%:{video_dict["video_title"]}\n',
        #     # )
        # else:
        #     video_dict["out_user_id"] = video_dict["user_id"]
        #     video_dict["platform"] = crawler
        #     video_dict["strategy"] = log_type
        #     video_dict["out_video_id"] = video_dict["video_id"]
        #     video_dict["width"] = 0
        #     video_dict["height"] = 0
        #     video_dict["crawler_rule"] = json.dumps(rule_dict)
        #     video_dict["user_id"] = user_dict["uid"]
        #     video_dict["publish_time"] = video_dict["publish_time_str"]
        #     mq.send_msg(video_dict)
        # except Exception as e:
        #     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
        #     Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
    # Common.logger(log_type, crawler).info("休眠 60 秒\n")
    # Common.logging(log_type, crawler, env, "休眠 60 秒\n")
    time.sleep(60)
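# Pagination note: the `begin += 5` inside get_videoList appears to be a leftover from the
# production crawler; with no surrounding loop this debug helper only ever fetches the first
# page. A minimal sketch of how paging could be driven (an illustration, not the original code):
#
#     while True:
#         params["begin"] = str(begin)
#         r = requests.get(url=url, headers=headers, params=params, verify=False)
#         if not r.json().get("app_msg_list"):
#             break
#         begin += 5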
# split a list into fixed-size chunks
def chunks(data_list, chunk_size):
    """
    :param data_list: the source list
    :param chunk_size: length of each sub-list
    :return: a generator of sub-lists, i.e. [[], [], [], ...]
    """
    for i in range(0, len(data_list), chunk_size):
        yield data_list[i: i + chunk_size]
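# quick usage example:
#   list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]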
async def get_author_videos(args):
    await asyncio.sleep(1)
    print(args['log_type'])
    await GongzhonghaoAuthor.get_all_videos(
        log_type=args['log_type'],
        crawler=args['crawler'],
        task_dict=args['task_dict'],
        token_index=args['token_index'],
        rule_dict=args['rule_dict'],
        user_list=args['user_list'],
        env=args['env']
    )
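# get_author_videos() wraps GongzhonghaoAuthor.get_all_videos so that one coroutine per
# user-list chunk / token can be scheduled together; the commented-out asyncio.gather
# driver near the end of __main__ shows the intended fan-out.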
if __name__ == "__main__":
    mess = {
        "createTime": 1684500378438,
        "id": 27,
        "interval": 86400,
        "machine": "aliyun",
        "mode": "author",
        "operator": "xxl",
        "rule": "[{\"duration\":{\"min\":20,\"max\":2700}},{\"period\":{\"min\":1,\"max\":2}}]",
        "source": "gongzhonghao",
        "spiderName": "run_gzh2_author",
        "startTime": 1693493854438,
        "status": 0,
        "taskName": "公众号_2",
        "updateTime": 1688572800179
    }

    # parse task_dict: flatten the JSON rule list into a single dict
    rule_list = json.loads(mess['rule'])
    rule_dict = {}
    for item in rule_list:
        for key, val in item.items():
            rule_dict[key] = val
    mess['rule'] = rule_dict
    task_dict = mess

    # parse user_list: load all users bound to this task from MySQL
    task_id = task_dict["id"]
    select_user_sql = (
        f"""select * from crawler_user_v3 where task_id={task_id}"""
    )
    user_list = MysqlHelper.get_values(
        "author", "gongzhonghao", select_user_sql, "prod", action=""
    )
    print(len(user_list))
    user_list = chunks(user_list, 250)
    print(user_list)  # chunks() returns a generator, so this prints the generator object
    for index, i in enumerate(user_list):
        with open("/Users/luojunhui/cyber/gzh_spider/test_AB/200/user_list_{}.json".format(index + 1), "w", encoding="utf-8") as f:
            f.write(json.dumps(i, ensure_ascii=False, indent=4))
    # print(user_list)

    # loop = asyncio.get_event_loop()
    # arg_list = []
    # for index, sub_list in enumerate(user_list):
    #     arg = {'log_type': "author{}".format(index + 1), 'crawler': "gongzhonghao", 'token_index': index + 1,
    #            'task_dict': task_dict, 'rule_dict': rule_dict, 'user_list': sub_list, 'env': 'prod'}
    #     arg_list.append(arg)
    #
    # coroutines_list = [get_author_videos(arg) for arg in arg_list]
    #
    #
    # async def test():
    #     await asyncio.gather(*coroutines_list)
    # asyncio.run(test())

    user_d = get_user_info(token_d)
    # print(user_d)
    # # #
    # get_videoList(token_d, user_d)