hksp_author.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. import os
  2. import re
  3. import base64
  4. import json
  5. import random
  6. import sys
  7. import time
  8. import uuid
  9. import requests
  10. sys.path.append(os.getcwd())
  11. from common.video_item import VideoItem
  12. from common import PiaoQuanPipeline, AliyunLogger
  13. from common.mq import MQ
  14. from common.scheduling_db import MysqlHelper
  def tunnel_proxies():
      """Build the ``proxies`` mapping that sends requests through the tunnel proxy.

      Returns:
          dict: a mapping with the keys ``"http"`` and ``"https"``; both hold the
          same ``http://<user>:<password>@<host>:<port>/`` proxy URL, in the shape
          accepted by the ``proxies`` argument of ``requests``.
      """
      # NOTE(review): the proxy credentials are hard-coded in source; consider
      # moving them to configuration or environment variables.
      host_port = "q796.kdltps.com:15818"  # tunnel domain:port
      user = "t17772369458618"  # username/password authentication
      secret = "5zqcjkmy"
      proxy_url = f"http://{user}:{secret}@{host_port}/"
      # Plain-HTTP and HTTPS traffic are both routed through the same endpoint.
      return {scheme: proxy_url for scheme in ("http", "https")}
  class HaoKanVideoAccount(object):
      """Crawl the videos published by one Haokan (haokan.baidu.com) author account.

      The crawler pages through the author list endpoint, opens every listed
      video page, decrypts the embedded video metadata, runs it through
      ``PiaoQuanPipeline`` and pushes accepted videos to the ETL message queue.

      Attributes set in ``__init__``:
          account_id: last ``/``-separated segment of ``user_dict["link"]``.
          platform, mode, rule_dict, user_dict, env: stored constructor arguments.
          download_cnt: counter initialised to 0 (not updated inside this class).
          mq: ``MQ`` producer for the topic ``"topic_crawler_etl_" + env``.
          url_key: XOR key used by :meth:`decrypt`.
          expire_flag: becomes ``True`` once a video fails the pipeline's
              publish-time check; all crawling loops stop when it is set.
      """

      # Seconds to wait for an HTTP response. Without a timeout ``requests``
      # waits forever on a stalled proxy connection and the crawler hangs.
      REQUEST_TIMEOUT = 30

      def __init__(self, platform, mode, rule_dict, user_dict, env):
          """Store the crawl configuration and create the MQ producer.

          Args:
              platform: platform name; used for logging, trace ids and MQ items.
              mode: crawl strategy name; used for logging and MQ items.
              rule_dict: filter rules handed to ``PiaoQuanPipeline``.
              user_dict: account record; ``"link"``, ``"uid"`` and
                  ``"nick_name"`` are read by this class.
              env: environment name; suffix of the MQ topic and logged.
          """
          self.account_id = user_dict["link"].split("/")[-1]
          self.platform = platform
          self.mode = mode
          self.rule_dict = rule_dict
          self.user_dict = user_dict
          self.env = env
          self.download_cnt = 0
          self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
          self.url_key = "guanghui456"
          self.expire_flag = False

      def decrypt(self, ori_str):
          """Decode the encrypted video metadata string.

          The input is base64-decoded, interpreted as UTF-8 text, and every
          character is XOR-ed with the matching character of ``self.url_key``
          (the key repeats cyclically).

          Args:
              ori_str: base64 text (``str`` or ``bytes``).

          Returns:
              str: the decrypted text; ``""`` for empty input.
          """
          # base64 decode
          crypt_str = base64.b64decode(ori_str).decode("utf-8")
          # XOR decrypt; join once instead of growing a string with ``+=``
          key = self.url_key
          key_len = len(key)
          return "".join(
              chr(ord(char) ^ ord(key[pos % key_len]))
              for pos, char in enumerate(crypt_str)
          )

      def _crawl_pages(self, headers, base_params):
          """Page through the author list endpoint and process every page.

          Stops when ``expire_flag`` is set, when the endpoint reports
          ``has_more == 0``, or when the response carries no usable paging data.

          Args:
              headers: HTTP headers for the list request.
              base_params: query parameters; must contain a ``"ctime"`` entry
                  whose value is replaced by the paging cursor on every request
                  (its position in the query string is preserved).
          """
          url = "https://haokan.baidu.com/web/author/listall"
          cursor = ""  # empty cursor means "start from the newest video"
          while True:
              params = {**base_params, "ctime": cursor}
              response = requests.request(
                  "GET",
                  url,
                  headers=headers,
                  params=params,
                  proxies=tunnel_proxies(),
                  timeout=self.REQUEST_TIMEOUT,
              )
              result = response.json()
              # Process first: a failed request is logged there, before any
              # paging field of the payload is touched.
              self.process_video_list(result)
              if self.expire_flag:
                  return
              data = result.get("data") if isinstance(result, dict) else None
              if not isinstance(data, dict) or "ctime" not in data:
                  # No cursor to continue with; retrying with the old cursor
                  # would request the same page forever.
                  return
              if data.get("has_more", 0) == 0:
                  AliyunLogger.logging(
                      code="2000",
                      platform=self.platform,
                      mode=self.mode,
                      env=self.env,
                      message="没有更多视频了",
                      data=result,
                  )
                  return
              cursor = data["ctime"]
              # Random pause between pages.
              time.sleep(random.randint(5, 10))

      def get_video_list(self):
          """Crawl the account's video list, 10 entries per page.

          Returns ``None``; results are delivered through
          :meth:`process_video_list`.
          """
          page_limit = 10
          headers = {
              "Accept": "*/*",
              "Accept-Language": "en,zh-CN;q=0.9,zh;q=0.8",
              "Cache-Control": "no-cache",
              "Connection": "keep-alive",
              "Content-Type": "application/x-www-form-urlencoded",
              # The two adjacent literals concatenate into one header value.
              "Cookie": (
                  'BIDUPSID=504D4A3A8D0584CA8C3BE27ACFED5323; PSTM=1695297510; BAIDUID=504D4A3A8D0584CA7398158209FA507F:FG=1; BAIDUID_BFESS=504D4A3A8D0584CA7398158209FA507F:FG=1; H_WISE_SIDS=213352_214793_110085_244721_236312_265883_265985_269905_271172_270102_234295_234207_272282_263618_272473_260335_273141_273244_273397_273481_275098_275007_275853_276196_275170_271562_253022_275870_277354_251972_277631_277642_277635_277611_275732_276665_275209_277554_259642_278057_278166_278163_278300_274784_275167_278263_272560_278573_278575_277542_278790_278388_256739_278920_279021_279045_278237_279267_276573_279367_279385_278392_274947_276269_278946_279086_279610_279605_279680_276983_279877_279307_279695_279945_279703_279975_279998_278249_278213_280132_280209_277699_280161_280227_274286_280405_280368_278674_280485_280541_270366_278414_276929_275856_280614_256223_280488_280636_276438_280560_277759_279896_280768_280809_279850_280771_280107_280583; H_WISE_SIDS_BFESS=213352_214793_110085_244721_236312_265883_265985_269905_271172_270102_234295_234207_272282_263618_272473_260335_273141_273244_273397_273481_275098_275007_275853_276196_275170_271562_253022_275870_277354_251972_277631_277642_277635_277611_275732_276665_275209_277554_259642_278057_278166_278163_278300_274784_275167_278263_272560_278573_278575_277542_278790_278388_256739_278920_279021_279045_278237_279267_276573_279367_279385_278392_274947_276269_278946_279086_279610_279605_279680_276983_279877_279307_279695_279945_279703_279975_279998_278249_278213_280132_280209_277699_280161_280227_274286_280405_280368_278674_280485_280541_270366_278414_276929_275856_280614_256223_280488_280636_276438_280560_277759_279896_280768_280809_279850_280771_280107_280583; '
                  'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218b98064c148fe-08a95b8d70a5fe-17525634-1901520-18b98064c151479%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%2218b98064c148fe-08a95b8d70a5fe-17525634-1901520-18b98064c151479%22%7D; H_PS_PSSID=39624_39663_39684_39690_39676_39678_39713; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; hkpcSearch=%u674E%u6709%u8D22; BA_HECTOR=0h240k018l2ka0a000a00g0g1il6sn21q; ZFY=PnfJdvQrpMNOIoZw1LsOKMDW8:BWcrmDvtL60fANYxmQ:C; Hm_lvt_4aadd610dfd2f5972f1efee2653a2bc5=1699950106,1700038092; PC_TAB_LOG=video_details_page; COMMON_LID=c414c76cb1b11dfb04b7062a5ae09ce2; ab_sr=1.0.1_ZGEwNWY3OTFhZmNhNzZiN2YyZjljYjI3ZmE2YzRiZDA5NjY3YTI0OTdiMWM0ZWZhMjU3YWMzMThmZjU5YzQ5MDM0NDA4N2FmZWM5M2I0ZmQzZmE0ZWI2MjJkMjE0MDQ1ZTIyYmU5OGQ3OWY3ZDNlYTI2NmFiMGZlYTliODFmZmJiY2U0MDRmMzkwZWMzOTQxMTc0MDU3NmQ2ZWVlNjM3ZA==; reptileData=%7B%22data%22%3A%22636c55e0319da5169a60acec4a264a35c10862f8abfe2f2cc32c55eb6b0ab4de0efdfa115ea522d6d4d361dea07feae27710e59370b70671d347daddec662182e4c514cbcba8dae1736cf735f29e19b9a42be523d0836d04fa3b15dd57de52f4%22%2C%22key_id%22%3A%2230%22%2C%22sign%22%3A%22551bc5f4%22%7D; Hm_lpvt_4aadd610dfd2f5972f1efee2653a2bc5=1700038278; RT="z=1&dm=baidu.com&si=bd56d547-8a2c-433d-9eee-70f64ccf646a&ss=lozishrc&sl=9&tt=c2s&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=6z26"'
              ),
              "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
          }
          base_params = {
              "app_id": self.account_id,  # account id
              "ctime": "",  # paging cursor (a timestamp); empty starts from the beginning
              "rn": page_limit,  # number of entries per page
              "_api": 1,
          }
          self._crawl_pages(headers, base_params)

      def get_tiny_video_list(self):
          """Crawl the account's ``haokan|tabhubVideo`` list, 20 entries per page.

          Returns ``None``; results are delivered through
          :meth:`process_video_list`.
          """
          headers = {
              "Accept": "*/*",
              "Accept-Language": "en,zh-CN;q=0.9,zh;q=0.8",
              "Cache-Control": "no-cache",
              "Connection": "keep-alive",
              "Content-Type": "application/x-www-form-urlencoded",
              # The two adjacent literals concatenate into one header value.
              "Cookie": (
                  'BIDUPSID=504D4A3A8D0584CA8C3BE27ACFED5323; PSTM=1695297510; BAIDUID=504D4A3A8D0584CA7398158209FA507F:FG=1; BAIDUID_BFESS=504D4A3A8D0584CA7398158209FA507F:FG=1; H_WISE_SIDS=213352_214793_110085_244721_236312_265883_265985_269905_271172_270102_234295_234207_272282_263618_272473_260335_273141_273244_273397_273481_275098_275007_275853_276196_275170_271562_253022_275870_277354_251972_277631_277642_277635_277611_275732_276665_275209_277554_259642_278057_278166_278163_278300_274784_275167_278263_272560_278573_278575_277542_278790_278388_256739_278920_279021_279045_278237_279267_276573_279367_279385_278392_274947_276269_278946_279086_279610_279605_279680_276983_279877_279307_279695_279945_279703_279975_279998_278249_278213_280132_280209_277699_280161_280227_274286_280405_280368_278674_280485_280541_270366_278414_276929_275856_280614_256223_280488_280636_276438_280560_277759_279896_280768_280809_279850_280771_280107_280583; H_WISE_SIDS_BFESS=213352_214793_110085_244721_236312_265883_265985_269905_271172_270102_234295_234207_272282_263618_272473_260335_273141_273244_273397_273481_275098_275007_275853_276196_275170_271562_253022_275870_277354_251972_277631_277642_277635_277611_275732_276665_275209_277554_259642_278057_278166_278163_278300_274784_275167_278263_272560_278573_278575_277542_278790_278388_256739_278920_279021_279045_278237_279267_276573_279367_279385_278392_274947_276269_278946_279086_279610_279605_279680_276983_279877_279307_279695_279945_279703_279975_279998_278249_278213_280132_280209_277699_280161_280227_274286_280405_280368_278674_280485_280541_270366_278414_276929_275856_280614_256223_280488_280636_276438_280560_277759_279896_280768_280809_279850_280771_280107_280583; '
                  'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218b98064c148fe-08a95b8d70a5fe-17525634-1901520-18b98064c151479%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%2218b98064c148fe-08a95b8d70a5fe-17525634-1901520-18b98064c151479%22%7D; H_PS_PSSID=39624_39663_39684_39690_39676_39678_39713; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; hkpcSearch=%u674E%u6709%u8D22; BA_HECTOR=0h240k018l2ka0a000a00g0g1il6sn21q; ZFY=PnfJdvQrpMNOIoZw1LsOKMDW8:BWcrmDvtL60fANYxmQ:C; Hm_lvt_4aadd610dfd2f5972f1efee2653a2bc5=1699950106,1700038092; PC_TAB_LOG=video_details_page; COMMON_LID=c414c76cb1b11dfb04b7062a5ae09ce2; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; ab_sr=1.0.1_ODM2M2M4NzY2MWE3Yjg0MGQ4NDk2YTQ4ZTRlMWVlYjdiY2JmMzQ5ZjU1MjAxODUxZTQ0NTg4YjJjNzNmYTU2MzZiMjI2Y2EwZTU2OGIwYjdmMDc3NmRhMGJjODZkZmIyN2U0YWFjMjUzZWEwNTRlZWQ1N2U0MzkxY2YwMzk4Zjk1N2NiYWM1OGZlN2M0NWU4ZWJiZjFmNWE5YWU3YjFmMA==; reptileData=%7B%22data%22%3A%22636c55e0319da5169a60acec4a264a35c10862f8abfe2f2cc32c55eb6b0ab4de0efdfa115ea522d6d4d361dea07feae27710e59370b70671d347daddec6621825f6963cc8f86c5fee1d0664c82edf0ae5c838cf8bde5188e6a7757f1998a7c48%22%2C%22key_id%22%3A%2230%22%2C%22sign%22%3A%22f302bd06%22%7D; Hm_lpvt_4aadd610dfd2f5972f1efee2653a2bc5=1700046227; RT="z=1&dm=baidu.com&si=bd56d547-8a2c-433d-9eee-70f64ccf646a&ss=loznmnju&sl=1&tt=1w3&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=2oq"'
              ),
              "Referer": "https://haokan.baidu.com/author/1702511677416581",
              "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
          }
          base_params = {
              "app_id": self.account_id,
              "ctime": "",  # paging cursor, replaced on every request
              "video_type": "haokan|tabhubVideo",
              "rn": 20,
          }
          self._crawl_pages(headers, base_params)

      def process_video_list(self, result):
          """Handle one page of the author list response.

          Every entry of ``result["data"]["results"]`` is passed to
          :meth:`get_video_info`. A failure on a single video is logged and
          does not stop the page. Iteration ends early once ``expire_flag``
          is set.

          Args:
              result: decoded JSON body of the list request. Anything whose
                  ``"errmsg"`` is not ``"成功"`` (including a payload without
                  that key) is logged as a failed request.
          """
          if not isinstance(result, dict) or result.get("errmsg") != "成功":
              AliyunLogger.logging(
                  code="3000",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message="请求视频链接列表失败,需要 review 代码验证",
              )
              return
          for video_obj in result["data"]["results"]:
              if self.expire_flag:
                  return
              try:
                  self.get_video_info(video_obj)
              except Exception as e:
                  # Boundary for per-video failures: log and go on with the
                  # next video instead of aborting the whole account.
                  AliyunLogger.logging(
                      code="3000",
                      platform=self.platform,
                      mode=self.mode,
                      env=self.env,
                      message="抓取单条视频异常,报错是:{}".format(e),
                  )

      def get_video_info(self, video_obj):
          """Fetch one video page, decode its metadata and forward it to ETL.

          Steps: download the video page, read the JSON assigned to
          ``window.__PRELOADED_STATE__``, decrypt its ``"encrptedVideoMeta"``
          entry, build a ``VideoItem`` and run it through ``PiaoQuanPipeline``.
          If the publish-time check fails, ``expire_flag`` is set and nothing
          is sent. Otherwise the item is sent to MQ when the pipeline accepts
          it and it has more than 300 plays with a like/play ratio of at
          least 0.02.

          Args:
              video_obj: one list entry; ``video_obj["content"]["vid"]`` and
                  ``video_obj["content"]["cover_src"]`` are read.

          Raises:
              ValueError: the page does not contain ``window.__PRELOADED_STATE__``.
              Exception: network, JSON and key errors propagate to the caller.
          """
          url = "https://haokan.baidu.com/v?vid={}&collection_id=".format(
              video_obj["content"]["vid"]
          )
          header = {
              "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
          }
          response_text = requests.get(
              url,
              headers=header,
              proxies=tunnel_proxies(),
              timeout=self.REQUEST_TIMEOUT,
          ).text
          marker = re.search(r"window\.__PRELOADED_STATE__\s*=\s*", response_text)
          if marker is None:
              raise ValueError(
                  "window.__PRELOADED_STATE__ not found in video page: {}".format(url)
              )
          # Decode exactly one JSON value starting right after the "=". Cutting
          # the text at the first ";" would truncate the document whenever a
          # string value (a title, a URL) contains a semicolon.
          encrypted_data, _ = json.JSONDecoder().raw_decode(response_text, marker.end())
          encrypted_video_obj = encrypted_data["encrptedVideoMeta"]
          d_obj = json.loads(self.decrypt(encrypted_video_obj))
          AliyunLogger.logging(
              code="1001",
              platform=self.platform,
              mode=self.mode,
              env=self.env,
              data=d_obj,
              message="扫描到一条视频",
          )
          trace_id = self.platform + str(uuid.uuid1())
          item = VideoItem()
          item.add_video_info("video_id", d_obj["id"])
          item.add_video_info("video_title", d_obj["title"])
          item.add_video_info("play_cnt", d_obj["playcnt"])
          item.add_video_info("publish_time_stamp", d_obj["publish_time"])
          item.add_video_info("duration", d_obj["duration"])
          item.add_video_info("video_url", d_obj["playurl"])
          item.add_video_info("like_cnt", d_obj["like"])
          item.add_video_info("comment_cnt", d_obj["comment"])
          item.add_video_info("cover_url", video_obj["content"]["cover_src"])
          item.add_video_info("user_id", self.user_dict['uid'])
          item.add_video_info("user_name", self.user_dict['nick_name'])
          item.add_video_info("out_video_id", d_obj["id"])
          item.add_video_info("platform", self.platform)
          item.add_video_info("strategy", self.mode)
          item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
          # Message that will be sent to MQ.
          mq_obj = item.produce_item()
          # Pipeline applying the filter rules.
          pipeline = PiaoQuanPipeline(
              platform=self.platform,
              mode=self.mode,
              rule_dict=self.rule_dict,
              env=self.env,
              item=mq_obj,
              trace_id=trace_id,
          )
          if not pipeline.publish_time_flag():
              # Publish-time check failed: tell the paging loops to stop.
              self.expire_flag = True
              return
          if not pipeline.process_item():
              return
          # One extra rule on top of the pipeline. The ratio is only evaluated
          # when play_cnt > 300, so the division never sees a zero denominator.
          play_cnt = mq_obj['play_cnt']
          like_cnt = mq_obj['like_cnt']
          if int(play_cnt) > 300 and float(like_cnt) / float(play_cnt) >= 0.02:
              self.mq.send_msg(mq_obj)
              AliyunLogger.logging(
                  code="1002",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message="成功发送至 ETL",
                  data=mq_obj,
                  trace_id=trace_id,
              )
          else:
              AliyunLogger.logging(
                  code="2008",
                  platform=self.platform,
                  mode=self.mode,
                  env=self.env,
                  message="不满足特殊规则",
                  data=mq_obj,
              )

      def schedule(self):
          """Run the crawl selected by the prefix of ``user_dict["link"]``.

          The text before the first ``_`` in the link selects the mode:

          small: crawl short videos only
          big: crawl regular videos only
          both: crawl short videos, rest 5 minutes, then crawl regular videos

          Any other prefix does nothing.
          """
          flag = self.user_dict['link'].split("_")[0]
          if flag == "big":
              self.get_video_list()
          elif flag == "small":
              self.get_tiny_video_list()
          elif flag == "both":
              self.get_tiny_video_list()
              time.sleep(300)
              self.get_video_list()
  if __name__ == "__main__":
      # Manual check: load the haokanshipin accounts from crawler_user_v3
      # and print the first record.
      select_user_sql = """select * from crawler_user_v3 where source = 'haokanshipin';"""
      user_list = MysqlHelper.get_values("author", "haokanshipin", select_user_sql, "prod", action="")
      if user_list:
          print(json.dumps(user_list[0], ensure_ascii=False, indent=4))
      else:
          # Indexing an empty result would raise IndexError; report it instead.
          print("no haokanshipin users found in crawler_user_v3")
      # Example of a manual crawl. "link" must be a string because __init__
      # splits it on "/" to obtain the account id:
      # T = HaoKanVideoAccount(
      #     platform="haokanshipin",
      #     mode="author",
      #     rule_dict={},
      #     user_dict={"link": "1657075178605219"},
      #     env="prod",
      # )
      # T.get_tiny_video_list()
  253. # T.get_tiny_video_list()