weixin_search.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. """
  2. @author: luojunhui
  3. 微信 search
  4. """
  5. import os
  6. import sys
  7. import json
  8. import time
  9. import requests
  10. sys.path.append(os.getcwd())
  11. from application.items import VideoItem
  12. from application.common.messageQueue import MQ
  13. from application.common.log import AliyunLogger
  14. ETL_MQ = MQ(topic_name="topic_crawler_etl_prod")
  15. aliyun_logger = AliyunLogger(platform="weixin_search", mode="search")
  16. async def weixin_search(params):
  17. """
  18. 通过搜索爬虫 + search_keys 来获取视频信息,并且以 MQ 的方式发送给 ETL, 正常上传发布
  19. 只抓一页,不做去重
  20. :param params: []
  21. :return:
  22. """
  23. gh_id_dict = {
  24. "gh_01f8afd03366": {
  25. "uid": 69637520,
  26. "nick_name": "非亲非故"
  27. },
  28. "gh_058e41145a0c": {
  29. "uid": 69637476,
  30. "nick_name": "甜腻梦话"
  31. },
  32. "gh_084a485e859a": {
  33. "uid": 69637472,
  34. "nick_name": "梦星月"
  35. },
  36. "gh_0921c03402cd": {
  37. "uid": 69637531,
  38. "nick_name": "你的女友"
  39. },
  40. "gh_0c89e11f8bf3": {
  41. "uid": 69637508,
  42. "nick_name": "粟米"
  43. },
  44. "gh_171cec079b2a": {
  45. "uid": 69637501,
  46. "nick_name": "海上"
  47. },
  48. "gh_183d80deffb8": {
  49. "uid": 69637491,
  50. "nick_name": "论趣"
  51. },
  52. "gh_1ee2e1b39ccf": {
  53. "uid": 69637473,
  54. "nick_name": "纵有疾风起"
  55. },
  56. "gh_234ef02cdee5": {
  57. "uid": 69637513,
  58. "nick_name": "夹逼"
  59. },
  60. "gh_26a307578776": {
  61. "uid": 69637490,
  62. "nick_name": "最宝贝的宝贝"
  63. },
  64. "gh_29074b51f2b7": {
  65. "uid": 69637530,
  66. "nick_name": "沉舸"
  67. },
  68. "gh_2b8c6aa035ae": {
  69. "uid": 69637470,
  70. "nick_name": "懶得取名"
  71. },
  72. "gh_34318194fd0e": {
  73. "uid": 69637517,
  74. "nick_name": "徒四壁"
  75. },
  76. "gh_3845af6945d0": {
  77. "uid": 69637545,
  78. "nick_name": "秋水娉婷"
  79. },
  80. "gh_3ac6d7208961": {
  81. "uid": 69637497,
  82. "nick_name": "小熊的少女梦"
  83. },
  84. "gh_3c7d38636846": {
  85. "uid": 69637519,
  86. "nick_name": "油腻腻"
  87. },
  88. "gh_3df10391639c": {
  89. "uid": 69637541,
  90. "nick_name": "六郎娇面"
  91. },
  92. "gh_40a0ad154478": {
  93. "uid": 69637516,
  94. "nick_name": "禁止"
  95. },
  96. "gh_424c8eeabced": {
  97. "uid": 69637522,
  98. "nick_name": "认命"
  99. },
  100. "gh_4568b5a7e2fe": {
  101. "uid": 69637482,
  102. "nick_name": "香腮"
  103. },
  104. "gh_45beb952dc74": {
  105. "uid": 69637488,
  106. "nick_name": "毋庸"
  107. },
  108. "gh_484de412b0ef": {
  109. "uid": 69637481,
  110. "nick_name": "婪"
  111. },
  112. "gh_4c058673c07e": {
  113. "uid": 69637474,
  114. "nick_name": "影帝"
  115. },
  116. "gh_538f78f9d3aa": {
  117. "uid": 69637478,
  118. "nick_name": "伤痕"
  119. },
  120. "gh_56a6765df869": {
  121. "uid": 69637514,
  122. "nick_name": "风月"
  123. },
  124. "gh_56ca3dae948c": {
  125. "uid": 69637538,
  126. "nick_name": "留下太多回忆"
  127. },
  128. "gh_5e543853d8f0": {
  129. "uid": 69637543,
  130. "nick_name": "不知春秋"
  131. },
  132. "gh_5ff48e9fb9ef": {
  133. "uid": 69637494,
  134. "nick_name": "寻她找他"
  135. },
  136. "gh_671f460c856c": {
  137. "uid": 69637523,
  138. "nick_name": "绝不改悔"
  139. },
  140. "gh_6b7c2a257263": {
  141. "uid": 69637528,
  142. "nick_name": "奶牙"
  143. },
  144. "gh_6d205db62f04": {
  145. "uid": 69637509,
  146. "nick_name": "怕羞"
  147. },
  148. "gh_6d9f36e3a7be": {
  149. "uid": 69637498,
  150. "nick_name": "望长安"
  151. },
  152. "gh_73be0287bb94": {
  153. "uid": 69637537,
  154. "nick_name": "戏剧"
  155. },
  156. "gh_744cb16f6e16": {
  157. "uid": 69637505,
  158. "nick_name": "反駁"
  159. },
  160. "gh_7b4a5f86d68c": {
  161. "uid": 69637477,
  162. "nick_name": "我很想你"
  163. },
  164. "gh_7bca1c99aea0": {
  165. "uid": 69637511,
  166. "nick_name": "从小就很傲"
  167. },
  168. "gh_7e5818b2dd83": {
  169. "uid": 69637532,
  170. "nick_name": "二八佳人"
  171. },
  172. "gh_89ef4798d3ea": {
  173. "uid": 69637533,
  174. "nick_name": "彼岸花"
  175. },
  176. "gh_901b0d722749": {
  177. "uid": 69637518,
  178. "nick_name": "深情不为我"
  179. },
  180. "gh_9161517e5676": {
  181. "uid": 69637495,
  182. "nick_name": "折磨"
  183. },
  184. "gh_93e00e187787": {
  185. "uid": 69637504,
  186. "nick_name": "理会"
  187. },
  188. "gh_9877c8541764": {
  189. "uid": 69637506,
  190. "nick_name": "我沿着悲伤"
  191. },
  192. "gh_9cf3b7ff486b": {
  193. "uid": 69637492,
  194. "nick_name": "hoit"
  195. },
  196. "gh_9e559b3b94ca": {
  197. "uid": 69637471,
  198. "nick_name": "我与你相遇"
  199. },
  200. "gh_9f8dc5b0c74e": {
  201. "uid": 69637496,
  202. "nick_name": "港口"
  203. },
  204. "gh_a182cfc94dad": {
  205. "uid": 69637539,
  206. "nick_name": "四海八荒"
  207. },
  208. "gh_a2901d34f75b": {
  209. "uid": 69637535,
  210. "nick_name": "听腻了谎话"
  211. },
  212. "gh_a307072c04b9": {
  213. "uid": 69637521,
  214. "nick_name": "踏步"
  215. },
  216. "gh_a6351b447819": {
  217. "uid": 69637540,
  218. "nick_name": "七猫酒馆"
  219. },
  220. "gh_ac43e43b253b": {
  221. "uid": 69637499,
  222. "nick_name": "一厢情愿"
  223. },
  224. "gh_adca24a8f429": {
  225. "uid": 69637483,
  226. "nick_name": "对你何止一句喜欢"
  227. },
  228. "gh_b15de7c99912": {
  229. "uid": 69637536,
  230. "nick_name": "糖炒板栗"
  231. },
  232. "gh_b32125c73861": {
  233. "uid": 69637493,
  234. "nick_name": "发尾"
  235. },
  236. "gh_b3ffc1ca3a04": {
  237. "uid": 69637546,
  238. "nick_name": "主宰你心"
  239. },
  240. "gh_b8baac4296cb": {
  241. "uid": 69637489,
  242. "nick_name": "生性"
  243. },
  244. "gh_b9b99173ff8a": {
  245. "uid": 69637524,
  246. "nick_name": "养一只月亮"
  247. },
  248. "gh_bd57b6978e06": {
  249. "uid": 69637527,
  250. "nick_name": "厌遇"
  251. },
  252. "gh_be8c29139989": {
  253. "uid": 69637502,
  254. "nick_name": "不负"
  255. },
  256. "gh_bfe5b705324a": {
  257. "uid": 69637529,
  258. "nick_name": "乐极"
  259. },
  260. "gh_bff0bcb0694a": {
  261. "uid": 69637534,
  262. "nick_name": "简迷离"
  263. },
  264. "gh_c69776baf2cd": {
  265. "uid": 69637512,
  266. "nick_name": "骄纵"
  267. },
  268. "gh_c91b42649690": {
  269. "uid": 69637503,
  270. "nick_name": "荟萃"
  271. },
  272. "gh_d2cc901deca7": {
  273. "uid": 69637487,
  274. "nick_name": "恶意调笑"
  275. },
  276. "gh_d5f935d0d1f2": {
  277. "uid": 69637500,
  278. "nick_name": "青少年哪吒"
  279. },
  280. "gh_da76772d8d15": {
  281. "uid": 69637526,
  282. "nick_name": "独揽风月"
  283. },
  284. "gh_de9f9ebc976b": {
  285. "uid": 69637475,
  286. "nick_name": "剑出鞘恩怨了"
  287. },
  288. "gh_e0eb490115f5": {
  289. "uid": 69637486,
  290. "nick_name": "赋别"
  291. },
  292. "gh_e24da99dc899": {
  293. "uid": 69637484,
  294. "nick_name": "恋雨夏季"
  295. },
  296. "gh_e2576b7181c6": {
  297. "uid": 69637515,
  298. "nick_name": "满天星"
  299. },
  300. "gh_e75dbdc73d80": {
  301. "uid": 69637542,
  302. "nick_name": "情战"
  303. },
  304. "gh_e9d819f9e147": {
  305. "uid": 69637525,
  306. "nick_name": "与卿"
  307. },
  308. "gh_efaf7da157f5": {
  309. "uid": 69637547,
  310. "nick_name": "心野性子浪"
  311. },
  312. "gh_f4594783f5b8": {
  313. "uid": 69637544,
  314. "nick_name": "自缚"
  315. },
  316. "gh_fe6ef3a65a48": {
  317. "uid": 69637480,
  318. "nick_name": "风间"
  319. }
  320. }
  321. aliyun_logger.logging(
  322. code="2000",
  323. message="请求参数",
  324. data=params
  325. )
  326. search_keys = params['title']
  327. user = gh_id_dict.get(params['ghId'])
  328. trace_id = params['trace_id']
  329. url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
  330. payload = json.dumps({
  331. "keyword": search_keys,
  332. "cursor": "0",
  333. "content_type": "video"
  334. })
  335. headers = {
  336. 'Content-Type': 'application/json'
  337. }
  338. response = requests.request("POST", url, headers=headers, data=payload)
  339. aliyun_logger.logging(
  340. code="2000",
  341. message="微信抓取成功",
  342. data=response.json()
  343. )
  344. try:
  345. data_list = response.json()['data']['data']
  346. for item in data_list[:10]:
  347. video_obj = item['items'][0]
  348. # await process_weixin_video_obj(video_obj, user)
  349. try:
  350. aliyun_logger.logging(
  351. code="1001",
  352. message="扫描到一条视频",
  353. account=user['uid'],
  354. data=video_obj
  355. )
  356. await process_weixin_video_obj(video_obj, user, trace_id)
  357. except Exception as e:
  358. aliyun_logger.logging(
  359. code="3000",
  360. message="有报错信息---{}".format(e),
  361. account=user['uid']
  362. )
  363. except Exception as e:
  364. aliyun_logger.logging(
  365. code="3000",
  366. message="有报错信息---{}---微信搜索视频失败".format(e),
  367. account=user['uid']
  368. )
  369. async def process_weixin_video_obj(video_obj, user, trace_id):
  370. """
  371. 异步处理微信 video_obj
  372. 公众号和站内账号一一对应
  373. :param trace_id:
  374. :param user:
  375. :param video_obj:
  376. :return:
  377. """
  378. platform = "weixin_search"
  379. publish_time_stamp = int(video_obj['pubTime'])
  380. title = video_obj['title'].replace('<em class=\"highlight\">', '').replace('</em>', '').replace("#", "")
  381. item = VideoItem()
  382. item.add_video_info("user_id", user["uid"])
  383. item.add_video_info("user_name", user["nick_name"])
  384. item.add_video_info("video_id", video_obj['hashDocID'])
  385. item.add_video_info("video_title", title)
  386. item.add_video_info("publish_time_stamp", int(publish_time_stamp))
  387. item.add_video_info("video_url", video_obj["videoUrl"])
  388. item.add_video_info("cover_url", video_obj["image"])
  389. item.add_video_info("out_video_id", video_obj['hashDocID'])
  390. item.add_video_info("out_user_id", trace_id)
  391. item.add_video_info("platform", platform)
  392. item.add_video_info("strategy", "search")
  393. item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
  394. mq_obj = item.produce_item()
  395. ETL_MQ.send_msg(video_dict=mq_obj)
  396. aliyun_logger.logging(
  397. code="1002",
  398. message="成功发送到 ETL",
  399. account=user["uid"],
  400. data=mq_obj
  401. )