xiaoniangaotuijianliu.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. import os
  2. import random
  3. import sys
  4. import time
  5. import uuid
  6. import json
  7. from datetime import datetime
  8. import cv2
  9. import requests
  10. from application.common.feishu import FsData
  11. from application.common.feishu.feishu_utils import FeishuUtils
  12. from application.common.gpt import GPT4oMini
  13. from application.common.mysql.sql import Sql
  14. from application.common.redis.xng_redis import xng_in_video_data
  15. sys.path.append(os.getcwd())
  16. from application.items import VideoItem
  17. from application.pipeline import PiaoQuanPipeline
  18. from application.common.messageQueue import MQ
  19. from application.common.log import AliyunLogger
  20. from application.common.mysql import MysqlHelper
  21. class XNGTJLRecommend(object):
  22. """
  23. 小年糕推荐流
  24. """
  25. def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
  26. self.limit_flag = False
  27. self.platform = platform
  28. self.mode = mode
  29. self.rule_dict = rule_dict
  30. self.user_list = user_list
  31. self.env = env
  32. self.download_cnt = 0
  33. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  34. self.expire_flag = False
  35. self.aliyun_log = AliyunLogger(mode=self.mode, platform=self.platform)
  36. self.mysql = MysqlHelper(mode=self.mode, platform=self)
  37. def get_video_duration(self, video_link: str) -> int:
  38. cap = cv2.VideoCapture(video_link)
  39. if cap.isOpened():
  40. rate = cap.get(5)
  41. frame_num = cap.get(7)
  42. duration = int(frame_num / rate)
  43. return duration
  44. return 0
  45. def get_recommend_list(self):
  46. print("小年糕推荐流开始")
  47. """
  48. 获取推荐页视频
  49. """
  50. headers = {
  51. 'Content-Type': 'application/json'
  52. }
  53. data_rule = FsData()
  54. title_rule = data_rule.get_title_rule()
  55. for i in range(3):
  56. url = "http://8.217.192.46:8889/crawler/xiao_nian_gao_plus/recommend"
  57. payload = json.dumps({})
  58. response = requests.request("POST", url, headers=headers, data=payload)
  59. response = response.json()
  60. if response['code'] != 0:
  61. self.aliyun_log.logging(
  62. code="3000",
  63. message="抓取单条视频失败,请求失败"
  64. ),
  65. return
  66. for index, video_obj in enumerate(response['data']['data'], 1):
  67. try:
  68. self.aliyun_log.logging(
  69. code="1001", message="扫描到一条视频", data=video_obj
  70. )
  71. self.process_video_obj(video_obj, title_rule)
  72. except Exception as e:
  73. self.aliyun_log.logging(
  74. code="3000",
  75. message="抓取单条视频失败, 该视频位于第{}页第{}条报错原因是{}".format(
  76. 1, index, e
  77. ),
  78. )
  79. if self.limit_flag:
  80. return
  81. time.sleep(random.randint(5, 10))
  82. def process_video_obj(self, video_obj, title_rule):
  83. """
  84. 处理视频
  85. :param video_obj:
  86. """
  87. time.sleep(random.randint(3, 8))
  88. trace_id = self.platform + str(uuid.uuid1())
  89. our_user = random.choice(self.user_list)
  90. item = VideoItem()
  91. try:
  92. mid = int(video_obj['user']['mid'])
  93. print(f"id:{mid}")
  94. user_name = video_obj['user']['nick']
  95. avatar_url = video_obj['user']['hurl']
  96. sql = Sql()
  97. max_id = sql.select_id(mid)
  98. if max_id:
  99. sql.update_name_url(mid, avatar_url, user_name)
  100. else:
  101. time.sleep(1)
  102. link = sql.select_id_status(mid)
  103. if link:
  104. sql.insert_name_url(mid, avatar_url, user_name)
  105. print(f"开始写入{mid}")
  106. xng_in_video_data(json.dumps({"mid": mid}))
  107. except Exception as e:
  108. print(f"写入异常{e}")
  109. pass
  110. url = video_obj["v_url"]
  111. duration = self.get_video_duration(url)
  112. item.add_video_info("video_id", video_obj["id"])
  113. item.add_video_info("video_title", video_obj["title"])
  114. item.add_video_info("play_cnt", int(video_obj["play_pv"]))
  115. item.add_video_info("publish_time_stamp", int(int(video_obj["t"]) / 1000))
  116. item.add_video_info("out_user_id", video_obj["id"])
  117. item.add_video_info("cover_url", video_obj["url"])
  118. item.add_video_info("like_cnt", 0)
  119. item.add_video_info("share_cnt", int(video_obj["share"]))
  120. item.add_video_info("comment_cnt", int(video_obj["comment_count"]))
  121. item.add_video_info("video_url", video_obj["v_url"])
  122. item.add_video_info("out_video_id", video_obj["id"])
  123. item.add_video_info("duration", int(duration))
  124. item.add_video_info("platform", self.platform)
  125. item.add_video_info("strategy", self.mode)
  126. item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
  127. item.add_video_info("user_id", our_user["uid"])
  128. item.add_video_info("user_name", our_user["nick_name"])
  129. mq_obj = item.produce_item()
  130. pipeline = PiaoQuanPipeline(
  131. platform=self.platform,
  132. mode=self.mode,
  133. rule_dict=self.rule_dict,
  134. env=self.env,
  135. item=mq_obj,
  136. trace_id=trace_id,
  137. )
  138. if pipeline.process_item():
  139. title_list = title_rule.split(",")
  140. title = video_obj["title"]
  141. contains_keyword = any(keyword in title for keyword in title_list)
  142. if contains_keyword:
  143. new_title = GPT4oMini.get_ai_mini_title(title)
  144. if new_title:
  145. item.add_video_info("video_title", new_title)
  146. current_time = datetime.now()
  147. formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
  148. values = [
  149. [
  150. video_obj["v_url"],
  151. video_obj["url"],
  152. title,
  153. new_title,
  154. formatted_time,
  155. ]
  156. ]
  157. FeishuUtils.insert_columns("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "D1nVxQ", "ROWS", 1, 2)
  158. time.sleep(0.5)
  159. FeishuUtils.update_values("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "D1nVxQ", "A2:Z2", values)
  160. self.download_cnt += 1
  161. self.mq.send_msg(mq_obj)
  162. self.aliyun_log.logging(code="1002", message="成功发送至 ETL", data=mq_obj)
  163. if self.download_cnt >= int(
  164. self.rule_dict.get("videos_cnt", {}).get("min", 200)
  165. ):
  166. self.limit_flag = True
  167. """
  168. 查询用户id是否存在
  169. """
  170. def select_id(self, uid):
  171. sql = f""" select uid from xng_uid where uid = "{uid}"; """
  172. db = MysqlHelper()
  173. repeat_video = db.select(sql=sql)
  174. if repeat_video:
  175. return True
  176. return False
  177. """
  178. 查询用户id是否之前已添加过
  179. """
  180. def select_id_status(self, uid):
  181. sql = f""" select uid from crawler_user_v3 where link = "{uid}"; """
  182. db = MysqlHelper()
  183. repeat_video = db.select(sql=sql)
  184. if repeat_video:
  185. return False
  186. return True
  187. def run(self):
  188. self.get_recommend_list()
  189. if __name__ == '__main__':
  190. J = XNGTJLRecommend(
  191. platform="xiaonianggaotuijianliu",
  192. mode="recommend",
  193. rule_dict={},
  194. user_list=[{'uid': "123456", 'nick_name': "xiaoxiao"}],
  195. )
  196. J.get_recommend_list()
  197. # J.logic()