from datetime import datetime
import html
import json
import random
import re
import time
from urllib.parse import urlparse, parse_qs

import requests
from loguru import logger

from common.aliyun_log import AliyunLogger
from common.feishu_utils import Feishu
from common.redis import get_top_data, in_job_video_data
from common.sql_help import sqlCollect


class Top:
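    """Resolve off-site (Douyin/Kuaishou) share links to the author's
    channel_account_id, de-duplicate against previously seen authors, and
    queue accepted tasks for the transformation pipeline and a Feishu sheet."""
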
    def get_text_dy_video(self, url):
        """Resolve a Douyin share link (or bare video id) to the author's
        channel_account_id via the internal crawler detail service.

        Returns the channel_account_id on success, the sentinel "作品不存在"
        ("work does not exist") when the video has been deleted, or None on
        failure.
        """
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Extract the numeric video id from the various Douyin URL shapes.
                if "http" not in url:
                    video_id = url  # already a bare id
                elif "&vid=" in url:
                    params = parse_qs(urlparse(url).query)
                    video_id = params.get('vid', [None])[0]
                elif "?modal_id=" in url:
                    params = parse_qs(urlparse(url).query)
                    video_id = params.get('modal_id', [None])[0]
                else:
                    # Short links redirect to the canonical /video/<id> page; read
                    # the id from the Location header without following the redirect.
                    headers = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
                                  'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Cache-Control': 'no-cache',
                        'Pragma': 'no-cache',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
                    }
                    response = requests.get(url, headers=headers, allow_redirects=False, timeout=30)
                    location = response.headers.get('Location')
                    video_id = re.search(r'/video/(\d+)/?',
                                         location.split('?')[0] if location else url).group(1)
                if not video_id or not video_id.strip():
                    return None
                api_url = "http://8.217.192.46:8889/crawler/dou_yin/detail"
                payload = json.dumps({"content_id": str(video_id)})
                headers = {'Content-Type': 'application/json'}
                time.sleep(random.uniform(5, 10))  # throttle calls to the crawler service
                response = requests.post(api_url, headers=headers, data=payload, timeout=60).json()
                code = response["code"]
                if code == 0:
                    return response["data"]["data"]["channel_account_id"]
                if code == 22002 and '抖音内容已被删除或无法访问' in response['msg']:
                    # msg: "Douyin content has been deleted or is inaccessible"
                    return "作品不存在"
                # Unexpected code: count it as a failed attempt instead of looping forever.
                retry_count += 1
            except Exception as e:
                retry_count += 1
                logger.error(f"[+] Douyin {url}: failed to resolve the video link, error: {e}")
                time.sleep(1)
        return None

    def get_text_ks_video(self, url):
        """Resolve a Kuaishou share link (or bare video id) to the author's
        channel_account_id via the internal crawler detail service.

        Returns the channel_account_id on success, the sentinel "作品不存在"
        ("work does not exist") for deleted, missing, or private works, or
        None on failure.
        """
        try:
            if "http" not in url:
                video_id = url  # already a bare id
            else:
                # Share links redirect to the canonical /f|photo|short-video|long-video/<id>
                # page; read the id from the Location header without following the redirect.
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
                              'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    'Cache-Control': 'no-cache',
                    'Pragma': 'no-cache',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                                  '(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
                }
                response = requests.get(url, headers=headers, allow_redirects=False, timeout=30)
                location = response.headers.get('Location')
                video_id = re.search(r'/(f|photo|short-video|long-video)/(.*)/?',
                                     location.split('?')[0] if location else url).group(2)
            if not video_id or not video_id.strip():
                return None
            api_url = "http://8.217.192.46:8889/crawler/kuai_shou/detail"
            payload = json.dumps({"content_id": str(video_id)})
            headers = {'Content-Type': 'application/json'}
            time.sleep(random.uniform(5, 10))  # throttle calls to the crawler service
            response = requests.post(api_url, headers=headers, data=payload, timeout=30).json()
            code = response["code"]
            if code == 0:
                return response["data"]["data"]['channel_account_id']
            if code == 27006:
                # msg contains "作品不存在" / "内容不存在" / "私密作品": the work is
                # missing, the content is missing, or the work is private.
                if "作品不存在" in response['msg'] or "内容不存在" in response['msg'] or "私密作品" in response['msg']:
                    return "作品不存在"
            time.sleep(3)
        except Exception as e:
            logger.error(f"[+] Kuaishou {url}: failed to resolve the video link, error: {e}")
        return None
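
    # Both helpers call the same internal crawler detail service. Its response
    # contract, as inferred from the handling code above (not a documented API),
    # looks roughly like:
    #
    #   {"code": 0, "msg": "...", "data": {"data": {"channel_account_id": "..."}}}
    #
    # The non-zero codes handled here (22002 for Douyin, 27006 for Kuaishou)
    # carry a human-readable Chinese `msg` describing why the work is unavailable.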

    def main(self, data):
        """Process one task payload: look up the video's off-site URL, resolve
        the author's channel_account_id, and route the task onward."""
        channel_account_id = None
        tag_transport_channel = None
        data = json.loads(data)
        AliyunLogger.logging(data['channel'], data, "start fetching", "fail")
        channel_id = data['channel']
        url_id, data_channel = sqlCollect.get_channle_id(data['videoid'])
        if not url_id:
            logger.info(f"[+] Task {data}: no record for this video")
            AliyunLogger.logging(data['channel'], data, "no record for this video", "fail")
            return
        # Normalize the stored URL: desktop links carry the id in the query string;
        # anything else is scanned for the first embedded http(s) link.
        if "&vid=" in url_id or "?modal_id=" in url_id:
            host = urlparse(url_id).netloc
        else:
            msg = html.unescape(url_id).split('?')[0]
            pattern = re.search(r'https?://[^\s<>"\'\u4e00-\u9fff]+', msg)
            if not pattern:
                # No usable URL: push the task back onto the queue for later.
                in_job_video_data("task:top_all_data", json.dumps(data, ensure_ascii=False, indent=4))
                return
            url_id = pattern.group()
            host = urlparse(url_id).netloc
        if host in ['v.douyin.com', 'www.douyin.com', 'www.iesdouyin.com'] or data_channel == "抖音":
            tag_transport_channel = "抖音"  # Douyin
            logger.info(f"[+] {url_id}: start resolving the Douyin video link")
            channel_account_id = self.get_text_dy_video(url=url_id)
        elif host in ['v.kuaishou.com', 'www.kuaishou.com', 'v.m.chenzhongtech.com', 'creater.eozatvmq.com'] or data_channel == "快手":
            tag_transport_channel = "快手"  # Kuaishou
            logger.info(f"[+] {url_id}: start resolving the Kuaishou video link")
            channel_account_id = self.get_text_ks_video(url=url_id)
        if not channel_account_id or channel_account_id == "作品不存在":
            in_job_video_data("task:top_all_data", json.dumps(data, ensure_ascii=False, indent=4))
            AliyunLogger.logging(data['channel'], data, f"no channel_account_id, requeued for retry/{channel_account_id}", "fail")
            return
        # Skip authors whose off-site ID has already been processed.
        status = sqlCollect.select_pj_video_data(channel_account_id)
        if status:
            logger.info(f"[+] Task {data}: this author's off-site ID has already been added")
            AliyunLogger.logging(data['channel'], data, "this author's off-site ID has already been added", channel_account_id)
            return
        data["channel_account_id"] = channel_account_id
        data["tag_transport_channel"] = tag_transport_channel
        # Route to the proper Redis queue: keyword-crawl tasks by source channel,
        # everything else by the resolved transport channel.
        if channel_id in ["抖音关键词抓取", "快手关键词抓取"]:
            redis_data = f"task:top_data_{'dy' if channel_id == '抖音关键词抓取' else 'ks'}_gjc"
        else:
            redis_data = f"task:top_data_{'ks' if tag_transport_channel == '快手' else 'dy'}_gz"
        AliyunLogger.logging(data['channel'], data, "fetched successfully, waiting to be written as a transformation task", channel_account_id)
        in_job_video_data(redis_data, json.dumps(data, ensure_ascii=False, indent=4))
        sqlCollect.insert_pj_video_data(channel_account_id, channel_id)
        logger.info("[+] Start writing to the Feishu sheet")
        formatted_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        values = [
            [
                data['uid'],
                data['videoid'],
                data['return_uv'],
                data['type'],
                data['type_owner'],
                data['channel'],
                data['channel_owner'],
                data['title'],
                data['dt'],
                channel_account_id,
                tag_transport_channel,
                formatted_time,
            ]
        ]
        # Insert a fresh row 2, then fill it in.
        Feishu.insert_columns("KUIksoqZkhvZOrtqA1McPwObn7d", "57c076", "ROWS", 1, 2)
        time.sleep(0.5)
        Feishu.update_values("KUIksoqZkhvZOrtqA1McPwObn7d", "57c076", "A2:Z2", values)
        logger.info("[+] Finished writing to the Feishu sheet")
        return


if __name__ == '__main__':
    url, channel = sqlCollect.get_channle_id(45843781)
    print(url, channel)
    # top = Top()
    # top.main("task:top_all_data")
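    # A minimal driver sketch (an assumption: the actual scheduler is not in
    # this file). It pops one task payload from the "task:top_all_data" Redis
    # queue with get_top_data (imported above; exact signature assumed) and
    # processes it:
    #
    #   task = get_top_data("task:top_all_data")
    #   if task:
    #       Top().main(task)
    #
    # main() expects the payload to be a JSON object with at least the keys it
    # reads: uid, videoid, return_uv, type, type_owner, channel, channel_owner,
    # title, dt.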