weixinzhishu_hot_search.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
from datetime import date, timedelta
from lxml import etree
import requests

sys.path.append(os.getcwd())
from common.scheduling_db import MysqlHelper
from common.common import Common

# Bypass any system-level proxy for the requests made in this module
proxies = {"http": None, "https": None}
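
# All five scrapers below interpolate scraped titles directly into an INSERT
# statement. A minimal safer builder is sketched here for illustration only;
# the name `build_hot_title_insert` is hypothetical and nothing below calls it.
def build_hot_title_insert(source, title, publish_time):
    """Build the crawler_hot_title INSERT with single quotes in the title
    doubled, so a quote inside a scraped headline cannot break the SQL."""
    safe_title = title.replace("'", "''")
    return (
        "insert into crawler_hot_title(source, title, publish_time) "
        f"values('{source}', '{safe_title}', '{publish_time}')"
    )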


class HotSearch:
    # Date of the current run, formatted as YYYY-MM-DD
    today = (date.today() + timedelta(days=0)).strftime("%Y-%m-%d")

    # Baidu hot search
    @classmethod
    def baidu_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://top.baidu.com/board?tab=realtime"
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            # 10s timeout so a hung request cannot stall the crawler
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"baidu_hot_search: empty response\n")
                return
            response_html = etree.HTML(response)
            title_list_html = response_html.xpath('//*[@class="c-single-text-ellipsis"]')
            for title_html in title_list_html:
                source = "百度"
                # Double single quotes so a quote in the headline cannot break the SQL
                title = title_html.xpath("./text()")[0].strip().replace("'", "''")
                publish_time = cls.today
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
                values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database successfully\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"baidu_hot_search:{e}\n")

    # Douyin hot search (via tophub.today)
    @classmethod
    def douyin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/K7GdaMgdQy"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"douyin_hot_search: empty response\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "抖音"
                publish_day_list = tr.xpath("./td[1]/text()")
                title_list = tr.xpath("./td[2]/*[1]/text()")
                if not publish_day_list or not title_list:
                    continue  # skip rows without a date or title cell
                publish_day = publish_day_list[0]
                # A first cell shorter than a full YYYY-MM-DD date is treated as
                # today; rows carrying a full date other than today are skipped
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                if publish_time != cls.today:
                    continue
                # Double single quotes so a quote in the headline cannot break the SQL
                title = title_list[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
                values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database successfully\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"douyin_hot_search:{e}\n")

    # Kuaishou hot search (via tophub.today)
    @classmethod
    def kuaishou_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/MZd7PrPerO"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"kuaishou_hot_search: empty response\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "快手"
                publish_day_list = tr.xpath("./td[1]/text()")
                title_list = tr.xpath("./td[2]/*[1]/text()")
                if not publish_day_list or not title_list:
                    continue  # skip rows without a date or title cell
                publish_day = publish_day_list[0]
                # A first cell shorter than a full YYYY-MM-DD date is treated as
                # today; rows carrying a full date other than today are skipped
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                if publish_time != cls.today:
                    continue
                # Double single quotes so a quote in the headline cannot break the SQL
                title = title_list[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
                values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database successfully\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"kuaishou_hot_search:{e}\n")

    # Weibo hot search (via tophub.today)
    @classmethod
    def weibo_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/KqndgxeLl9"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weibo_hot_search: empty response\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微博"
                publish_day_list = tr.xpath("./td[1]/text()")
                title_list = tr.xpath("./td[2]/*[1]/text()")
                if not publish_day_list or not title_list:
                    continue  # skip rows without a date or title cell
                publish_day = publish_day_list[0]
                # A first cell shorter than a full YYYY-MM-DD date is treated as
                # today; rows carrying a full date other than today are skipped
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                if publish_time != cls.today:
                    continue
                # Double single quotes so a quote in the headline cannot break the SQL
                title = title_list[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
                values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database successfully\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weibo_hot_search:{e}\n")

    # Weixin hot search (via tophub.today)
    @classmethod
    def weixin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/W1VdJPZoLQ"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weixin_hot_search: empty response\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微信"
                publish_day_list = tr.xpath("./td[1]/text()")
                title_list = tr.xpath("./td[2]/*[1]/text()")
                if not publish_day_list or not title_list:
                    continue  # skip rows without a date or title cell
                publish_day = publish_day_list[0]
                # A first cell shorter than a full YYYY-MM-DD date is treated as
                # today; rows carrying a full date other than today are skipped
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                if publish_time != cls.today:
                    continue
                # Double single quotes so a quote in the headline cannot break the SQL
                title = title_list[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
                values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database successfully\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weixin_hot_search:{e}\n")
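
    # The four tophub.today scrapers above differ only in board URL and source
    # label. A consolidated sketch is given below for illustration;
    # `_tophub_hot_search` is a hypothetical helper that the methods above do
    # not call, e.g. cls._tophub_hot_search(log_type, crawler, env,
    # "https://tophub.today/n/K7GdaMgdQy", "抖音").
    @classmethod
    def _tophub_hot_search(cls, log_type, crawler, env, board_url, source):
        try:
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=board_url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"{source}_hot_search: empty response\n")
                return
            for tr in etree.HTML(response).xpath("//tr"):
                publish_day_list = tr.xpath("./td[1]/text()")
                title_list = tr.xpath("./td[2]/*[1]/text()")
                if not publish_day_list or not title_list:
                    continue  # skip rows without a date or title cell
                # A first cell shorter than a full YYYY-MM-DD date is treated as today
                publish_time = publish_day_list[0] if len(publish_day_list[0]) >= 10 else cls.today
                if publish_time != cls.today:
                    continue
                title = title_list[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
                values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"{source}_hot_search:{e}\n")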


if __name__ == "__main__":
    # HotSearch.baidu_hot_search("hot-search-baidu", "weixinzhishu", "dev")
    # HotSearch.douyin_hot_search("hot-search-douyin", "weixinzhishu", "dev")
    # HotSearch.kuaishou_hot_search("hot-search-kuaishou", "weixinzhishu", "dev")
    # HotSearch.weibo_hot_search("hot-search-weibo", "weixinzhishu", "dev")
    HotSearch.weixin_hot_search("hot-search-weixin", "weixinzhishu", "dev")