# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
from datetime import date

import requests
from lxml import etree

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper

# Explicitly bypass any system proxy for these requests
proxies = {"http": None, "https": None}
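

# --- Hardening sketch (assumption: urllib3's Retry API is available; this ---
# --- helper is a suggestion and is not wired into the methods below). ---
def build_retrying_session():
    """Suggested requests.Session with retry/backoff for the scrapers below."""
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.mount("http://", HTTPAdapter(max_retries=retry))
    return session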


class HotSearch:
    # Today's date, formatted YYYY-MM-DD
    today = date.today().strftime("%Y-%m-%d")

    # Baidu hot search
    @classmethod
    def baidu_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://top.baidu.com/board?tab=realtime"
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            # A timeout keeps the job from hanging on a stalled connection
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"baidu_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            title_list_html = response_html.xpath('//*[@class="c-single-text-ellipsis"]')
            for title_html in title_list_html:
                source = "百度"
                title = title_html.xpath("./text()")[0].strip()
                publish_time = cls.today
                # Double single quotes so the title cannot break the SQL string literal
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"baidu_hot_search:{e}\n")
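
    # --- Safer-insert sketch (assumption: a pymysql-style DB-API connection ---
    # --- is available; MysqlHelper.update_values only accepts raw SQL). ---
    # The methods in this class interpolate values into the SQL string; a
    # parameterized insert like this hypothetical helper avoids quoting and
    # injection issues entirely:
    @staticmethod
    def insert_hot_title_parameterized(connection, source, title, publish_time):
        """Hypothetical alternative to the raw-SQL inserts; not wired in."""
        sql = "insert into crawler_hot_title(source, title, publish_time) values (%s, %s, %s)"
        with connection.cursor() as cursor:
            cursor.execute(sql, (source, title, publish_time))
        connection.commit()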

    # Douyin hot search
    @classmethod
    def douyin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/K7GdaMgdQy"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"douyin_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "抖音"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"douyin_hot_search:{e}\n")

    # Kuaishou hot search
    @classmethod
    def kuaishou_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/MZd7PrPerO"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"kuaishou_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "快手"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"kuaishou_hot_search:{e}\n")

    # Weibo hot search
    @classmethod
    def weibo_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/KqndgxeLl9"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weibo_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微博"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weibo_hot_search:{e}\n")

    # Weixin hot search
    @classmethod
    def weixin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/W1VdJPZoLQ"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weixin_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微信"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weixin_hot_search:{e}\n")
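
    # --- Deduplication sketch (hypothetical helper, not wired in) ---
    # The four tophub.today scrapers above are structurally identical; only the
    # board URL and the `source` label differ. A shared parser like this one
    # would collapse the duplication:
    @classmethod
    def parse_tophub_titles(cls, html_text):
        """Return titles of tophub board rows that were published today."""
        titles = []
        for tr in etree.HTML(html_text).xpath("//tr"):
            day_cell = tr.xpath("./td[1]/text()")
            publish_day = day_cell[0] if day_cell else ""
            # Rows updated today show a time rather than a full YYYY-MM-DD date
            if len(publish_day) < 10:
                publish_day = cls.today
            if publish_day != cls.today:
                continue
            title_cell = tr.xpath("./td[2]/*[1]/text()")
            if title_cell:
                titles.append(title_cell[0].strip())
        return titles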


if __name__ == "__main__":
    HotSearch.baidu_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.douyin_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.kuaishou_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weibo_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weixin_hot_search("hot-search", "weixinzhishu", "dev")