# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
from datetime import date

import requests
from lxml import etree

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper

# Explicitly bypass any system proxy for these requests
proxies = {"http": None, "https": None}
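

# --- Hardening sketch (assumption: urllib3's Retry API is available; this ---
# --- helper is a suggestion and is not wired into the methods below). ---
def build_retrying_session():
    """Suggested requests.Session with retry/backoff for the scrapers below."""
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.mount("http://", HTTPAdapter(max_retries=retry))
    return session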


class HotSearch:
    # Today's date, formatted YYYY-MM-DD
    today = date.today().strftime("%Y-%m-%d")

    # Baidu hot search
    @classmethod
    def baidu_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://top.baidu.com/board?tab=realtime"
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            # A timeout keeps the job from hanging on a stalled connection
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"baidu_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            title_list_html = response_html.xpath('//*[@class="c-single-text-ellipsis"]')
            for title_html in title_list_html:
                source = "百度"
                title = title_html.xpath("./text()")[0].strip()
                publish_time = cls.today
                # Double single quotes so the title cannot break the SQL string literal
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"baidu_hot_search:{e}\n")
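
    # --- Safer-insert sketch (assumption: a pymysql-style DB-API connection ---
    # --- is available; MysqlHelper.update_values only accepts raw SQL). ---
    # The methods in this class interpolate values into the SQL string; a
    # parameterized insert like this hypothetical helper avoids quoting and
    # injection issues entirely:
    @staticmethod
    def insert_hot_title_parameterized(connection, source, title, publish_time):
        """Hypothetical alternative to the raw-SQL inserts; not wired in."""
        sql = "insert into crawler_hot_title(source, title, publish_time) values (%s, %s, %s)"
        with connection.cursor() as cursor:
            cursor.execute(sql, (source, title, publish_time))
        connection.commit()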

    # Douyin hot search
    @classmethod
    def douyin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/K7GdaMgdQy"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"douyin_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "抖音"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"douyin_hot_search:{e}\n")

    # Kuaishou hot search
    @classmethod
    def kuaishou_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/MZd7PrPerO"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"kuaishou_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "快手"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"kuaishou_hot_search:{e}\n")

    # Weibo hot search
    @classmethod
    def weibo_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/KqndgxeLl9"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weibo_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微博"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weibo_hot_search:{e}\n")

    # Weixin hot search
    @classmethod
    def weixin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/W1VdJPZoLQ"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weixin_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微信"
                publish_day = tr.xpath("./td[1]/text()")[0]
                # Rows updated today show a time instead of a full YYYY-MM-DD date
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip()
                title = title.replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values('{source}', '{title}', '{publish_time}')"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Inserted into database\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weixin_hot_search:{e}\n")
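
    # --- Deduplication sketch (hypothetical helper, not wired in) ---
    # The four tophub.today scrapers above are structurally identical; only the
    # board URL and the `source` label differ. A shared parser like this one
    # would collapse the duplication:
    @classmethod
    def parse_tophub_titles(cls, html_text):
        """Return titles of tophub board rows that were published today."""
        titles = []
        for tr in etree.HTML(html_text).xpath("//tr"):
            day_cell = tr.xpath("./td[1]/text()")
            publish_day = day_cell[0] if day_cell else ""
            # Rows updated today show a time rather than a full YYYY-MM-DD date
            if len(publish_day) < 10:
                publish_day = cls.today
            if publish_day != cls.today:
                continue
            title_cell = tr.xpath("./td[2]/*[1]/text()")
            if title_cell:
                titles.append(title_cell[0].strip())
        return titles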


if __name__ == "__main__":
    HotSearch.baidu_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.douyin_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.kuaishou_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weibo_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weixin_hot_search("hot-search", "weixinzhishu", "dev")