wangkun 2 years ago
parent
commit
3c8c61454b

BIN
weixinzhishu/weixinzhishu_hot_search/weixinzhishu/.DS_Store


+ 162 - 6
weixinzhishu/weixinzhishu_hot_search/weixinzhishu_hot_search.py

@@ -6,12 +6,9 @@ import sys
 from datetime import date, timedelta
 from lxml import etree
 import requests
-
-from common.common import Common
-
 sys.path.append(os.getcwd())
 from common.scheduling_db import MysqlHelper
-
+from common.common import Common
 proxies = {"http": None, "https": None}
 
 
@@ -38,13 +35,172 @@ class HotSearch:
                 title = title_html.xpath("./text()")[0].strip()
                 publish_time = cls.today
                 insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
-                values("{source}", "{title}", "{publish_time}")"""
+                values("{source}", '{title}', "{publish_time}")"""
                 Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                 MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                 Common.logger(log_type, crawler).info("写入数据库成功\n")
         except Exception as e:
             Common.logger(log_type, crawler).info(f"baidu_hot_search:{e}\n")
 
+    # Shared scraper for the tophub.today boards (Douyin, Kuaishou, Weibo, Weixin)
+    @classmethod
+    def _tophub_hot_search(cls, log_type, crawler, env, url, source, label):
+        """Scrape one tophub.today board and store today's titles.
+
+        Fetches ``url``, walks every ``<tr>`` in the page and inserts one row
+        into ``crawler_hot_title`` for each entry whose first cell resolves to
+        ``cls.today`` (defined elsewhere on the class).
+
+        :param log_type: log type forwarded to ``Common.logger`` and ``MysqlHelper``
+        :param crawler: crawler name forwarded to ``Common.logger`` and ``MysqlHelper``
+        :param env: environment forwarded to ``MysqlHelper.update_values``
+        :param url: board page to fetch
+        :param source: value written to the ``source`` column
+        :param label: prefix used in error log messages (the public method name)
+        :return: None; every exception is logged and swallowed
+        """
+        try:
+            headers = {
+                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
+            }
+            # A timeout keeps a stalled connection from blocking the calling thread forever.
+            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30).text
+            if not response:
+                Common.logger(log_type, crawler).error(f"{label}:{response}\n")
+                return
+            response_html = etree.HTML(response)
+            for tr in response_html.xpath("//tr"):
+                day_cells = tr.xpath("./td[1]/text()")
+                title_cells = tr.xpath("./td[2]/*[1]/text()")
+                # Rows lacking either cell (e.g. header rows) hold no entry; skip them
+                # so an IndexError does not abort the rest of the board.
+                if not day_cells or not title_cells:
+                    continue
+
+                publish_day = day_cells[0]
+                # Values shorter than 10 characters are treated as "today"
+                # (presumably anything shorter than a full YYYY-MM-DD date - confirm).
+                if len(publish_day) < 10:
+                    publish_day = cls.today
+                if publish_day != cls.today:
+                    continue
+                publish_time = publish_day
+                title = title_cells[0].strip()
+
+                # NOTE(review): title comes from scraped HTML and is interpolated into
+                # the SQL text; a title containing a single quote breaks the statement.
+                # Prefer parameterized execution if MysqlHelper supports it.
+                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
+                                values("{source}", '{title}', "{publish_time}")"""
+                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
+                Common.logger(log_type, crawler).info("写入数据库成功\n")
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"{label}:{e}\n")
+
+    # Douyin hot search
+    @classmethod
+    def douyin_hot_search(cls, log_type, crawler, env):
+        """Store today's Douyin hot-search titles scraped from tophub.today."""
+        cls._tophub_hot_search(log_type, crawler, env,
+                               url="https://tophub.today/n/K7GdaMgdQy",
+                               source="抖音",
+                               label="douyin_hot_search")
+
+    # Kuaishou hot search
+    @classmethod
+    def kuaishou_hot_search(cls, log_type, crawler, env):
+        """Store today's Kuaishou hot-search titles scraped from tophub.today."""
+        cls._tophub_hot_search(log_type, crawler, env,
+                               url="https://tophub.today/n/MZd7PrPerO",
+                               source="快手",
+                               label="kuaishou_hot_search")
+
+    # Weibo hot search
+    @classmethod
+    def weibo_hot_search(cls, log_type, crawler, env):
+        """Store today's Weibo hot-search titles scraped from tophub.today."""
+        cls._tophub_hot_search(log_type, crawler, env,
+                               url="https://tophub.today/n/KqndgxeLl9",
+                               source="微博",
+                               label="weibo_hot_search")
+
+    # Weixin hot search
+    @classmethod
+    def weixin_hot_search(cls, log_type, crawler, env):
+        """Store today's Weixin hot-search titles scraped from tophub.today."""
+        cls._tophub_hot_search(log_type, crawler, env,
+                               url="https://tophub.today/n/W1VdJPZoLQ",
+                               source="微信",
+                               label="weixin_hot_search")
+
 
 if __name__ == "__main__":
-    HotSearch.baidu_hot_search("hot-search", "weixinzhishu", "dec")
+    # HotSearch.baidu_hot_search("hot-search-baidu", "weixinzhishu", "dev")
+    # HotSearch.douyin_hot_search("hot-search-douyin", "weixinzhishu", "dev")
+    # HotSearch.kuaishou_hot_search("hot-search-kuaishou", "weixinzhishu", "dev")
+    # HotSearch.weibo_hot_search("hot-search-weibo", "weixinzhishu", "dev")
+    HotSearch.weixin_hot_search("hot-search-weixin", "weixinzhishu", "dev")
+

+ 48 - 4
weixinzhishu/weixinzhishu_main/run_weixinzhishu_hot_search.py

@@ -1,18 +1,62 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/3/31
-import datetime
 import os
 import sys
+from threading import Thread
 sys.path.append(os.getcwd())
+from common.common import Common
 from weixinzhishu.weixinzhishu_hot_search.weixinzhishu_hot_search import HotSearch
 
 
 class Main:
     @classmethod
-    def main(cls):
-        HotSearch.baidu_hot_search()
+    def thread_baidu(cls, log_type, crawler, env):
+        """Log the start of the Baidu crawl, then run HotSearch.baidu_hot_search."""
+        start_message = '开始抓取"百度热搜榜"'
+        logger = Common.logger(log_type, crawler)
+        logger.info(start_message)
+        HotSearch.baidu_hot_search(log_type, crawler, env)
 
+    @classmethod
+    def thread_kuaishou(cls, log_type, crawler, env):
+        """Log the start of the Kuaishou crawl, then run HotSearch.kuaishou_hot_search."""
+        start_message = '开始抓取"快手热搜榜"'
+        logger = Common.logger(log_type, crawler)
+        logger.info(start_message)
+        HotSearch.kuaishou_hot_search(log_type, crawler, env)
+
+    @classmethod
+    def thread_douyin(cls, log_type, crawler, env):
+        """Log the start of the Douyin crawl, then run HotSearch.douyin_hot_search."""
+        start_message = '开始抓取"抖音热搜榜"'
+        logger = Common.logger(log_type, crawler)
+        logger.info(start_message)
+        HotSearch.douyin_hot_search(log_type, crawler, env)
+
+    @classmethod
+    def thread_weixin(cls, log_type, crawler, env):
+        """Log the start of the Weixin crawl, then run HotSearch.weixin_hot_search."""
+        start_message = '开始抓取"微信热搜榜"'
+        logger = Common.logger(log_type, crawler)
+        logger.info(start_message)
+        HotSearch.weixin_hot_search(log_type, crawler, env)
+
+    @classmethod
+    def thread_weibo(cls, log_type, crawler, env):
+        """Log the start of the Weibo crawl, then run HotSearch.weibo_hot_search."""
+        start_message = '开始抓取"微博热搜榜"'
+        logger = Common.logger(log_type, crawler)
+        logger.info(start_message)
+        HotSearch.weibo_hot_search(log_type, crawler, env)
+
+    @classmethod
+    def thread_main(cls, log_type, crawler, env):
+        """Run all five hot-search crawls, one thread per board, and wait for them.
+
+        Each worker logs under its own board-specific log type; the start and
+        finish messages are logged under ``log_type``.
+        """
+        Common.logger(log_type, crawler).info("开始抓取今日热搜榜\n")
+
+        # (worker entry point, board-specific log type), in start order
+        jobs = (
+            (cls.thread_baidu, "hot-search-baidu"),
+            (cls.thread_kuaishou, "hot-search-kuaishou"),
+            (cls.thread_douyin, "hot-search-douyin"),
+            (cls.thread_weixin, "hot-search-weixin"),
+            (cls.thread_weibo, "hot-search-weibo"),
+        )
+        workers = [
+            Thread(target=entry_point, args=(board_log_type, crawler, env))
+            for entry_point, board_log_type in jobs
+        ]
+
+        # Start every worker before joining any, so the crawls overlap.
+        for worker in workers:
+            worker.start()
+        for worker in workers:
+            worker.join()
+
+        Common.logger(log_type, crawler).info("今日热搜榜全部抓取完毕\n")
 
 if __name__ == "__main__":
-    Main.main()
+    Main.thread_main("hot-search", "weixinzhishu", "dev")
+    pass