
Merge branch '2024-11-29-luojunhui-developing-video-crawler' of luojunhui/LongArticlesJob into master

luojunhui 4 months ago
parent
commit
6d5ab5cd99
2 files changed with 0 additions and 170 deletions
  1. applications/pipeline.py (+0, -35)
  2. applications/spiderTool.py (+0, -135)

+ 0 - 35
applications/pipeline.py

@@ -1,35 +0,0 @@
-"""
-@author: luojunhui
-"""
-import json
-
-from applications import WeixinSpider, PQMySQL
-
-
-class LongArticlesPipeline(object):
-    """
-    Long-article pipeline: processes and filters data
-    """
-
-    db_client = PQMySQL()
-
-    @classmethod
-    def checkAccountExists(cls, gh_id):
-        """
-        Check whether the account has already been crawled
-        :param gh_id:
-        :return:
-        """
-        sql = f"""
-            INSERT 
-        """
-
-    @classmethod
-    def accountFilter(cls, account_info):
-        """
-        Filter accounts
-        :param account_info:
-        :return:
-        """
-        # Check whether the account already exists
-
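Note: the deleted pipeline.py was still a stub; checkAccountExists held only a bare INSERT placeholder (left as-is above) and accountFilter had no body. For reference, a minimal sketch of how the existence check might have been completed, reusing the long_articles_accounts table and account_id column that appear in spiderTool.py below. The select(sql) method on PQMySQL is an assumption, mirroring how DeNetMysql.select is called in spiderTool.py.

"""
Hypothetical sketch, not the author's code: one way the removed
checkAccountExists stub could have been finished.
"""
from applications import PQMySQL


class LongArticlesPipeline(object):
    """
    Long-article pipeline: processes and filters data
    """

    db_client = PQMySQL()

    @classmethod
    def checkAccountExists(cls, gh_id):
        """
        Check whether the account has already been crawled
        :param gh_id:
        :return: True if the account is already recorded
        """
        # long_articles_accounts / account_id are taken from spiderTool.py's
        # updateLatestAccountTimeStamp; PQMySQL.select(sql) is assumed to
        # behave like DeNetMysql.select. The f-string interpolation mirrors
        # the original's style (a parameterized query would be safer).
        select_sql = f"""
            SELECT account_id
            FROM long_articles_accounts
            WHERE account_id = '{gh_id}';
        """
        return bool(cls.db_client.select(select_sql))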

+ 0 - 135
applications/spiderTool.py

@@ -1,135 +0,0 @@
-"""
-@author: luojunhui
-"""
-import time
-import datetime
-
-from applications import WeixinSpider, Functions, PQMySQL, DeNetMysql
-
-
-class SpiderTools(object):
-    """
-    Common entry point for the long-article crawlers
-    """
-    spider_client = WeixinSpider()
-    function = Functions()
-    pq_mysql_client = PQMySQL()
-    denet_mysql_client = DeNetMysql()
-
-    @classmethod
-    def searchEachAccountArticlesSinglePage(cls, gh_id, category):
-        """
-        Crawl a single page of the account's article list
-        :param gh_id:
-        :param category:
-        :return:
-        """
-        response = cls.spider_client.update_msg_list(ghId=gh_id, index=None)
-        msg_list = response.get("data", {}).get("data")
-        if msg_list:
-            cls.updateDataIntoMysql(
-                gh_id=gh_id,
-                category=category,
-                mode="account",
-                article_list=msg_list
-            )
-            cls.updateLatestAccountTimeStamp(gh_id=gh_id)
-        else:
-            print("No more data")
-
-    @classmethod
-    def searchEachAccountArticlesAllData(cls, gh_id, category, latest_time_stamp, index=None):
-        """
-        Crawl the account's latest data going back to 2024-01-01
-        :param index:
-        :param gh_id:
-        :param category:
-        :param latest_time_stamp:
-        :return:
-        """
-        response = cls.spider_client.update_msg_list(ghId=gh_id, index=index)
-        msg_list = response.get("data", {}).get("data")
-        if msg_list:
-            last_article_in_this_msg = msg_list[-1]
-            cls.updateDataIntoMysql(
-                gh_id=gh_id, category=category, article_list=msg_list, mode="account"
-            )
-            last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
-            if latest_time_stamp < last_time_stamp_in_this_msg:
-                next_cursor = response["data"]["next_cursor"]
-                return cls.searchEachAccountArticlesAllData(
-                    gh_id=gh_id,
-                    latest_time_stamp=latest_time_stamp,
-                    category=category,
-                    index=next_cursor,
-                )
-            else:
-                # Update the most recent crawl time
-                cls.updateLatestAccountTimeStamp(gh_id=gh_id)
-        else:
-            print("No more data")
-
-    @classmethod
-    def updateDataIntoMysql(cls, gh_id, category, mode, article_list):
-        """
-        Write the data into the database
-        :return:
-        """
-        for article_obj in article_list:
-            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
-            for obj in detail_article_list:
-                try:
-                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
-                    show_view_count = show_stat.get("show_view_count", 0)
-                    show_like_count = show_stat.get("show_like_count", 0)
-                    insert_sql = f"""
-                        insert into crawler_meta_article
-                        (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
-                        VALUES 
-                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
-                    """
-                    cls.denet_mysql_client.update(
-                        sql=insert_sql,
-                        params=(
-                            "weixin",
-                            mode,
-                            category,
-                            gh_id,
-                            obj['ItemIndex'],
-                            obj["Title"],
-                            obj["ContentUrl"],
-                            show_view_count,
-                            show_like_count,
-                            obj["Digest"],
-                            obj["send_time"],
-                            int(time.time()),
-                            1,
-                            cls.function.generateGzhId(obj["ContentUrl"]),
-                        ),
-                    )
-                except Exception as e:
-                    print(e)
-
-    @classmethod
-    def updateLatestAccountTimeStamp(cls, gh_id):
-        """
-        Update the account's latest timestamp
-        :return:
-        """
-        select_sql = f"""
-            SELECT publish_time 
-            From crawler_meta_article 
-            WHERE out_account_id = '{gh_id}'
-            ORDER BY publish_time DESC LIMIT 1;
-        """
-        result = cls.denet_mysql_client.select(select_sql)
-        time_stamp = result[0][0]
-        dt_object = datetime.datetime.utcfromtimestamp(time_stamp)
-        local_dt = dt_object.astimezone()
-        dt_string = local_dt.strftime('%Y-%m-%d %H:%M:%S')
-        update_sql = f"""
-            update long_articles_accounts
-            set latest_update_time = %s
-            where account_id = %s;
-        """
-        cls.pq_mysql_client.update(sql=update_sql, params=(dt_string, gh_id))
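Note: the removed updateLatestAccountTimeStamp converts the epoch with datetime.datetime.utcfromtimestamp(time_stamp).astimezone(). utcfromtimestamp() returns a naive datetime, and astimezone() treats a naive value as local time, so on any machine whose local zone is not UTC the stored latest_update_time is shifted by the local UTC offset. A minimal sketch of the one-step conversion, assuming time_stamp is a Unix epoch in seconds (consistent with the int(time.time()) written by updateDataIntoMysql); the example value is hypothetical:

import datetime

time_stamp = 1732867200  # hypothetical example epoch value (seconds)

# Removed code's pattern: utcfromtimestamp() yields a *naive* UTC datetime,
# then astimezone() assumes that naive value is local time, so the local
# UTC offset gets applied a second time.
shifted = datetime.datetime.utcfromtimestamp(time_stamp).astimezone()

# fromtimestamp() converts epoch seconds straight to local time in one step.
local_dt = datetime.datetime.fromtimestamp(time_stamp)
print(local_dt.strftime('%Y-%m-%d %H:%M:%S'))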