|  | @@ -1,135 +0,0 @@
 | 
											
												
													
														|  | -"""
 |  | 
 | 
											
												
													
														|  | -@author: luojunhui
 |  | 
 | 
											
												
													
														|  | -"""
 |  | 
 | 
											
												
													
														|  | -import time
 |  | 
 | 
											
												
													
														|  | -import datetime
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -from applications import WeixinSpider, Functions, PQMySQL, DeNetMysql
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -class SpiderTools(object):
 |  | 
 | 
											
												
													
														|  | -    """
 |  | 
 | 
											
												
													
														|  | -    长文爬虫公共入口
 |  | 
 | 
											
												
													
														|  | -    """
 |  | 
 | 
											
												
													
														|  | -    spider_client = WeixinSpider()
 |  | 
 | 
											
												
													
														|  | -    function = Functions()
 |  | 
 | 
											
												
													
														|  | -    pq_mysql_client = PQMySQL()
 |  | 
 | 
											
												
													
														|  | -    denet_mysql_client = DeNetMysql()
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -    @classmethod
 |  | 
 | 
											
												
													
														|  | -    def searchEachAccountArticlesSinglePage(cls, gh_id, category):
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        抓取账号单页
 |  | 
 | 
											
												
													
														|  | -        :param gh_id:
 |  | 
 | 
											
												
													
														|  | -        :param category:
 |  | 
 | 
											
												
													
														|  | -        :return:
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        response = cls.spider_client.update_msg_list(ghId=gh_id, index=None)
 |  | 
 | 
											
												
													
														|  | -        msg_list = response.get("data", {}).get("data")
 |  | 
 | 
											
												
													
														|  | -        if msg_list:
 |  | 
 | 
											
												
													
														|  | -            cls.updateDataIntoMysql(
 |  | 
 | 
											
												
													
														|  | -                gh_id=gh_id,
 |  | 
 | 
											
												
													
														|  | -                category=category,
 |  | 
 | 
											
												
													
														|  | -                mode="account",
 |  | 
 | 
											
												
													
														|  | -                article_list=msg_list
 |  | 
 | 
											
												
													
														|  | -            )
 |  | 
 | 
											
												
													
														|  | -            cls.updateLatestAccountTimeStamp(gh_id=gh_id)
 |  | 
 | 
											
												
													
														|  | -        else:
 |  | 
 | 
											
												
													
														|  | -            print("No more data")
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -    @classmethod
 |  | 
 | 
											
												
													
														|  | -    def searchEachAccountArticlesAllData(cls, gh_id, category, latest_time_stamp, index=None):
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        抓取账号截止到2024-01-01的最新数据
 |  | 
 | 
											
												
													
														|  | -        :param index:
 |  | 
 | 
											
												
													
														|  | -        :param gh_id:
 |  | 
 | 
											
												
													
														|  | -        :param category:
 |  | 
 | 
											
												
													
														|  | -        :param latest_time_stamp
 |  | 
 | 
											
												
													
														|  | -        :return:
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        response = cls.spider_client.update_msg_list(ghId=gh_id, index=index)
 |  | 
 | 
											
												
													
														|  | -        msg_list = response.get("data", {}).get("data")
 |  | 
 | 
											
												
													
														|  | -        if msg_list:
 |  | 
 | 
											
												
													
														|  | -            last_article_in_this_msg = msg_list[-1]
 |  | 
 | 
											
												
													
														|  | -            cls.updateDataIntoMysql(
 |  | 
 | 
											
												
													
														|  | -                gh_id=gh_id, category=category, article_list=msg_list, mode="account"
 |  | 
 | 
											
												
													
														|  | -            )
 |  | 
 | 
											
												
													
														|  | -            last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
 |  | 
 | 
											
												
													
														|  | -            if latest_time_stamp < last_time_stamp_in_this_msg:
 |  | 
 | 
											
												
													
														|  | -                next_cursor = response["data"]["next_cursor"]
 |  | 
 | 
											
												
													
														|  | -                return cls.searchEachAccountArticlesAllData(
 |  | 
 | 
											
												
													
														|  | -                    gh_id=gh_id,
 |  | 
 | 
											
												
													
														|  | -                    latest_time_stamp=latest_time_stamp,
 |  | 
 | 
											
												
													
														|  | -                    category=category,
 |  | 
 | 
											
												
													
														|  | -                    index=next_cursor,
 |  | 
 | 
											
												
													
														|  | -                )
 |  | 
 | 
											
												
													
														|  | -            else:
 |  | 
 | 
											
												
													
														|  | -                # 更新最近抓取时间
 |  | 
 | 
											
												
													
														|  | -                cls.updateLatestAccountTimeStamp(gh_id=gh_id)
 |  | 
 | 
											
												
													
														|  | -        else:
 |  | 
 | 
											
												
													
														|  | -            print("No more data")
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -    @classmethod
 |  | 
 | 
											
												
													
														|  | -    def updateDataIntoMysql(cls, gh_id, category, mode, article_list):
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        将数据更新到数据库
 |  | 
 | 
											
												
													
														|  | -        :return:
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        for article_obj in article_list:
 |  | 
 | 
											
												
													
														|  | -            detail_article_list = article_obj["AppMsg"]["DetailInfo"]
 |  | 
 | 
											
												
													
														|  | -            for obj in detail_article_list:
 |  | 
 | 
											
												
													
														|  | -                try:
 |  | 
 | 
											
												
													
														|  | -                    show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
 |  | 
 | 
											
												
													
														|  | -                    show_view_count = show_stat.get("show_view_count", 0)
 |  | 
 | 
											
												
													
														|  | -                    show_like_count = show_stat.get("show_like_count", 0)
 |  | 
 | 
											
												
													
														|  | -                    insert_sql = f"""
 |  | 
 | 
											
												
													
														|  | -                        insert into crawler_meta_article
 |  | 
 | 
											
												
													
														|  | -                        (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
 |  | 
 | 
											
												
													
														|  | -                        VALUES 
 |  | 
 | 
											
												
													
														|  | -                        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
 |  | 
 | 
											
												
													
														|  | -                    """
 |  | 
 | 
											
												
													
														|  | -                    cls.denet_mysql_client.update(
 |  | 
 | 
											
												
													
														|  | -                        sql=insert_sql,
 |  | 
 | 
											
												
													
														|  | -                        params=(
 |  | 
 | 
											
												
													
														|  | -                            "weixin",
 |  | 
 | 
											
												
													
														|  | -                            mode,
 |  | 
 | 
											
												
													
														|  | -                            category,
 |  | 
 | 
											
												
													
														|  | -                            gh_id,
 |  | 
 | 
											
												
													
														|  | -                            obj['ItemIndex'],
 |  | 
 | 
											
												
													
														|  | -                            obj["Title"],
 |  | 
 | 
											
												
													
														|  | -                            obj["ContentUrl"],
 |  | 
 | 
											
												
													
														|  | -                            show_view_count,
 |  | 
 | 
											
												
													
														|  | -                            show_like_count,
 |  | 
 | 
											
												
													
														|  | -                            obj["Digest"],
 |  | 
 | 
											
												
													
														|  | -                            obj["send_time"],
 |  | 
 | 
											
												
													
														|  | -                            int(time.time()),
 |  | 
 | 
											
												
													
														|  | -                            1,
 |  | 
 | 
											
												
													
														|  | -                            cls.function.generateGzhId(obj["ContentUrl"]),
 |  | 
 | 
											
												
													
														|  | -                        ),
 |  | 
 | 
											
												
													
														|  | -                    )
 |  | 
 | 
											
												
													
														|  | -                except Exception as e:
 |  | 
 | 
											
												
													
														|  | -                    print(e)
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -    @classmethod
 |  | 
 | 
											
												
													
														|  | -    def updateLatestAccountTimeStamp(cls, gh_id):
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        更新账号的最新时间戳
 |  | 
 | 
											
												
													
														|  | -        :return:
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        select_sql = f"""
 |  | 
 | 
											
												
													
														|  | -            SELECT publish_time 
 |  | 
 | 
											
												
													
														|  | -            From crawler_meta_article 
 |  | 
 | 
											
												
													
														|  | -            WHERE out_account_id = '{gh_id}'
 |  | 
 | 
											
												
													
														|  | -            ORDER BY publish_time DESC LIMIT 1;
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        result = cls.denet_mysql_client.select(select_sql)
 |  | 
 | 
											
												
													
														|  | -        time_stamp = result[0][0]
 |  | 
 | 
											
												
													
														|  | -        dt_object = datetime.datetime.utcfromtimestamp(time_stamp)
 |  | 
 | 
											
												
													
														|  | -        local_dt = dt_object.astimezone()
 |  | 
 | 
											
												
													
														|  | -        dt_string = local_dt.strftime('%Y-%m-%d %H:%M:%S')
 |  | 
 | 
											
												
													
														|  | -        update_sql = f"""
 |  | 
 | 
											
												
													
														|  | -            update long_articles_accounts
 |  | 
 | 
											
												
													
														|  | -            set latest_update_time = %s
 |  | 
 | 
											
												
													
														|  | -            where account_id = %s;
 |  | 
 | 
											
												
													
														|  | -        """
 |  | 
 | 
											
												
													
														|  | -        cls.pq_mysql_client.update(sql=update_sql, params=(dt_string, gh_id))
 |  | 
 |