import json
import os
import time

# from gzh_spider.api import Common


class GzhSpiderPipeline:
    """Item pipeline that accumulates scraped items into a JSON-backed queue.

    The queue is loaded from ``message_queue.json`` (relative to the current
    working directory) when the pipeline is created, extended in memory by
    ``process_item``, and written back to the same file by ``close_spider``.
    """

    def __init__(self):
        """Load the persisted message queue, or start with an empty one.

        Raises:
            json.JSONDecodeError: if the queue file exists, is non-empty, and
                does not contain valid JSON. This is raised rather than
                swallowed so corrupt data is never silently overwritten.
        """
        self.queue_path = "message_queue.json"
        if os.path.exists(self.queue_path):
            with open(self.queue_path, "r", encoding="utf-8") as f:
                raw = f.read()
            # An empty file (e.g. left behind by an interrupted write) holds
            # no entries; treat it as an empty queue instead of crashing.
            self.message_queue = json.loads(raw) if raw.strip() else []
        else:
            self.message_queue = []
        print("消息队列初始化完成")

    def process_item(self, item, spider):
        """Annotate ``item`` with fixed metadata and enqueue a snapshot of it.

        Planned checks (not implemented here):
          - judge article quality, legal relevance, etc.
          - decide whether to download
          - check title similarity

        Args:
            item: mapping-like scraped item; must provide ``user_id`` and
                ``video_id`` keys.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, mutated in place with the added fields.

        Raises:
            KeyError: if ``item`` lacks ``user_id`` or ``video_id``.
        """
        item['out_user_id'] = item['user_id']
        item['platform'] = "gongzhonghao"
        item['strategy'] = "author"
        item['out_video_id'] = item['video_id']
        item['width'] = 0
        item['height'] = 0
        # NOTE: 'crawler_rule' is not populated here; the original assignment
        # was commented out.

        # Store a plain-dict snapshot so the queue stays JSON-serializable and
        # is decoupled from later mutations of ``item``.
        self.message_queue.append(dict(item))
        return item

    def close_spider(self, spider):
        """Persist the in-memory queue to ``self.queue_path``.

        Args:
            spider: the spider being closed; its ``name`` is printed.

        Raises:
            TypeError: if a queued entry is not JSON-serializable. In that
                case the existing queue file is left untouched.
        """
        # Serialize before opening the file: opening with "w" truncates it, so
        # a serialization failure afterwards would wipe the persisted queue.
        payload = json.dumps(self.message_queue, ensure_ascii=False)
        with open(self.queue_path, "w", encoding="utf-8") as f:
            f.write(payload)
        print(spider.name + "finished")