1234567891011121314151617181920212223242526272829303132333435363738394041424344 |
- import json
- import os
- import time
- # from gzh_spider.api import Common
class GzhSpiderPipeline:
    """Scrapy item pipeline that accumulates processed items in an
    in-memory message queue, persisted to ``message_queue.json`` when the
    spider closes and reloaded from that file on startup."""

    def __init__(self):
        # Fixed on-disk location of the persisted queue; reload any
        # previous run's contents so the queue survives restarts.
        self.queue_path = "message_queue.json"
        if os.path.exists(self.queue_path):
            with open(self.queue_path, "r", encoding="utf-8") as f:
                # json.load reads directly from the file object
                # (was json.loads(f.read()) — redundant full read).
                self.message_queue = json.load(f)
        else:
            self.message_queue = []
        print("消息队列初始化完成")

    def process_item(self, item, spider):
        """Normalize an item into the canonical field layout and queue it.

        Mutates ``item`` in place:
        * ``out_user_id`` / ``out_video_id`` mirror ``user_id`` / ``video_id``
        * ``platform`` / ``strategy`` are fixed constants
        * ``width`` / ``height`` default to 0 (unknown at this stage)

        NOTE(review): the original comments planned quality filtering
        (legal checks, download decision, title similarity) — none of
        that is implemented here yet.

        :param item: dict-like item; must contain ``user_id`` and ``video_id``.
        :param spider: the running spider (unused).
        :return: the mutated item, passed on to later pipeline stages.
        """
        item['out_user_id'] = item['user_id']
        item['platform'] = "gongzhonghao"
        item['strategy'] = "author"
        item['out_video_id'] = item['video_id']
        item['width'] = 0
        item['height'] = 0
        # Queue a shallow copy so later mutation of `item` by downstream
        # pipeline stages cannot retroactively change what was queued.
        # (Replaces the manual key-by-key copy loop.)
        self.message_queue.append(dict(item))
        return item

    def close_spider(self, spider):
        """Flush the in-memory queue back to disk when the spider closes."""
        with open(self.queue_path, "w", encoding="utf-8") as f:
            # json.dump streams straight to the file; keep non-ASCII
            # (Chinese) text readable in the output file.
            json.dump(self.message_queue, f, ensure_ascii=False)
        # Bug fix: original printed "<name>finished" with no separator.
        print(spider.name + " finished")
|