# pipelines.py
  1. import json
  2. import os
  3. import time
  4. # from gzh_spider.api import Common
  5. class GzhSpiderPipeline:
  6. def __init__(self):
  7. self.queue_path = "message_queue.json"
  8. if os.path.exists(self.queue_path):
  9. with open(self.queue_path, "r", encoding="utf-8") as f:
  10. self.message_queue = json.loads(f.read())
  11. else:
  12. self.message_queue = []
  13. print("消息队列初始化完成")
  14. def process_item(self, item, spider):
  15. """
  16. 处理item
  17. # 判断文章质量,法律相关等等
  18. # 判断是否下载
  19. # 判断标题相似度
  20. """
  21. my_dict = {}
  22. item['out_user_id'] = item['user_id']
  23. item['platform'] = "gongzhonghao"
  24. item['strategy'] = "author"
  25. item['out_video_id'] = item['video_id']
  26. item['width'] = 0
  27. item['height'] = 0
  28. # item['crawler_rule'] = json.dumps(rule_dict)
  29. # item['']
  30. for key, value in item.items():
  31. my_dict[key] = value
  32. self.message_queue.append(my_dict)
  33. return item
  34. def close_spider(self, spider):
  35. with open(self.queue_path, "w", encoding="utf-8") as f:
  36. f.write(json.dumps(self.message_queue, ensure_ascii=False))
  37. print(spider.name + "finished")