# crawler_pipeline.py
  1. """
  2. @author: luojunhui
  3. """
  4. import os
  5. from applications.utils import download_gzh_video
  6. from applications.utils import download_toutiao_video
  7. from applications.utils import upload_to_oss
  8. empty_dict = {}
  9. def whether_duplicate_video_title(video_title, db_client):
  10. """
  11. whether duplicate video title
  12. """
  13. sql = f"""
  14. select id from publish_single_video_source
  15. where article_title = %s;
  16. """
  17. duplicate_id = db_client.fetch(query=sql, params=(video_title,))
  18. if duplicate_id:
  19. return True
  20. return False
  21. def video_crawler_pipeline(video_item, db_client) -> dict:
  22. """
  23. video crawler pipeline
  24. """
  25. # whether duplicate video title
  26. video_title = video_item['article_title']
  27. if whether_duplicate_video_title(video_title, db_client):
  28. return empty_dict
  29. # video title sensitive words filter
  30. # download video
  31. article_url = video_item['article_url']
  32. platform = video_item['platform']
  33. match platform:
  34. case "toutiao":
  35. video_path = download_toutiao_video(article_url)
  36. case "gzh":
  37. video_path = download_gzh_video(article_url)
  38. case "hksp":
  39. video_path = ''
  40. case "sph":
  41. video_path = ''
  42. case _:
  43. return empty_dict
  44. if video_path:
  45. # upload video to oss
  46. oss_path = upload_to_oss(video_path)
  47. video_item['video_oss_path'] = oss_path
  48. os.remove(video_path)
  49. return video_item
  50. else:
  51. return empty_dict