url_manage.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. from datetime import datetime
  2. import os
  3. import random
  4. import re
  5. import sys
  6. import requests
  7. import json
  8. sys.path.append(os.getcwd())
  9. class urlManage():
  10. # 随机生成id
  11. @classmethod
  12. def random_id(cls):
  13. now = datetime.now()
  14. rand_num = random.randint(10000, 99999)
  15. oss_id = "{}{}".format(now.strftime("%Y%m%d%H%M%S"), rand_num)
  16. return oss_id
  17. @classmethod
  18. def get_content_id(cls, link, channel):
  19. headers = {
  20. "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
  21. }
  22. share_url = link
  23. resp = requests.get(share_url, headers=headers, timeout=10, verify=False)
  24. url = resp.url
  25. if url:
  26. if channel == "douyin" or channel == "xigua":
  27. pattern = r'/(\d+)/\?'
  28. elif channel == "kuaishou":
  29. pattern = r'/photo/(\w+)\?'
  30. else:
  31. return None
  32. match = re.search(pattern, url)
  33. if match:
  34. return match.group(1)
  35. else:
  36. return None
  37. else:
  38. return None
  39. @classmethod
  40. def extract_link(cls, data_link):
  41. try:
  42. json_data = json.loads(data_link)
  43. content = json_data.get('content', '')
  44. except json.decoder.JSONDecodeError:
  45. content = data_link
  46. # 使用正则表达式提取链接
  47. link = re.search(r'https?://\S+', content)
  48. if link:
  49. return link.group()
  50. else:
  51. return None
  52. @classmethod
  53. def url_manage(cls, data_link, channel):
  54. link = cls.extract_link(data_link)
  55. if link:
  56. content_id = cls.get_content_id(link, channel)
  57. if content_id:
  58. return content_id
  59. else:
  60. return None
  61. else:
  62. return None