dy_search.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import requests
  6. from applications.functions.common import sensitive_flag
  7. from applications.log import logging
  8. from applications.const import server_const
  9. def douyin_search(keyword, sensitive_words, trace_id):
  10. """
  11. Search with dou cha cha
  12. rank the relevance and recall the best three videos
  13. :param trace_id:
  14. :param sensitive_words: sensitive words in pq
  15. :param keyword: the words needs to be searched
  16. :return:
  17. """
  18. url = "http://crawler-cn.aiddit.com/crawler/dou_yin/top_hub_content"
  19. payload = json.dumps({
  20. "keyword": keyword,
  21. "category": "全部",
  22. "period": "近90天",
  23. "content_modal": "视频",
  24. "cursor": ""
  25. })
  26. headers = {
  27. 'Content-Type': 'application/json'
  28. }
  29. response = requests.request("POST", url, headers=headers, data=payload)
  30. try:
  31. dt_list = response.json()['data']['data']
  32. logging(
  33. code="4002",
  34. info="抖音搜索成功",
  35. trace_id=trace_id
  36. )
  37. L = []
  38. for obj in dt_list:
  39. try:
  40. title = obj['video_desc']
  41. video_id = obj['video_id']
  42. duration = int(obj['duration'])
  43. if sensitive_flag(sensitive_words, title) and duration < server_const.MAX_VIDEO_DURATION * 1000:
  44. res = douyin_detail(video_id)
  45. if res:
  46. L.append(res)
  47. else:
  48. continue
  49. except Exception as e:
  50. # print(traceback.format_exc())
  51. continue
  52. logging(
  53. code="8001",
  54. info="抖音搜索",
  55. data={
  56. "keys": keyword,
  57. "search_count": len(dt_list),
  58. "useful_count": len(L)
  59. },
  60. trace_id=trace_id
  61. )
  62. return L
  63. except Exception as e:
  64. logging(
  65. code="4003",
  66. info="抖音搜索失败",
  67. trace_id=trace_id,
  68. data={"error": str(e)}
  69. )
  70. return []
  71. # logging(
  72. # code="4003",
  73. # info="抖音搜索失败-搜索词:{} 原因:-{}".format(keyword, "抖查查暂停服务"),
  74. # trace_id=trace_id
  75. # )
  76. # return []
  77. def douyin_detail(video_id):
  78. """
  79. get video url address
  80. :param video_id:
  81. :return:
  82. """
  83. url = "http://crawler-cn.aiddit.com/crawler/dou_yin/detail"
  84. payload = json.dumps({
  85. "content_id": video_id
  86. })
  87. headers = {
  88. 'Content-Type': 'application/json'
  89. }
  90. response = requests.request("POST", url, headers=headers, data=payload).json()
  91. logging(
  92. code="4005",
  93. info="抖音请求详情",
  94. data=response
  95. )
  96. if response['code'] != 0:
  97. logging(
  98. code="4006",
  99. info="抖音请求详情失败",
  100. data={"error": response['msg']}
  101. )
  102. else:
  103. try:
  104. video_info = response['data']['data']
  105. if video_info['content_type'] == "note":
  106. return None
  107. else:
  108. return video_info
  109. except Exception as e:
  110. logging(
  111. code="4006",
  112. info="抖音请求详情失败",
  113. data={"error": str(e)}
  114. )
  115. return None