dy_search.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import requests
  6. from applications.functions.common import sensitive_flag
  7. from applications.log import logging
  8. def douyin_search(keyword, sensitive_words, trace_id):
  9. """
  10. Search with dou cha cha
  11. rank the relevance and recall the best three videos
  12. :param trace_id:
  13. :param sensitive_words: sensitive words in pq
  14. :param keyword: the words needs to be searched
  15. :return:
  16. """
  17. url = "http://crawler-cn.aiddit.com/crawler/dou_yin/top_hub_content"
  18. payload = json.dumps({
  19. "keyword": keyword,
  20. "category": "全部",
  21. "period": "近90天",
  22. "content_modal": "视频",
  23. "cursor": ""
  24. })
  25. headers = {
  26. 'Content-Type': 'application/json'
  27. }
  28. response = requests.request("POST", url, headers=headers, data=payload)
  29. try:
  30. dt_list = response.json()['data']['data']
  31. logging(
  32. code="4002",
  33. info="抖音搜索成功",
  34. trace_id=trace_id
  35. )
  36. L = []
  37. for obj in dt_list:
  38. try:
  39. title = obj['video_desc']
  40. video_id = obj['video_id']
  41. duration = int(obj['duration'])
  42. if sensitive_flag(sensitive_words, title) and duration < 30000:
  43. res = douyin_detail(video_id)
  44. if res:
  45. L.append(res)
  46. else:
  47. continue
  48. except Exception as e:
  49. # print(traceback.format_exc())
  50. continue
  51. logging(
  52. code="8001",
  53. info="抖音搜索",
  54. data={
  55. "keys": keyword,
  56. "search_count": len(dt_list),
  57. "useful_count": len(L)
  58. },
  59. trace_id=trace_id
  60. )
  61. return L
  62. except Exception as e:
  63. logging(
  64. code="4003",
  65. info="抖音搜索失败",
  66. trace_id=trace_id,
  67. data={"error": str(e)}
  68. )
  69. return []
  70. # logging(
  71. # code="4003",
  72. # info="抖音搜索失败-搜索词:{} 原因:-{}".format(keyword, "抖查查暂停服务"),
  73. # trace_id=trace_id
  74. # )
  75. # return []
  76. def douyin_detail(video_id):
  77. """
  78. get video url address
  79. :param video_id:
  80. :return:
  81. """
  82. url = "http://crawler-cn.aiddit.com/crawler/dou_yin/detail"
  83. payload = json.dumps({
  84. "content_id": video_id
  85. })
  86. headers = {
  87. 'Content-Type': 'application/json'
  88. }
  89. response = requests.request("POST", url, headers=headers, data=payload).json()
  90. logging(
  91. code="4005",
  92. info="抖音请求详情",
  93. data=response
  94. )
  95. if response['code'] != 0:
  96. logging(
  97. code="4006",
  98. info="抖音请求详情失败",
  99. data={"error": response['msg']}
  100. )
  101. else:
  102. try:
  103. video_info = response['data']['data']
  104. if video_info['content_type'] == "note":
  105. return None
  106. else:
  107. return video_info
  108. except Exception as e:
  109. logging(
  110. code="4006",
  111. info="抖音请求详情失败",
  112. data={"error": str(e)}
  113. )
  114. return None