calculate.py

  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import os
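
# The scoring helpers below read title-profile JSON files. The shape sketched
# here is inferred from the key lookups in this module and is illustrative
# only, not a formal schema:
#
#   {
#       "key_words": [...],       # primary keywords
#       "search_keys": [...],     # search keywords
#       "extra_keys": [...],      # supplementary keywords
#       "tone": ...,
#       "target_audience": ...,
#       "target_age": ...,
#       "address": ...,
#       "theme": ...
#   }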


def read_single_file(filename):
    """
    Read a single JSON file and return its parsed contents.

    :param filename: path to a UTF-8 encoded JSON file
    :return: the parsed object, or an empty dict if the parsed value is falsy
    """
    with open(filename, encoding="utf-8") as f:
        data = json.loads(f.read())
        if data:
            return data
        else:
            return {}


def compute_similarity(file_1, file_2):
    """
    Compute two similarity scores between a pair of title-profile JSON files.

    :param file_1: path to the first profile JSON file
    :param file_2: path to the second profile JSON file
    :return: (score_v1, score_v2), or (0, 0) when either file is empty or a
        required field is missing or malformed
    """
    data_1 = read_single_file(file_1)
    data_2 = read_single_file(file_2)

    def calculate_v1(d1, d2):
        """
        Score the pair by keyword set overlap (intersection over union).

        :param d1: parsed profile dict for file_1
        :param d2: parsed profile dict for file_2
        :return: weighted Jaccard similarity in [0, 1]
        """
        # Jaccard overlap of the primary keywords.
        f1_keys = set(d1["key_words"])
        f2_keys = set(d2["key_words"])
        keys_union = f1_keys | f2_keys
        keys_intersection = f1_keys & f2_keys
        # Jaccard overlap of the search keywords.
        f1_search_keys = set(d1["search_keys"])
        f2_search_keys = set(d2["search_keys"])
        search_keys_union = f1_search_keys | f2_search_keys
        search_keys_intersection = f1_search_keys & f2_search_keys
        # Jaccard overlap of the extra keywords.
        f1_extra_keys = set(d1["extra_keys"])
        f2_extra_keys = set(d2["extra_keys"])
        extra_keys_union = f1_extra_keys | f2_extra_keys
        extra_keys_intersection = f1_extra_keys & f2_extra_keys
        # An empty union raises ZeroDivisionError, which the caller treats as 0.
        score_1 = len(keys_intersection) / len(keys_union)
        score_2 = len(search_keys_intersection) / len(search_keys_union)
        score_3 = len(extra_keys_intersection) / len(extra_keys_union)
        # Weighted combination: key_words and search_keys dominate.
        return score_1 * 0.4 + score_2 * 0.4 + score_3 * 0.2
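
    # Worked example for calculate_v1 (hypothetical numbers, not taken from the
    # repository): if the key_words sets share 2 of 4 distinct items (0.5), the
    # search_keys sets share 1 of 4 (0.25) and the extra_keys sets are identical
    # (1.0), the combined score is
    # 0.5 * 0.4 + 0.25 * 0.4 + 1.0 * 0.2 = 0.2 + 0.1 + 0.2 = 0.5.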

    def calculate_v2(d1, d2):
        """
        Scoring method v2: add a fixed weight for each profile field that
        matches exactly.

        :param d1: parsed profile dict for file_1
        :param d2: parsed profile dict for file_2
        :return: accumulated score
        """
        score = 0
        tone_1 = d1["tone"]
        tone_2 = d2["tone"]
        if tone_1 == tone_2:
            score += 0.1
        target_audience_1 = d1["target_audience"]
        target_audience_2 = d2["target_audience"]
        if target_audience_1 == target_audience_2:
            score += 0.2
        target_age_1 = d1["target_age"]
        target_age_2 = d2["target_age"]
        if target_age_1 == target_age_2:
            score += 0.2
        address_1 = d1["address"]
        address_2 = d2["address"]
        if address_1 == address_2:
            score += 0.2
        # A matching theme carries the largest weight.
        theme_1 = d1["theme"]
        theme_2 = d2["theme"]
        if theme_1 == theme_2:
            score += 0.5
        return score
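
    # Note on calculate_v2: the weights sum to 0.1 + 0.2 + 0.2 + 0.2 + 0.5 = 1.2,
    # so a pair matching on every field scores 1.2 rather than 1.0. The value is
    # only used to rank candidates in title_mix, so it is never normalized.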

    if data_1 and data_2:
        try:
            score_1 = calculate_v1(data_1, data_2)
            score_2 = calculate_v2(data_1, data_2)
            return score_1, score_2
        except Exception:
            # Missing keys or empty keyword sets fall back to zero scores.
            return 0, 0
    else:
        return 0, 0


def title_mix(title_p, dt):
    """
    Compare one title profile against every candidate profile for a given
    partition and return the best match under each scoring method.

    :param title_p: path to the profile JSON file of the title to match
    :param dt: sub-directory name under applications/static that holds the
        candidate profile JSON files
    :return: dict with the best-scoring video id, uid and score for v1 and v2
    """
    json_path = os.path.join(os.getcwd(), 'applications', 'static', dt)
    # Process the title information: collect the candidate profile files.
    files = os.listdir(json_path)
    pq_files = [os.path.join(json_path, file) for file in files]
    score_list_1 = []
    score_list_2 = []
    for file in pq_files:
        # File names are expected to look like "<uid>_<video_id>.json".
        file_name = os.path.basename(file).replace(".json", "")
        v_id = file_name.split('_')[1]
        uid = file_name.split('_')[0]
        score1, score2 = compute_similarity(title_p, file)
        score_list_1.append([score1, v_id, uid])
        score_list_2.append([score2, v_id, uid])
    # Rank candidates by score, highest first, for each method.
    s1_list = sorted(score_list_1, key=lambda x: x[0], reverse=True)
    s2_list = sorted(score_list_2, key=lambda x: x[0], reverse=True)
    title = os.path.basename(title_p).replace(".json", "")
    obj = {
        "title": title,
        "s1_vid": s1_list[0][1],
        "s1_score": s1_list[0][0],
        "s1_uid": s1_list[0][2],
        "s2_vid": s2_list[0][1],
        "s2_score": s2_list[0][0],
        "s2_uid": s2_list[0][2]
    }
    return obj
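

# Minimal usage sketch (assumed paths and file names, not part of the original
# module): it presumes the working directory contains applications/static/<dt>/
# with candidate profiles named "<uid>_<video_id>.json", plus a profile file
# for the title being matched. "title_profile.json" and "20240520" are
# hypothetical examples.
if __name__ == "__main__":
    result = title_mix(
        title_p=os.path.join(os.getcwd(), "applications", "static", "title_profile.json"),
        dt="20240520",
    )
    print(json.dumps(result, ensure_ascii=False, indent=2))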