article_tools.py 701 B

12345678910111213141516171819202122232425262728293031
"""
Helpers for comparing article titles by character overlap.

@author: luojunhui
"""
  7. def title_sim_v2(title_a, title_b, thredhold=0.8):
  8. if len(title_a) < 1 or len(title_b) < 1:
  9. return False
  10. set_a = set(title_a)
  11. set_b = set(title_b)
  12. set_cross = set_a & set_b
  13. set_union = set_a | set_b
  14. if not set_union:
  15. return False
  16. min_len = max(min(len(set_a), len(set_b)), 1)
  17. rate = len(set_cross) / min_len
  18. if rate >= thredhold:
  19. return True
  20. else:
  21. return False
  22. def title_sim_v2_by_list(title_target, title_list):
  23. for title, url in title_list:
  24. sim_score = title_sim_v2(title_target, title)
  25. if sim_score:
  26. return (title, url)
  27. return None