i2i.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. from .base import BaseStrategy
  2. class I2I(BaseStrategy):
  3. @staticmethod
  4. def base(title: str, limit: int = 50) -> str:
  5. query = f"""
  6. -- ① 源标题(可限定,强烈建议)
  7. WITH src_title_user AS (
  8. SELECT DISTINCT title AS 源标题, union_id
  9. FROM loghubods.title_union_id_info
  10. WHERE title = '{title}'
  11. ),
  12. -- ② 源标题 → 联想标题 共现
  13. co_occur AS (
  14. SELECT
  15. s.源标题, t.title AS 联想标题, COUNT(*) AS 联想次数
  16. FROM src_title_user s
  17. JOIN loghubods.title_union_id_info t ON s.union_id = t.union_id
  18. WHERE s.源标题 != t.title
  19. GROUP BY s.源标题, t.title
  20. ),
  21. -- ③ 联想标题的 uid 覆盖量(独立算)
  22. title_uid_cnt AS (
  23. SELECT
  24. title, COUNT(DISTINCT union_id) AS 联想标题_uid数量
  25. FROM loghubods.title_union_id_info
  26. GROUP BY title
  27. )
  28. -- ④ 合并
  29. SELECT c.源标题, c.联想标题, c.联想次数, u.联想标题_uid数量, c.联想次数 / (u.联想标题_uid数量 + 1000) AS 联想标题_uid覆盖率
  30. FROM co_occur c
  31. LEFT JOIN title_uid_cnt u ON c.联想标题 = u.title
  32. ORDER BY 联想标题_uid覆盖率 DESC
  33. LIMIT {limit};
  34. """
  35. return query
  36. @staticmethod
  37. def batch_base(title_list, limit: int = 1000):
  38. title_tuple = tuple(title_list)
  39. query = f"""
  40. SELECT src_title, rec_title, collinear_cnt, base_cnt, rec_collinear_ratio
  41. FROM loghubods.t2i_records
  42. WHERE src_title IN {title_tuple}
  43. AND data_version = 'v1' ORDER BY rec_collinear_ratio DESC
  44. LIMIT {limit};
  45. """
  46. return query
  47. @staticmethod
  48. def strategy_v1(title_list, limit: int = 500):
  49. title_tuple = tuple(title_list)
  50. query = f"""
  51. SELECT associated_title AS recommend_title
  52. ,sum(association_count) as collinear_cnt
  53. ,sum(associated_title_uid_count) as base_cnt
  54. ,sum(association_count) / (sum(associated_title_uid_count) + 10000) AS recommend_score
  55. FROM loghubods.i2i_table
  56. WHERE dt = MAX_PT('i2i_table')
  57. AND source_title IN {title_tuple}
  58. GROUP BY recommend_title
  59. ORDER BY recommend_score DESC
  60. LIMIT {limit};
  61. """
  62. return query