i2i.py 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. from .base import BaseStrategy
  2. class I2I(BaseStrategy):
  3. @staticmethod
  4. def base(title: str, limit: int = 50) -> str:
  5. query = f"""
  6. -- ① 源标题(可限定,强烈建议)
  7. WITH src_title_user AS (
  8. SELECT DISTINCT title AS 源标题, union_id
  9. FROM loghubods.title_union_id_info
  10. WHERE title = '{title}'
  11. ),
  12. -- ② 源标题 → 联想标题 共现
  13. co_occur AS (
  14. SELECT
  15. s.源标题, t.title AS 联想标题, COUNT(*) AS 联想次数
  16. FROM src_title_user s
  17. JOIN loghubods.title_union_id_info t ON s.union_id = t.union_id
  18. WHERE s.源标题 != t.title
  19. GROUP BY s.源标题, t.title
  20. ),
  21. -- ③ 联想标题的 uid 覆盖量(独立算)
  22. title_uid_cnt AS (
  23. SELECT
  24. title, COUNT(DISTINCT union_id) AS 联想标题_uid数量
  25. FROM loghubods.title_union_id_info
  26. GROUP BY title
  27. )
  28. -- ④ 合并
  29. SELECT c.源标题, c.联想标题, c.联想次数, u.联想标题_uid数量, c.联想次数 / (u.联想标题_uid数量 + 1000) AS 联想标题_uid覆盖率
  30. FROM co_occur c
  31. LEFT JOIN title_uid_cnt u ON c.联想标题 = u.title
  32. ORDER BY 联想标题_uid覆盖率 DESC
  33. LIMIT {limit};
  34. """
  35. return query
  36. @staticmethod
  37. def batch_base(title_list, limit: int = 1000):
  38. title_tuple = tuple(title_list)
  39. query = f"""
  40. SELECT src_title, rec_title, collinear_cnt, base_cnt, rec_collinear_ratio
  41. FROM loghubods.t2i_records
  42. WHERE src_title IN {title_tuple}
  43. AND data_version = 'v1' ORDER BY rec_collinear_ratio DESC
  44. LIMIT {limit};
  45. """
  46. return query
  47. @staticmethod
  48. def batch_summary(title_list, limit: int = 500):
  49. title_tuple = tuple(title_list)
  50. query = f"""
  51. SELECT rec_title
  52. ,SUM(collinear_cnt) AS total_collinear_cnt
  53. ,SUM(base_cnt) AS total_base_cnt
  54. ,SUM(collinear_cnt) / (SUM(base_cnt) + 1000) AS rec_collinear_ratio
  55. FROM loghubods.t2i_records
  56. WHERE src_title IN {title_tuple}
  57. AND data_version = 'v3'
  58. GROUP BY rec_title
  59. ORDER BY rec_collinear_ratio DESC
  60. LIMIT {limit};
  61. """
  62. return query
  63. @staticmethod
  64. def strategy_v1(title_list, limit: int = 500):
  65. title_tuple = tuple(title_list)
  66. query = f"""
  67. SELECT associated_title AS recommend_title
  68. ,sum(association_count) as collinear_cnt
  69. ,sum(associated_title_uid_count) as base_cnt
  70. ,sum(association_count) / (sum(associated_title_uid_count) + 10000) AS recommend_score
  71. FROM loghubods.i2i_table
  72. WHERE dt = MAX_PT('i2i_table')
  73. AND source_title IN {title_tuple}
  74. GROUP BY recommend_title
  75. ORDER BY recommend_score DESC
  76. LIMIT {limit};
  77. """
  78. return query