changwen_weight_rank.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. #!/usr/bin/env python3
  2. """
  3. 读取长文 JSON,计算权重 = 分发realplay_uv / 当日分发曝光uv,按权重降序输出。
  4. 每个输入文件单独生成一个输出文件(不合并)。
  5. """
  6. from __future__ import annotations
  7. import argparse
  8. import json
  9. import math
  10. from pathlib import Path
  11. def load_records(path: Path) -> list[dict]:
  12. rows: list[dict] = []
  13. with open(path, encoding="utf-8") as f:
  14. data = json.load(f)
  15. if not isinstance(data, list):
  16. raise ValueError(f"{path} 根节点应为数组")
  17. for item in data:
  18. ext = item.get("ext_data") or {}
  19. expose = ext.get("当日分发曝光uv")
  20. realplay = ext.get("分发realplay_uv")
  21. if expose is None or realplay is None:
  22. continue
  23. try:
  24. expose_f = float(expose)
  25. realplay_f = float(realplay)
  26. except (TypeError, ValueError):
  27. continue
  28. if expose_f <= 0:
  29. weight = float("nan")
  30. else:
  31. weight = realplay_f / expose_f
  32. rows.append(
  33. {
  34. "videoid": str(item.get("videoid", "")),
  35. "二级品类": item.get("二级品类", ""),
  36. "权重值": weight,
  37. }
  38. )
  39. return rows
  40. def _sort_key(r: dict) -> tuple:
  41. w = r["权重值"]
  42. if math.isnan(w):
  43. return (1, 0.0)
  44. return (0, -w)
  45. def default_out_path(inp: Path, out_dir: Path | None) -> Path:
  46. name = f"{inp.stem}_weight_rank.txt"
  47. if out_dir is not None:
  48. return out_dir / name
  49. return inp.parent / name
  50. def main() -> None:
  51. base = Path(__file__).resolve().parent / "data" / "changwen_data"
  52. parser = argparse.ArgumentParser(
  53. description="长文数据按 分发realplay_uv/当日分发曝光uv 排序;每个输入单独写一个输出文件"
  54. )
  55. parser.add_argument(
  56. "inputs",
  57. nargs="*",
  58. type=Path,
  59. default=[
  60. base / "奇观妙技有乾坤.json",
  61. base / "青史铁事漫谈.json",
  62. ],
  63. help="输入 JSON 路径(默认两个账号文件)",
  64. )
  65. parser.add_argument(
  66. "--out-dir",
  67. type=Path,
  68. default=None,
  69. help="可选:把所有输出写到该目录(文件名仍为 {原文件名去扩展}_weight_rank.txt)",
  70. )
  71. args = parser.parse_args()
  72. paths = [p.resolve() for p in args.inputs]
  73. for p in paths:
  74. if not p.is_file():
  75. raise SystemExit(f"文件不存在: {p}")
  76. out_dir = args.out_dir.resolve() if args.out_dir else None
  77. if out_dir is not None:
  78. out_dir.mkdir(parents=True, exist_ok=True)
  79. for p in paths:
  80. rows = load_records(p)
  81. rows.sort(key=_sort_key)
  82. lines = []
  83. for r in rows:
  84. w = r["权重值"]
  85. w_str = "nan" if w != w else f"{w:.6f}"
  86. lines.append(f"{r['videoid']}\t{r['二级品类']}\t{w_str}")
  87. out = default_out_path(p, out_dir)
  88. if out_dir is None:
  89. out.parent.mkdir(parents=True, exist_ok=True)
  90. text = "\n".join(lines) + ("\n" if lines else "")
  91. out.write_text(text, encoding="utf-8")
  92. print(f"{p.name}: 共 {len(lines)} 行 -> {out}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()