| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- #!/usr/bin/env python3
- """
- 读取长文 JSON,计算权重 = 分发realplay_uv / 当日分发曝光uv,按权重降序输出。
- 每个输入文件单独生成一个输出文件(不合并)。
- """
- from __future__ import annotations
- import argparse
- import json
- import math
- from pathlib import Path
def load_records(path: Path) -> list[dict]:
    """Load one JSON export and compute a weight for every usable record.

    The file must contain a JSON array. For each element the weight is
    ``ext_data["分发realplay_uv"] / ext_data["当日分发曝光uv"]``.

    Elements are skipped silently when they are not JSON objects, when
    their ``ext_data`` is not an object, or when either metric is missing
    or cannot be converted to a float. A non-positive exposure produces a
    NaN weight instead of a division by zero.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        One dict per kept element, in file order, with the keys
        ``videoid`` (always a string), ``二级品类`` and ``权重值`` (a float,
        possibly NaN).

    Raises:
        ValueError: If the JSON root is not an array. Invalid JSON raises
            ``json.JSONDecodeError``, which is a ``ValueError`` subclass.
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError(f"{path} 根节点应为数组")

    rows: list[dict] = []
    for item in data:
        # Malformed elements are dropped the same way as elements with
        # missing metrics, rather than crashing on ``.get``.
        if not isinstance(item, dict):
            continue
        ext = item.get("ext_data") or {}
        if not isinstance(ext, dict):
            continue

        expose = ext.get("当日分发曝光uv")
        realplay = ext.get("分发realplay_uv")
        if expose is None or realplay is None:
            continue
        try:
            expose_f = float(expose)
            realplay_f = float(realplay)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: JSON integers are arbitrary precision and may
            # be too large to represent as a float.
            continue

        # NaN marks "no meaningful ratio"; such rows are kept, not dropped.
        weight = realplay_f / expose_f if expose_f > 0 else float("nan")
        rows.append(
            {
                "videoid": str(item.get("videoid", "")),
                "二级品类": item.get("二级品类", ""),
                "权重值": weight,
            }
        )
    return rows
def _sort_key(r: dict) -> tuple:
    """Return a sort key that ranks rows by weight, highest first.

    Rows whose ``权重值`` is NaN get a key that sorts after every finite
    weight; all other rows are ordered by the negated weight so that an
    ascending sort yields descending weights.
    """
    weight = r["权重值"]
    return (1, 0.0) if math.isnan(weight) else (0, -weight)
def default_out_path(inp: Path, out_dir: Path | None) -> Path:
    """Build the report path for one input file.

    Args:
        inp: Path of the input file; its stem names the report.
        out_dir: Directory for the report, or ``None`` to place the report
            next to the input file.

    Returns:
        ``<directory>/<input stem>_weight_rank.txt``.
    """
    target_dir = inp.parent if out_dir is None else out_dir
    return target_dir / f"{inp.stem}_weight_rank.txt"
def main() -> None:
    """Rank every input file by weight and write one report per input.

    Command-line arguments:
        inputs: Zero or more JSON paths; defaults to two account files
            under ``data/changwen_data`` next to this script.
        --out-dir: Optional directory that receives every report;
            otherwise each report is written beside its input.

    Each report is a UTF-8, tab-separated text file with one
    ``videoid<TAB>二级品类<TAB>weight`` line per record, ordered by
    ``_sort_key`` (descending weight, NaN weights last).

    Raises:
        SystemExit: If an input file does not exist, if two different
            inputs would write the same report file, or if an input
            cannot be read or parsed.
    """
    base = Path(__file__).resolve().parent / "data" / "changwen_data"
    parser = argparse.ArgumentParser(
        description="长文数据按 分发realplay_uv/当日分发曝光uv 排序;每个输入单独写一个输出文件"
    )
    parser.add_argument(
        "inputs",
        nargs="*",
        type=Path,
        default=[
            base / "奇观妙技有乾坤.json",
            base / "青史铁事漫谈.json",
        ],
        help="输入 JSON 路径(默认两个账号文件)",
    )
    parser.add_argument(
        "--out-dir",
        type=Path,
        default=None,
        help="可选:把所有输出写到该目录(文件名仍为 {原文件名去扩展}_weight_rank.txt)",
    )
    args = parser.parse_args()

    paths = [p.resolve() for p in args.inputs]
    for p in paths:
        if not p.is_file():
            raise SystemExit(f"文件不存在: {p}")
    out_dir = args.out_dir.resolve() if args.out_dir is not None else None

    # Validate all targets before writing anything: report names are derived
    # from the input stem only, so two different inputs sharing a stem would
    # otherwise silently overwrite each other's report.
    claimed: dict[Path, Path] = {}
    for p in paths:
        out = default_out_path(p, out_dir)
        owner = claimed.setdefault(out, p)
        if owner != p:
            raise SystemExit(f"输出文件冲突: {owner} 与 {p} 都会写入 {out}")

    if out_dir is not None:
        out_dir.mkdir(parents=True, exist_ok=True)

    for p in paths:
        try:
            rows = load_records(p)
        except (OSError, ValueError) as exc:
            # Covers unreadable files, invalid JSON and a non-array root;
            # exit with a message, like the missing-file case above.
            raise SystemExit(f"读取 {p.name} 失败: {exc}") from exc
        rows.sort(key=_sort_key)

        lines = []
        for r in rows:
            w = r["权重值"]
            w_str = "nan" if math.isnan(w) else f"{w:.6f}"
            lines.append(f"{r['videoid']}\t{r['二级品类']}\t{w_str}")

        # Without --out-dir the report goes next to the input, whose
        # directory necessarily exists because the input was found above.
        out = default_out_path(p, out_dir)
        text = "\n".join(lines) + ("\n" if lines else "")
        out.write_text(text, encoding="utf-8")
        print(f"{p.name}: 共 {len(lines)} 行 -> {out}")
if __name__ == "__main__":
    # Run the command-line entry point only when this file is executed as a
    # script, so importing the module has no side effects.
    main()
|