|
|
@@ -113,40 +113,129 @@ def extract_root_source_id(path: str) -> dict:
|
|
|
return {}
|
|
|
|
|
|
|
|
|
def decode_show_v(show_v: str) -> int:
    """Parse a human-readable count such as ``1.3万``, ``13k`` or ``1,234``.

    The first number found in ``show_v`` is scaled by the unit that directly
    follows it:

    - Chinese units (matched anywhere in the trailing unit text):
      ``亿`` = 1e8, ``万`` = 1e4, ``千`` = 1e3.
    - English units (matched on the first letter of the trailing unit text):
      ``k`` = 1e3, ``m`` = 1e6, ``b`` = 1e9.

    Comma handling:

    - A well-formed thousands-grouped integer with no scale unit behind it
      (``1,234`` / ``1,234,567``) has its commas removed.
    - Any other comma is treated as a decimal mark (``1,3k`` -> 1300).

    Args:
        show_v: Raw value text; may be empty or ``None``.

    Returns:
        The parsed count truncated toward zero, or 0 when ``show_v`` is
        empty or contains no digits.
    """
    # Function-scope import: scaling is done in exact decimal arithmetic so
    # that e.g. "1.005k" yields 1005 instead of a float-truncated 1004.
    from decimal import Decimal

    if not show_v:
        return 0

    text = show_v.strip().lower()

    # Thousands grouping: only rewrite when the digit groups are well formed
    # and no scale unit follows, so "1,3k" / "1,234k" keep the decimal-comma
    # interpretation.
    text = re.sub(
        r"(?<![\d.,])\d{1,3}(?:,\d{3})+(?![\d,])(?!\.\d)(?![kmb])"
        r"(?![a-z\u4e00-\u9fa5]*[亿万千])",
        lambda grouped: grouped.group(0).replace(",", ""),
        text,
    )
    # Remaining commas are decimal marks (European style, e.g. "1,3k").
    text = text.replace(",", ".")

    # Capture "number + unit"; the unit is the run of latin/CJK characters
    # immediately after the number.
    match = re.search(r"(\d+(?:\.\d+)?)([a-z\u4e00-\u9fa5]*)", text)
    if not match:
        return 0

    unit = match.group(2)

    # Chinese units take precedence over English ones.
    if "亿" in unit:
        scale = 10 ** 8
    elif "万" in unit:
        scale = 10 ** 4
    elif "千" in unit:
        scale = 10 ** 3
    elif unit.startswith("k"):
        scale = 10 ** 3
    elif unit.startswith("m"):
        scale = 10 ** 6
    elif unit.startswith("b"):
        scale = 10 ** 9
    else:
        scale = 1

    return int(Decimal(match.group(1)) * scale)
|
|
|
def show_desc_to_sta(show_desc: str) -> dict:
    """Parse a stats description string into normalized counters.

    ``show_desc`` is split into groups on U+2004 / U+2005. Each group must
    consist of exactly two fields separated by U+2006: a label and a count,
    in either order (the field containing a digit is taken as the count).
    ``+`` characters are removed before parsing, so ``10万+`` reads as
    ``10万``. Groups with an unrecognized label or a different field count
    are ignored. If a label occurs more than once, the last one wins.

    Args:
        show_desc: Raw description text; may be empty or ``None``.

    Returns:
        A dict with the keys ``show_view_count``, ``show_like_count``,
        ``show_pay_count`` and ``show_zs_count``; any counter that is not
        present in ``show_desc`` is 0.
    """
    # Function-scope import: scaling is done in exact decimal arithmetic so
    # that e.g. "1.005k" yields 1005 instead of a float-truncated 1004.
    from decimal import Decimal

    # Label -> normalized key. Built once per call, not once per group.
    key_aliases = {
        # Chinese labels
        "阅读": "show_view_count",
        "看过": "show_view_count",
        "观看": "show_view_count",
        "赞": "show_like_count",
        "点赞": "show_like_count",
        "付费": "show_pay_count",
        "赞赏": "show_zs_count",
        # English labels
        "reads": "show_view_count",
        "views": "show_view_count",
        "view": "show_view_count",
        "likes": "show_like_count",
        "like": "show_like_count",
        "payments": "show_pay_count",
        "paid": "show_pay_count",
    }

    def decode_show_v(show_v: str) -> int:
        """Parse a count such as ``1.3万`` / ``13k`` / ``1,234`` into an int.

        Chinese units (``亿``/``万``/``千``) are matched anywhere in the unit
        text that follows the number; English units (``k``/``m``/``b``) are
        matched on its first letter. Returns 0 when no number is found.
        """
        if not show_v:
            return 0

        text = show_v.strip().lower()

        # Thousands grouping: only rewrite when the digit groups are well
        # formed and no scale unit follows, so "1,3k" / "1,234k" keep the
        # decimal-comma interpretation.
        text = re.sub(
            r"(?<![\d.,])\d{1,3}(?:,\d{3})+(?![\d,])(?!\.\d)(?![kmb])"
            r"(?![a-z\u4e00-\u9fa5]*[亿万千])",
            lambda grouped: grouped.group(0).replace(",", ""),
            text,
        )
        # Remaining commas are decimal marks (European style, e.g. "1,3k").
        text = text.replace(",", ".")

        # Capture "number + unit".
        match = re.search(r"(\d+(?:\.\d+)?)([a-z\u4e00-\u9fa5]*)", text)
        if not match:
            return 0

        unit = match.group(2)

        # Chinese units take precedence over English ones.
        if "亿" in unit:
            scale = 10 ** 8
        elif "万" in unit:
            scale = 10 ** 4
        elif "千" in unit:
            scale = 10 ** 3
        elif unit.startswith("k"):
            scale = 10 ** 3
        elif unit.startswith("m"):
            scale = 10 ** 6
        elif unit.startswith("b"):
            scale = 10 ** 9
        else:
            scale = 1

        return int(Decimal(match.group(1)) * scale)

    def decode_show_k(show_k: str) -> str:
        """Map a Chinese or English label to its normalized counter key.

        Returns ``"show_unknown"`` for empty or unrecognized labels.
        """
        if not show_k:
            return "show_unknown"
        return key_aliases.get(show_k.strip().lower(), "show_unknown")

    # Every counter defaults to 0; parsed values overwrite in place so the
    # key order of the result never changes.
    sta = {
        "show_view_count": 0,
        "show_like_count": 0,
        "show_pay_count": 0,
        "show_zs_count": 0,
    }

    if not show_desc:
        return sta

    # Drop "+" so that "10万+" parses as "10万".
    show_desc = show_desc.replace("+", "")

    # Groups are separated by U+2004 / U+2005.
    for group in re.split(r"[\u2004\u2005]+", show_desc):
        group = group.strip()
        if not group:
            continue

        # Label and count are separated by U+2006.
        parts = group.split("\u2006")
        if len(parts) != 2:
            continue

        first, second = parts

        # Whichever field contains a digit is the count.
        if re.search(r"\d", first):
            raw_value, raw_key = first, second
        else:
            raw_key, raw_value = first, second

        key = decode_show_k(raw_key)
        if key != "show_unknown":
            sta[key] = decode_show_v(raw_value)

    return sta
|
|
|
|
|
|
|
|
|
def generate_gzh_id(url):
|