|
@@ -112,107 +112,41 @@ def extract_root_source_id(path: str) -> dict:
|
|
|
else:
|
|
else:
|
|
|
return {}
|
|
return {}
|
|
|
|
|
|
|
|
-def show_desc_to_sta(show_desc):
|
|
|
|
|
- def decode_show_v(show_v: str) -> int:
|
|
|
|
|
- """
|
|
|
|
|
- 解析数值,支持:
|
|
|
|
|
- - 1.2万 / 3千 / 5亿
|
|
|
|
|
- - 158 / 3
|
|
|
|
|
- - 2.3万阅读 / 158reads(自动提取数字)
|
|
|
|
|
- """
|
|
|
|
|
- if not show_v:
|
|
|
|
|
- return 0
|
|
|
|
|
-
|
|
|
|
|
- show_v = show_v.strip().lower()
|
|
|
|
|
-
|
|
|
|
|
- # 提取数字(支持小数)
|
|
|
|
|
- match = re.search(r"\d+(\.\d+)?", show_v)
|
|
|
|
|
- if not match:
|
|
|
|
|
- return 0
|
|
|
|
|
-
|
|
|
|
|
- num = float(match.group())
|
|
|
|
|
-
|
|
|
|
|
- # 单位换算(中文)
|
|
|
|
|
- if "亿" in show_v:
|
|
|
|
|
- num *= 1e8
|
|
|
|
|
- elif "万" in show_v:
|
|
|
|
|
- num *= 1e4
|
|
|
|
|
- elif "千" in show_v:
|
|
|
|
|
- num *= 1e3
|
|
|
|
|
-
|
|
|
|
|
- return int(num)
|
|
|
|
|
-
|
|
|
|
|
- def decode_show_k(show_k: str) -> str:
|
|
|
|
|
- """
|
|
|
|
|
- 统一 key 映射(支持中英文)
|
|
|
|
|
- """
|
|
|
|
|
- if not show_k:
|
|
|
|
|
- return "show_unknown"
|
|
|
|
|
-
|
|
|
|
|
- show_k = show_k.strip().lower()
|
|
|
|
|
-
|
|
|
|
|
- this_dict = {
|
|
|
|
|
- # 中文
|
|
|
|
|
- "阅读": "show_view_count",
|
|
|
|
|
- "看过": "show_view_count",
|
|
|
|
|
- "观看": "show_view_count",
|
|
|
|
|
- "赞": "show_like_count",
|
|
|
|
|
- "付费": "show_pay_count",
|
|
|
|
|
- "赞赏": "show_zs_count",
|
|
|
|
|
-
|
|
|
|
|
- # 英文
|
|
|
|
|
- "reads": "show_view_count",
|
|
|
|
|
- "views": "show_view_count",
|
|
|
|
|
- "likes": "show_like_count",
|
|
|
|
|
- "payments": "show_pay_count",
|
|
|
|
|
- "paid": "show_pay_count",
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- return this_dict.get(show_k, "show_unknown")
|
|
|
|
|
-
|
|
|
|
|
- if not show_desc:
|
|
|
|
|
- return {
|
|
|
|
|
- "show_view_count": 0,
|
|
|
|
|
- "show_like_count": 0,
|
|
|
|
|
- "show_pay_count": 0,
|
|
|
|
|
- "show_zs_count": 0,
|
|
|
|
|
- }
|
|
|
|
|
|
|
|
|
|
- # 去掉 "+"
|
|
|
|
|
- show_desc = show_desc.replace("+", "")
|
|
|
|
|
|
|
+def decode_show_v(show_v: str) -> int:
|
|
|
|
|
+ if not show_v:
|
|
|
|
|
+ return 0
|
|
|
|
|
|
|
|
- sta = {}
|
|
|
|
|
|
|
+ show_v = show_v.strip().lower()
|
|
|
|
|
|
|
|
- # 按分组分隔符切分(兼容不同空白字符)
|
|
|
|
|
- for show_kv in re.split(r"[\u2004\u2005]+", show_desc):
|
|
|
|
|
- if not show_kv.strip():
|
|
|
|
|
- continue
|
|
|
|
|
|
|
+ # 统一小数点(防 1,3k 这种)
|
|
|
|
|
+ show_v = show_v.replace(",", ".")
|
|
|
|
|
|
|
|
- # 再按 key-value 分隔符切
|
|
|
|
|
- parts = show_kv.split("\u2006")
|
|
|
|
|
- if len(parts) != 2:
|
|
|
|
|
- continue
|
|
|
|
|
|
|
+ # 👇 核心:提取“数字 + 单位”
|
|
|
|
|
+ match = re.search(r"(\d+(?:\.\d+)?)([a-z\u4e00-\u9fa5]*)", show_v)
|
|
|
|
|
+ if not match:
|
|
|
|
|
+ return 0
|
|
|
|
|
|
|
|
- a, b = parts
|
|
|
|
|
|
|
+ num = float(match.group(1))
|
|
|
|
|
+ unit = match.group(2)
|
|
|
|
|
|
|
|
- # 自动判断谁是 value(数字)
|
|
|
|
|
- if re.search(r"\d", a):
|
|
|
|
|
- show_v, show_k = a, b
|
|
|
|
|
- else:
|
|
|
|
|
- show_k, show_v = a, b
|
|
|
|
|
-
|
|
|
|
|
- k = decode_show_k(show_k)
|
|
|
|
|
- v = decode_show_v(show_v)
|
|
|
|
|
|
|
+ # 中文单位
|
|
|
|
|
+ if "亿" in unit:
|
|
|
|
|
+ num *= 1e8
|
|
|
|
|
+ elif "万" in unit:
|
|
|
|
|
+ num *= 1e4
|
|
|
|
|
+ elif "千" in unit:
|
|
|
|
|
+ num *= 1e3
|
|
|
|
|
|
|
|
- if k != "show_unknown":
|
|
|
|
|
- sta[k] = v
|
|
|
|
|
|
|
+ # 英文单位(重点)
|
|
|
|
|
+ elif unit.startswith("k"):
|
|
|
|
|
+ num *= 1e3
|
|
|
|
|
+ elif unit.startswith("m"):
|
|
|
|
|
+ num *= 1e6
|
|
|
|
|
+ elif unit.startswith("b"):
|
|
|
|
|
+ num *= 1e9
|
|
|
|
|
|
|
|
- return {
|
|
|
|
|
- "show_view_count": sta.get("show_view_count", 0),
|
|
|
|
|
- "show_like_count": sta.get("show_like_count", 0),
|
|
|
|
|
- "show_pay_count": sta.get("show_pay_count", 0),
|
|
|
|
|
- "show_zs_count": sta.get("show_zs_count", 0),
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ return int(num)
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_gzh_id(url):
|
|
def generate_gzh_id(url):
|