|
@@ -2,6 +2,7 @@
|
|
|
@author: luojunhui
|
|
@author: luojunhui
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
|
|
+import re
|
|
|
import oss2
|
|
import oss2
|
|
|
import random
|
|
import random
|
|
|
import string
|
|
import string
|
|
@@ -112,51 +113,129 @@ def extract_root_source_id(path: str) -> dict:
|
|
|
return {}
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
def show_desc_to_sta(show_desc: str):
    """Parse a "show" description string into engagement counters.

    The description is a run of groups separated by the Unicode spaces
    U+2004 / U+2005.  Inside a group, a label and an amount are separated by
    U+2006 and may appear in either order; whichever side contains a digit is
    taken as the amount.  Every ``+`` is removed before parsing.

    Groups that do not split into exactly two parts, and groups whose label
    is not recognised, are ignored.  If a counter appears more than once the
    last occurrence wins.

    :param show_desc: raw description text; ``None`` or ``""`` gives all zeros.
    :return: dict with exactly the keys ``show_view_count``,
        ``show_like_count``, ``show_pay_count`` and ``show_zs_count``;
        counters that are absent from the text are 0.
    """
    # Function-scope imports keep this block self-contained.
    import re
    from decimal import Decimal

    # Checked in this order with a substring test on the unit text.
    chinese_units = (("亿", 10 ** 8), ("万", 10 ** 4), ("千", 10 ** 3))
    # Matched against the first letter of the unit text ("k", "m", "b").
    english_units = {"k": 10 ** 3, "m": 10 ** 6, "b": 10 ** 9}

    # Built once per call instead of once per parsed group.
    label_to_key = {
        # Chinese labels
        "阅读": "show_view_count",
        "看过": "show_view_count",
        "观看": "show_view_count",
        "赞": "show_like_count",
        "点赞": "show_like_count",
        "付费": "show_pay_count",
        "赞赏": "show_zs_count",
        # English labels
        "reads": "show_view_count",
        "views": "show_view_count",
        "view": "show_view_count",
        "likes": "show_like_count",
        "like": "show_like_count",
        "payments": "show_pay_count",
        "paid": "show_pay_count",
    }

    def decode_show_v(show_v: str) -> int:
        """Convert an amount such as ``1.3万``, ``13k`` or ``1,234`` to an int.

        Returns 0 when the text is empty or contains no number.  The result
        is truncated toward zero after the unit multiplier is applied.
        """
        if not show_v:
            return 0

        text = show_v.strip().lower()

        # "1,234" / "1,234,567" use commas as thousands separators: drop
        # them.  Any comma left afterwards is treated as a decimal mark
        # ("1,3k" -> "1.3k").
        text = re.sub(
            r"(?<![\d.,])\d{1,3}(?:,\d{3})+(?![\d,])",
            lambda grouped: grouped.group(0).replace(",", ""),
            text,
        )
        text = text.replace(",", ".")

        # First number in the text plus the letters/CJK characters glued to it.
        match = re.search(r"(\d+(?:\.\d+)?)([a-z\u4e00-\u9fa5]*)", text)
        if not match:
            return 0

        # Decimal keeps the scaling exact; float multiplication can land just
        # below the intended integer and be truncated one too low.
        amount = Decimal(match.group(1))
        unit = match.group(2)

        for symbol, factor in chinese_units:
            if symbol in unit:
                return int(amount * factor)
        return int(amount * english_units.get(unit[:1], 1))

    def decode_show_k(show_k: str) -> str:
        """Map a label to its counter key, or ``"show_unknown"`` if unmapped."""
        if not show_k:
            return "show_unknown"
        return label_to_key.get(show_k.strip().lower(), "show_unknown")

    sta = {
        "show_view_count": 0,
        "show_like_count": 0,
        "show_pay_count": 0,
        "show_zs_count": 0,
    }
    if not show_desc:
        return sta

    # "10万+" style suffixes carry no numeric information.
    cleaned = show_desc.replace("+", "")

    for group in re.split(r"[\u2004\u2005]+", cleaned):
        group = group.strip()
        if not group:
            continue

        parts = group.split("\u2006")
        if len(parts) != 2:
            continue
        first, second = parts

        # Either order is accepted; the side holding a digit is the amount.
        if re.search(r"\d", first):
            show_v, show_k = first, second
        else:
            show_k, show_v = first, second

        key = decode_show_k(show_k)
        if key != "show_unknown":
            sta[key] = decode_show_v(show_v)

    return sta
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_gzh_id(url):
|
|
def generate_gzh_id(url):
|