|
@@ -2,6 +2,7 @@
|
|
|
@author: luojunhui
|
|
@author: luojunhui
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
|
|
+import re
|
|
|
import oss2
|
|
import oss2
|
|
|
import random
|
|
import random
|
|
|
import string
|
|
import string
|
|
@@ -111,52 +112,107 @@ def extract_root_source_id(path: str) -> dict:
|
|
|
else:
|
|
else:
|
|
|
return {}
|
|
return {}
|
|
|
|
|
|
|
|
-
|
|
|
|
|
def show_desc_to_sta(show_desc):
|
|
def show_desc_to_sta(show_desc):
|
|
|
- def decode_show_v(show_v):
|
|
|
|
|
|
|
+ def decode_show_v(show_v: str) -> int:
|
|
|
"""
|
|
"""
|
|
|
-
|
|
|
|
|
- :param show_v:
|
|
|
|
|
- :return:
|
|
|
|
|
|
|
+ 解析数值,支持:
|
|
|
|
|
+ - 1.2万 / 3千 / 5亿
|
|
|
|
|
+ - 158 / 3
|
|
|
|
|
+ - 2.3万阅读 / 158reads(自动提取数字)
|
|
|
"""
|
|
"""
|
|
|
- foo = show_v.replace("千", "e3").replace("万", "e4").replace("亿", "e8")
|
|
|
|
|
- foo = eval(foo)
|
|
|
|
|
- return int(foo)
|
|
|
|
|
|
|
+ if not show_v:
|
|
|
|
|
+ return 0
|
|
|
|
|
|
|
|
- def decode_show_k(show_k):
|
|
|
|
|
- """
|
|
|
|
|
|
|
+ show_v = show_v.strip().lower()
|
|
|
|
|
+
|
|
|
|
|
+ # 提取数字(支持小数)
|
|
|
|
|
+ match = re.search(r"\d+(\.\d+)?", show_v)
|
|
|
|
|
+ if not match:
|
|
|
|
|
+ return 0
|
|
|
|
|
+
|
|
|
|
|
+ num = float(match.group())
|
|
|
|
|
+
|
|
|
|
|
+ # 单位换算(中文)
|
|
|
|
|
+ if "亿" in show_v:
|
|
|
|
|
+ num *= 1e8
|
|
|
|
|
+ elif "万" in show_v:
|
|
|
|
|
+ num *= 1e4
|
|
|
|
|
+ elif "千" in show_v:
|
|
|
|
|
+ num *= 1e3
|
|
|
|
|
+
|
|
|
|
|
+ return int(num)
|
|
|
|
|
|
|
|
- :param show_k:
|
|
|
|
|
- :return:
|
|
|
|
|
|
|
+ def decode_show_k(show_k: str) -> str:
|
|
|
"""
|
|
"""
|
|
|
|
|
+ 统一 key 映射(支持中英文)
|
|
|
|
|
+ """
|
|
|
|
|
+ if not show_k:
|
|
|
|
|
+ return "show_unknown"
|
|
|
|
|
+
|
|
|
|
|
+ show_k = show_k.strip().lower()
|
|
|
|
|
+
|
|
|
this_dict = {
|
|
this_dict = {
|
|
|
- "阅读": "show_view_count", # 文章
|
|
|
|
|
- "看过": "show_view_count", # 图文
|
|
|
|
|
- "观看": "show_view_count", # 视频
|
|
|
|
|
|
|
+ # 中文
|
|
|
|
|
+ "阅读": "show_view_count",
|
|
|
|
|
+ "看过": "show_view_count",
|
|
|
|
|
+ "观看": "show_view_count",
|
|
|
"赞": "show_like_count",
|
|
"赞": "show_like_count",
|
|
|
"付费": "show_pay_count",
|
|
"付费": "show_pay_count",
|
|
|
"赞赏": "show_zs_count",
|
|
"赞赏": "show_zs_count",
|
|
|
|
|
+
|
|
|
|
|
+ # 英文
|
|
|
|
|
+ "reads": "show_view_count",
|
|
|
|
|
+ "views": "show_view_count",
|
|
|
|
|
+ "likes": "show_like_count",
|
|
|
|
|
+ "payments": "show_pay_count",
|
|
|
|
|
+ "paid": "show_pay_count",
|
|
|
}
|
|
}
|
|
|
- if show_k not in this_dict:
|
|
|
|
|
- print(f"error from decode_show_k, show_k not found: {show_k}")
|
|
|
|
|
|
|
+
|
|
|
return this_dict.get(show_k, "show_unknown")
|
|
return this_dict.get(show_k, "show_unknown")
|
|
|
|
|
|
|
|
|
|
+ if not show_desc:
|
|
|
|
|
+ return {
|
|
|
|
|
+ "show_view_count": 0,
|
|
|
|
|
+ "show_like_count": 0,
|
|
|
|
|
+ "show_pay_count": 0,
|
|
|
|
|
+ "show_zs_count": 0,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ # 去掉 "+"
|
|
|
show_desc = show_desc.replace("+", "")
|
|
show_desc = show_desc.replace("+", "")
|
|
|
|
|
+
|
|
|
sta = {}
|
|
sta = {}
|
|
|
- for show_kv in show_desc.split("\u2004\u2005"):
|
|
|
|
|
- if not show_kv:
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 按分组分隔符切分(兼容不同空白字符)
|
|
|
|
|
+ for show_kv in re.split(r"[\u2004\u2005]+", show_desc):
|
|
|
|
|
+ if not show_kv.strip():
|
|
|
continue
|
|
continue
|
|
|
- show_k, show_v = show_kv.split("\u2006")
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 再按 key-value 分隔符切
|
|
|
|
|
+ parts = show_kv.split("\u2006")
|
|
|
|
|
+ if len(parts) != 2:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ a, b = parts
|
|
|
|
|
+
|
|
|
|
|
+ # 自动判断谁是 value(数字)
|
|
|
|
|
+ if re.search(r"\d", a):
|
|
|
|
|
+ show_v, show_k = a, b
|
|
|
|
|
+ else:
|
|
|
|
|
+ show_k, show_v = a, b
|
|
|
|
|
+
|
|
|
k = decode_show_k(show_k)
|
|
k = decode_show_k(show_k)
|
|
|
v = decode_show_v(show_v)
|
|
v = decode_show_v(show_v)
|
|
|
- sta[k] = v
|
|
|
|
|
- res = {
|
|
|
|
|
|
|
+
|
|
|
|
|
+ if k != "show_unknown":
|
|
|
|
|
+ sta[k] = v
|
|
|
|
|
+
|
|
|
|
|
+ return {
|
|
|
"show_view_count": sta.get("show_view_count", 0),
|
|
"show_view_count": sta.get("show_view_count", 0),
|
|
|
"show_like_count": sta.get("show_like_count", 0),
|
|
"show_like_count": sta.get("show_like_count", 0),
|
|
|
"show_pay_count": sta.get("show_pay_count", 0),
|
|
"show_pay_count": sta.get("show_pay_count", 0),
|
|
|
"show_zs_count": sta.get("show_zs_count", 0),
|
|
"show_zs_count": sta.get("show_zs_count", 0),
|
|
|
}
|
|
}
|
|
|
- return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_gzh_id(url):
|
|
def generate_gzh_id(url):
|