udf_python.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. # coding:utf-8
  2. from odps.udf import annotate
  3. from odps.distcache import get_cache_file
  4. @annotate("string,bigint->string")
  5. class fenci_str2str(object):
  6. def __init__(self):
  7. import sys
  8. sys.path.insert(0, './work/jieba-0.42.1-py3-none-any.zip')
  9. cache_file = get_cache_file('top_tags.txt')
  10. kv = {}
  11. for index, line in enumerate(cache_file):
  12. if index == 10000:
  13. break
  14. line = line.strip()
  15. if not line:
  16. continue
  17. k, _ = line.split('\t')
  18. kv[k] = index
  19. cache_file.close()
  20. self.kv = kv
  21. def evaluate(self, title, top_size):
  22. import jieba.analyse
  23. if title is None or len(title) == 0:
  24. return ""
  25. keys = jieba.analyse.extract_tags(title, topK=top_size, withWeight=False, allowPOS=('n', 'v'))
  26. if keys is None or len(keys) == 0:
  27. return ""
  28. keys_filter = []
  29. for k in keys:
  30. if k in self.kv.keys():
  31. keys_filter.append(k)
  32. if len(keys_filter) == 0:
  33. return ""
  34. return ",".join(keys_filter)
  35. @annotate("string,bigint,bigint->string")
  36. class get_top_tags(object):
  37. def evaluate(self, tag_ts, ts_limit, top):
  38. k_v = [kv.split(":") for kv in tag_ts.split(",")]
  39. result = dict()
  40. for tag, ts in k_v:
  41. ts = int(ts)
  42. if ts >= 0 and ts < ts_limit:
  43. tmp = result[tag] if tag in result else [0, -1]
  44. result[tag] = [tmp[0] + 1, max(tmp[1], ts)]
  45. if len(result) == 0:
  46. return ""
  47. sorted_result = sorted(result.items(), key=lambda x: (-x[1][0], -x[1][1]), reverse=False)
  48. top_keys = [item[0] for item in sorted_result[:top]]
  49. return ",".join(top_keys)
  50. @annotate("string,bigint,bigint->string")
  51. class get_top_tags_v2(object):
  52. def evaluate(self, tag_ts, ts_limit, top):
  53. k_v = [kv.split(":") for kv in tag_ts.split(",")]
  54. result = dict()
  55. for tag, ts, uv in k_v:
  56. ts = int(ts)
  57. uv = int(uv)
  58. if ts >= 0 and ts < ts_limit:
  59. tmp = result[tag] if tag in result else [0, -1]
  60. result[tag] = [tmp[0] + uv, max(tmp[1], ts)]
  61. if len(result) == 0:
  62. return ""
  63. sorted_result = sorted(result.items(), key=lambda x: (-x[1][0], -x[1][1]), reverse=False)
  64. top_keys = [item[0] for item in sorted_result[:top]]
  65. return ",".join(top_keys)
  66. @annotate("string,bigint,bigint->string")
  67. class mid_ts_dedup(object):
  68. def evaluate(self, mid_ts, ts_exp, time_diff):
  69. l = [(s.split(":")[0], s.split(":")[1]) for s in mid_ts.split(",")]
  70. m = {}
  71. for id, ts in l:
  72. ts = int(ts)
  73. if ts - ts_exp > time_diff or ts - ts_exp < 0:
  74. continue
  75. if id not in m or m[id] > ts:
  76. m[id] = ts
  77. ll = []
  78. for k, v in m.items():
  79. ll.append(k + ":" + str(v))
  80. if len(ll) == 0:
  81. return ""
  82. res = ",".join(ll)
  83. return res
  84. @annotate("string->string")
  85. class get_cate1(object):
  86. def __init__(self):
  87. self.cate_list = [
  88. "音乐",
  89. "剧情",
  90. "舞蹈",
  91. "动物",
  92. "三农",
  93. "科技",
  94. "财经",
  95. "母婴",
  96. "法律",
  97. "科普",
  98. "情感",
  99. "文化",
  100. "搞笑",
  101. "名人",
  102. "体育",
  103. "医疗健康",
  104. "时政",
  105. "奇人异象",
  106. "历史",
  107. "军事",
  108. "艺术",
  109. "美食",
  110. "旅行",
  111. "地域本地",
  112. "生活记录",
  113. "生活家居",
  114. "二次元",
  115. "游戏",
  116. "公益",
  117. "随拍",
  118. "职场",
  119. "教育",
  120. "摄影摄像",
  121. "时尚",
  122. "综艺",
  123. "电影",
  124. "电视剧",
  125. "汽车",
  126. "宗教",
  127. "短剧",
  128. "收藏品"]
  129. def evaluate(self, cate_str):
  130. if cate_str is None or cate_str == "" or cate_str == "无":
  131. return "unknown"
  132. result = []
  133. for cate in self.cate_list:
  134. if cate in cate_str:
  135. result.append(cate)
  136. if len(result) == 0:
  137. result.append("unknown_" + cate_str)
  138. res = ",".join(result)
  139. return res
  140. @annotate("string->string")
  141. class get_cate2_all(object):
  142. def __init__(self):
  143. self.cate_list = [
  144. "祝福音乐",
  145. "中国战争史",
  146. "中国历史影像",
  147. "知识科普",
  148. "正能量剧情",
  149. "杂技柔术",
  150. "早中晚好",
  151. "益智解密",
  152. "饮食健康",
  153. "戏曲戏剧",
  154. "未来科幻",
  155. "天气变化",
  156. "他国政策",
  157. "贪污腐败",
  158. "书法",
  159. "食品安全",
  160. "社会风气",
  161. "生活小妙招",
  162. "生活技巧科普",
  163. "省份城市亮点",
  164. "人生忠告",
  165. "人财诈骗",
  166. "亲子日常",
  167. "亲情音乐",
  168. "木工",
  169. "魔术特效",
  170. "迷信祝福",
  171. "民族异域音乐",
  172. "民生政策",
  173. "名画赏析",
  174. "美食教程",
  175. "麻将",
  176. "旅行攻略",
  177. "历史名人",
  178. "老综艺影像",
  179. "老年相关法律科普",
  180. "老年时尚",
  181. "老年审美美女",
  182. "老年生活",
  183. "老明星",
  184. "惊奇事件",
  185. "节日祝福",
  186. "健身操",
  187. "健康知识",
  188. "惠民新闻",
  189. "绘画",
  190. "怀念时光",
  191. "红歌老歌",
  192. "罕见画面",
  193. "国际文化",
  194. "国家统一",
  195. "国家力量",
  196. "国家科技力量",
  197. "搞笑段子",
  198. "风景实拍",
  199. "对口型表演",
  200. "动物萌宠",
  201. "动物表演",
  202. "大型集体艺术",
  203. "当代正能量人物",
  204. "传统文化",
  205. "吃播探店",
  206. "长寿知识",
  207. "本地生活",
  208. "K12教育",
  209. "(老)电影切片"
  210. ]
  211. def evaluate(self, cate_str):
  212. result = []
  213. for cate in self.cate_list:
  214. if cate in cate_str:
  215. result.append(cate)
  216. if len(result) == 0:
  217. result.append("unknown")
  218. res = ",".join(result)
  219. return res
  220. @annotate("string->string")
  221. class get_cate2_only(object):
  222. def __init__(self):
  223. self.cate_list = [
  224. "祝福音乐",
  225. "中国战争史",
  226. "中国历史影像",
  227. "知识科普",
  228. "正能量剧情",
  229. "杂技柔术",
  230. "早中晚好",
  231. "益智解密",
  232. "饮食健康",
  233. "戏曲戏剧",
  234. "未来科幻",
  235. "天气变化",
  236. "他国政策",
  237. "贪污腐败",
  238. "书法",
  239. "食品安全",
  240. "社会风气",
  241. "生活小妙招",
  242. "生活技巧科普",
  243. "省份城市亮点",
  244. "人生忠告",
  245. "人财诈骗",
  246. "亲子日常",
  247. "亲情音乐",
  248. "木工",
  249. "魔术特效",
  250. "迷信祝福",
  251. "民族异域音乐",
  252. "民生政策",
  253. "名画赏析",
  254. "美食教程",
  255. "麻将",
  256. "旅行攻略",
  257. "历史名人",
  258. "老综艺影像",
  259. "老年相关法律科普",
  260. "老年时尚",
  261. "老年审美美女",
  262. "老年生活",
  263. "老明星",
  264. "惊奇事件",
  265. "节日祝福",
  266. "健身操",
  267. "健康知识",
  268. "惠民新闻",
  269. "绘画",
  270. "怀念时光",
  271. "红歌老歌",
  272. "罕见画面",
  273. "国际文化",
  274. "国家统一",
  275. "国家力量",
  276. "国家科技力量",
  277. "搞笑段子",
  278. "风景实拍",
  279. "对口型表演",
  280. "动物萌宠",
  281. "动物表演",
  282. "大型集体艺术",
  283. "当代正能量人物",
  284. "传统文化",
  285. "吃播探店",
  286. "长寿知识",
  287. "本地生活",
  288. "K12教育",
  289. "(老)电影切片"
  290. ]
  291. def evaluate(self, cate_str):
  292. result = "unknown"
  293. for cate in self.cate_list:
  294. if cate in cate_str:
  295. result = cate
  296. break
  297. if result == "unknown":
  298. result = cate_str.split("、")[0].replace("品类-", "").replace("品类", "").replace("分数", "").replace("一人", "")
  299. result = self.clean_text(result)
  300. if len(result) == 0:
  301. result = "unknown"
  302. return result
  303. def clean_text(self, input_text):
  304. """
  305. 去除字符串中的标点符号,只保留汉字和英文字符。
  306. 参数:
  307. input_text (str): 输入的字符串。
  308. 返回:
  309. str: 处理后的字符串。
  310. """
  311. # 使用正则表达式匹配汉字和英文字符
  312. import re
  313. cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', '', input_text)
  314. return cleaned_text
  315. @annotate("string->string")
  316. class clean_text(object):
  317. def __init__(self):
  318. import re
  319. self.re = re
  320. def evaluate(self, input_text):
  321. if input_text is None:
  322. return ""
  323. cleaned_text = self.re.sub(r"[^\u4e00-\u9fa5a-zA-Z0-9]", "", input_text)
  324. return "" if cleaned_text is None else cleaned_text
  325. @annotate("string->string")
  326. class deduplication4list(object):
  327. def evaluate(self, input_text):
  328. if input_text is None:
  329. return None
  330. result = list(set(input_text.split(",")))
  331. return ",".join(result)