loghubods.alg_vid_feature_basic_info.sql 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. -- Task: 01_视频基础信息_20241223 ID: 1017892287 Type: ODPS_SQL
  2. --@exclude_input=loghubods.operators_channel_spider_dt
  3. --@exclude_input=loghubods.operators_channel_dt
  4. --@exclude_input=videoods.dim_video
  5. --@exclude_input=loghubods.content_ai_tags_no_dt
  6. --@exclude_input=loghubods.tag_level_2_base
  7. --@exclude_input=loghubods.vid_festive_labels
  8. --@exclude_input=loghubods.operators_channel_spider_day
  9. --@exclude_input=loghubods.aitags_repeatdata
  10. --@exclude_input=loghubods.operators_channel_day
  11. --odps sql
  12. --********************************************************************--
  13. --author:于卓异
  14. --create time:2024-06-11 16:47:23
  15. --********************************************************************--
  16. CREATE TABLE IF NOT EXISTS loghubods.alg_vid_feature_basic_info
  17. (
  18. vid STRING
  19. ,feature JSON
  20. )
  21. PARTITIONED BY
  22. (
  23. dt STRING COMMENT '日期:20240105'
  24. ,hh STRING COMMENT '小时:04'
  25. )
  26. STORED AS ALIORC
  27. TBLPROPERTIES ('comment' = '视频基础特征')
  28. LIFECYCLE 90
  29. ;
  30. SET odps.sql.python.version = cp37
  31. ;
  32. INSERT OVERWRITE TABLE loghubods.alg_vid_feature_basic_info PARTITION (dt = '${dt}',hh = '${hh}')
  33. WITH t_video_merge_cate AS (
  34. SELECT a.vid As vid
  35. ,merge_second_level_cate
  36. ,CASE
  37. -- 音乐相关
  38. WHEN merge_second_level_cate REGEXP '祝福音乐|人生感悟音乐|民族异域音乐|亲情音乐|红歌老歌|音乐知识' THEN '音乐'
  39. -- 剧情/剧情演绎相关
  40. WHEN merge_second_level_cate REGEXP '正能量剧情|对口型表演|快闪' THEN '剧情/剧情演绎'
  41. -- 游戏相关
  42. WHEN merge_second_level_cate REGEXP '拟真游戏|麻将|棋牌' THEN '游戏'
  43. -- 随拍/颜值相关
  44. WHEN merge_second_level_cate REGEXP '老年审美美女|老年审美帅哥' THEN '随拍/颜值'
  45. -- 舞蹈相关
  46. WHEN merge_second_level_cate REGEXP '红歌老歌舞蹈|广场舞|舞蹈教程' THEN '舞蹈'
  47. -- 动物/萌宠相关
  48. WHEN merge_second_level_cate REGEXP '宠物日常|动物表演|生动物' THEN '动物/萌宠'
  49. -- 三农相关
  50. WHEN merge_second_level_cate REGEXP '农村生活|农业技术' THEN '三农'
  51. -- 科技/科技数码相关
  52. WHEN merge_second_level_cate REGEXP '老年相关科技|未来科幻|国家科技力量' THEN '科技/科技数码'
  53. -- 财经相关
  54. WHEN merge_second_level_cate REGEXP '保险|理财' THEN '财经'
  55. -- 母婴/母婴亲子相关
  56. WHEN merge_second_level_cate REGEXP '亲子日常|K12教育' THEN '母婴/母婴亲子'
  57. -- 法律/科普/人文社科相关
  58. WHEN merge_second_level_cate REGEXP '老年相关法律科普|知识科普|生活技巧科普' THEN '法律/科普/人文社科'
  59. -- 情感/情感心理相关
  60. WHEN merge_second_level_cate REGEXP '怀念时光|人生忠告|迷信祝福|节日祝福|早中晚好' THEN '情感/情感心理'
  61. -- 职场/人文社科相关
  62. WHEN merge_second_level_cate REGEXP '退休前|退休后' THEN '职场/人文社科'
  63. -- 教育/教育培训相关
  64. WHEN merge_second_level_cate REGEXP '益智解密|老年教育' THEN '教育/教育培训'
  65. -- 摄影摄像相关
  66. WHEN merge_second_level_cate REGEXP '风景实拍|动植物实拍|人像模特实拍|摄影教学' THEN '摄影摄像'
  67. -- 艺术/才艺技能相关
  68. WHEN merge_second_level_cate REGEXP '名画赏析|杂技柔术|魔术|魔术特效|书法|绘画|木工|口技|大型集体艺术|戏曲戏剧|二人转|其他才艺' THEN '艺术/才艺技能'
  69. -- 美食相关
  70. WHEN merge_second_level_cate REGEXP '美食测评|美食教程|吃播探店' THEN '美食'
  71. -- 旅行/旅游相关
  72. WHEN merge_second_level_cate REGEXP '旅行记录|旅行攻略' THEN '旅行/旅游'
  73. -- 地域本地相关
  74. WHEN merge_second_level_cate REGEXP '省份城市亮点|本地新闻|本地生活' THEN '地域本地'
  75. -- 时尚/美妆相关
  76. WHEN merge_second_level_cate REGEXP '老年时尚|美妆护肤穿搭' THEN '时尚/美妆'
  77. -- 文化/人文社科相关
  78. WHEN merge_second_level_cate REGEXP '传统文化|国际文化' THEN '文化/人文社科'
  79. -- 搞笑/休闲娱乐相关
  80. WHEN merge_second_level_cate REGEXP '搞笑瞬间合集|搞笑段子' THEN '搞笑/休闲娱乐'
  81. -- 明星/名人相关
  82. WHEN merge_second_level_cate REGEXP '历史名人|当代正能量人物|老明星' THEN '明星/名人'
  83. -- 综艺/影视综艺相关
  84. WHEN merge_second_level_cate REGEXP '老年人上综艺|老年关心纪录片|老综艺影像|电影切片|电影解说|电视剧切片|电视剧解说' THEN '综艺/影视综艺'
  85. -- 体育/运动相关
  86. WHEN merge_second_level_cate REGEXP '中国队比赛|老年运动' THEN '体育/运动'
  87. -- 医疗健康/长寿/健身相关
  88. WHEN merge_second_level_cate REGEXP '健康知识|长寿知识|饮食健康' THEN '医疗健康/长寿/健身'
  89. -- 生活记录/生活相关
  90. WHEN merge_second_level_cate REGEXP '健身操' THEN '生活记录/生活'
  91. -- 生活家居/家居家装相关
  92. WHEN merge_second_level_cate REGEXP '老年生活|生活小妙招|园艺花艺' THEN '生活家居/家居家装'
  93. -- 时政社会相关
  94. WHEN merge_second_level_cate REGEXP '民生政策|流行病疫情|社会风气|食品安全|贪污腐败|人财诈骗|核污染|惠民新闻|天气变化|国家力量|国际时政|他国政策' THEN '时政社会'
  95. -- 奇人异象相关
  96. WHEN merge_second_level_cate REGEXP '惊奇事件|罕见画面' THEN '奇人异象'
  97. -- 历史相关
  98. WHEN merge_second_level_cate REGEXP '中国战争史|中国党史|中国历史影像' THEN '历史'
  99. -- 军事相关
  100. WHEN merge_second_level_cate REGEXP '国际军事|国内军事|国家统一' THEN '军事'
  101. ELSE b.first_level_cate
  102. END AS merge_first_level_cate
  103. FROM (
  104. SELECT DISTINCT a.id AS vid
  105. ,CASE WHEN b.tag_level_2 IS NOT NULL THEN b.tag_level_2
  106. ELSE c.tag_name_1
  107. END AS merge_second_level_cate
  108. FROM (
  109. SELECT DISTINCT id
  110. ,CONCAT(clear_title_signal(title),'-',total_time) AS title_duration
  111. FROM videoods.wx_video_per1h
  112. ) a
  113. LEFT JOIN (
  114. SELECT *
  115. FROM loghubods.tag_level_2_base
  116. ) b
  117. ON a.title_duration = b.title_duration
  118. LEFT JOIN (
  119. SELECT DISTINCT title_duration
  120. ,tags
  121. ,tag_name_1
  122. ,tag_name_2
  123. ,tag_name_3
  124. FROM loghubods.content_ai_tags_no_dt
  125. ) c
  126. ON a.title_duration = c.title_duration
  127. ) a
  128. LEFT JOIN (
  129. SELECT DISTINCT title_duration
  130. ,一级品类 AS first_level_cate
  131. ,videoid
  132. FROM (
  133. SELECT CONCAT(CLEAR_TITLE_SIGNAL(title),'-',total_time) AS title_duration
  134. ,一级品类
  135. ,videoid
  136. ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAR_TITLE_SIGNAL(title),'-',total_time) ORDER BY videoid DESC ) AS rank
  137. FROM (
  138. SELECT DISTINCT videoid
  139. ,b.title
  140. ,b.total_time
  141. ,SUBSTRING_INDEX(words_1,'_',-1) AS 一级品类
  142. FROM (
  143. SELECT videoid
  144. ,words_1
  145. FROM (
  146. SELECT videoid
  147. ,words_1
  148. FROM videoods.dim_video
  149. LATERAL VIEW EXPLODE(SPLIT(tags,',')) t AS words_1
  150. )
  151. WHERE words_1 REGEXP '一级品类_'
  152. ) a
  153. LEFT JOIN videoods.wx_video_per1h b
  154. ON a.videoid = b.id
  155. )
  156. )
  157. WHERE rank = 1
  158. HAVING title_duration NOT REGEXP 'None-|null-'
  159. ) b
  160. ON a.vid = b.videoid
  161. )
  162. ,t_video_merge_cate_total as
  163. (SELECT vid
  164. ,CONCAT_WS(',',topic,topic1,topic2,topic3) as ad
  165. FROM (
  166. SELECT id AS vid
  167. ,CASE WHEN te.topic REGEXP '旅行|旅游'
  168. OR te.keywords REGEXP '旅行|旅游'
  169. OR tg.merge_first_level_cate REGEXP '旅行|旅游'
  170. OR tg.merge_second_level_cate REGEXP '旅行|旅游'
  171. OR te.cate2 REGEXP '旅行|旅游'
  172. OR te.cate2_list REGEXP '旅行|旅游'
  173. OR te.cate1_list REGEXP '旅行|旅游' THEN '旅游'
  174. ELSE '其他'
  175. END AS topic
  176. ,'其他' AS topic1
  177. ,'其他' AS topic2
  178. ,'其他' AS topic3
  179. FROM videoods.wx_video_per1h ta
  180. LEFT JOIN (
  181. SELECT *
  182. FROM (
  183. SELECT video_id
  184. ,video_title
  185. ,title_duration
  186. ,IF(视频一级分类 IS NULL,"unknown",GET_CATE1(视频一级分类)) AS cate1_list -- 目前只有一个
  187. ,IF(二级分类 IS NULL,"unknown",GET_CATE2_ALL(二级分类)) AS cate2_list
  188. ,IF(tag_1 IS NULL,"unknown",GET_CATE2_ONLY(tag_1)) AS cate2
  189. ,COALESCE(内容选题,"") AS topic -- 几个字的话题总结
  190. ,COALESCE(视频主题,"") AS theme -- 一小段话的主题总结
  191. ,IF(视频关键词 IS NULL,"",REGEXP_REPLACE(视频关键词,'[\\[\\]\"]','')) AS keywords -- 视频关键词
  192. ,IF(视频风格 IS NULL,"",REGEXP_REPLACE(视频风格,',',',')) AS style -- 视频风格
  193. -- ,情感倾向,是否有片尾引导,传播性判断,推测观众年龄段,推测观众性别,推测观众价值类型,推测观众用观众收入
  194. ,IF(推测观众用户价值点 IS NULL,"",REPLACE(REGEXP_REPLACE(推测观众用户价值点,'[\\[\\]\"]',''),",",",")) AS user_value -- 推测观众用户价值点
  195. ,CASE WHEN 时效性_有无时效 IS NULL THEN "0"
  196. WHEN 时效性_有无时效 = "有" THEN "1"
  197. WHEN 时效性_有无时效 = "无" THEN "0"
  198. ELSE "0"
  199. END AS if_timeliness -- 时效性_有无时效
  200. ,IF(时效性_具体时间 IS NULL,"",REGEXP_REPLACE(时效性_具体时间,',',',')) AS timeliness
  201. ,是否节日视频 AS is_fes
  202. ,IF(是否节日视频 = '是',节日名称,NULL) AS fes_name
  203. ,IF(是否节日视频 = '是',具体时效,NULL) AS fes_time
  204. ,ARRAY_JOIN(
  205. SLICE(
  206. ARRAY_DISTINCT(
  207. REGEXP_EXTRACT_ALL(推测观众地域,'上海|云南|北京|印度|台湾|吉林|四川|天津|宁夏|安徽|山东|山西|巴西|广东|广西|德国|新疆|日本|朝鲜|江苏|江西|河北|河南|法国|泰国|浙江|海南|湖北|湖南|澳门|甘肃|福建|缅甸|美国|英国|西藏|贵州|越南|辽宁|迪拜|重庆|陕西|青海|韩国|香港|俄罗斯|内蒙古|加拿大|新加坡|菲律宾|黑龙江|巴基斯坦|澳大利亚|马来西亚',0)
  208. )
  209. ,1,5)
  210. ,',') AS video_region
  211. ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAN_TEXT(video_title),"-",SUBSTRING_INDEX(title_duration,"-",-1)) ORDER BY dt DESC ) AS rank
  212. FROM loghubods.aitags_repeatdata
  213. WHERE dt >= "${dt-1}"
  214. )
  215. WHERE rank = 1
  216. ) te
  217. ON CONCAT(CLEAN_TEXT(ta.title),"-",CAST(ta.total_time AS STRING)) = CONCAT(CLEAN_TEXT(te.video_title),"-",SUBSTRING_INDEX(te.title_duration,"-",-1))
  218. LEFT JOIN (
  219. SELECT *
  220. FROM t_video_merge_cate
  221. ) tg
  222. ON ta.id = tg.vid
  223. WHERE id IS NOT NULL
  224. AND ta.title IS NOT NULL
  225. AND ta.total_time IS NOT NULL
  226. AND (
  227. recommend_status = -6
  228. OR gmt_create_timestamp > (
  229. UNIX_TIMESTAMP(DATETIME(CURRENT_TIMESTAMP())) - 7 * 24 * 3600
  230. ) * 1000
  231. )
  232. ORDER BY CAST(id AS BIGINT) DESC
  233. ) )
  234. SELECT id AS vid
  235. ,CASE WHEN te.rank IS NOT NULL THEN JSON_OBJECT(
  236. "title_time_w_h_unionid", CONCAT(CLEAN_TEXT(COALESCE(ta.title,"")),'-',COALESCE(ta.total_time,"0"),'-',COALESCE(ta.width,"0"),'-',COALESCE(ta.height,"0")),
  237. "title",title,
  238. "title_split",FENCI_STR2STR(title,3),
  239. "width",width,
  240. "height",height,
  241. "gmt_create_timestamp",gmt_create_timestamp,
  242. "size",size,
  243. "recommend_status",recommend_status,
  244. "total_time",total_time,
  245. "bit_rate",bit_rate,
  246. "vid_source",COALESCE(tb.vid_source,"unknown"),
  247. "channel",COALESCE(tc.channel,"unknown"),
  248. "cate1_list",te.cate1_list,
  249. "cate2_list",te.cate2_list,
  250. "cate2",te.cate2,
  251. "topic",te.topic,
  252. "theme",te.theme,
  253. "keywords",te.keywords,
  254. "style",te.style,
  255. "user_value",te.user_value,
  256. "if_timeliness",te.if_timeliness,
  257. "timeliness",te.timeliness,
  258. "festive_label1",TRIM(tf.first_labels),
  259. "festive_label2",TRIM(tf.secondary_labels),
  260. "merge_second_level_cate",TRIM(tg.merge_second_level_cate),
  261. "merge_first_level_cate",TRIM(tg.merge_first_level_cate),
  262. "is_fes", te.is_fes,
  263. "fes_time", te.fes_time,
  264. "fes_name",te.fes_name,
  265. "uid",ta.uid,
  266. "video_region",te.video_region,
  267. "ad",th.ad,
  268. "attribute_province",ti.province
  269. )
  270. ELSE JSON_OBJECT(
  271. "title_time_w_h_unionid",CONCAT(CLEAN_TEXT(COALESCE(ta.title,"")),'-',COALESCE(ta.total_time,"0"),'-',COALESCE(ta.width,"0"),'-',COALESCE(ta.height,"0")),
  272. "title",title,
  273. "title_split",FENCI_STR2STR(title,3),
  274. "width",width,
  275. "height",height,
  276. "gmt_create_timestamp",gmt_create_timestamp,
  277. "size",size,
  278. "recommend_status",recommend_status,
  279. "total_time",total_time,
  280. "bit_rate",bit_rate,
  281. "vid_source",COALESCE(tb.vid_source,"unknown"),
  282. "channel",COALESCE(tc.channel,"unknown"),
  283. "festive_label1",TRIM(tf.first_labels),
  284. "festive_label2",TRIM(tf.secondary_labels),
  285. "merge_second_level_cate",TRIM(tg.merge_second_level_cate),
  286. "merge_first_level_cate",TRIM(tg.merge_first_level_cate),
  287. "is_fes", te.is_fes,
  288. "fes_time", te.fes_time,
  289. "fes_name", te.fes_name,
  290. "uid",ta.uid,
  291. "video_region",te.video_region,
  292. "ad",th.ad,
  293. "attribute_province",ti.province
  294. )
  295. END AS feature
  296. FROM videoods.wx_video_per1h ta
  297. LEFT JOIN (
  298. SELECT *
  299. FROM (
  300. SELECT videoid AS vid
  301. ,type AS vid_source
  302. ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY type DESC ) AS rn
  303. FROM loghubods.operators_channel_dt
  304. WHERE dt = MAX_PT("loghubods.operators_channel_dt")
  305. )
  306. WHERE rn = 1
  307. ) tb
  308. ON ta.id = tb.vid
  309. LEFT JOIN (
  310. SELECT *
  311. FROM (
  312. SELECT videoid AS vid
  313. ,channel AS channel
  314. ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY channel ) AS rn
  315. FROM loghubods.operators_channel_spider_dt
  316. WHERE dt = MAX_PT("loghubods.operators_channel_spider_dt")
  317. )
  318. WHERE rn = 1
  319. ) tc
  320. ON ta.id = tc.vid
  321. LEFT JOIN (
  322. SELECT *,CONCAT(cate1_list,cate2_list,cate2,topic,keywords) as cate_total
  323. FROM (
  324. SELECT video_id
  325. ,video_title
  326. ,title_duration
  327. ,IF(视频一级分类 IS NULL,"unknown",GET_CATE1(视频一级分类)) AS cate1_list -- 目前只有一个
  328. ,IF(二级分类 IS NULL,"unknown",GET_CATE2_ALL(二级分类)) AS cate2_list
  329. ,IF(tag_1 IS NULL,"unknown",GET_CATE2_ONLY(tag_1)) AS cate2
  330. ,COALESCE(内容选题,"") AS topic -- 几个字的话题总结
  331. ,COALESCE(视频主题,"") AS theme -- 一小段话的主题总结
  332. ,IF(视频关键词 IS NULL,"",REGEXP_REPLACE(视频关键词,'[\\[\\]\"]','')) AS keywords -- 视频关键词
  333. ,IF(视频风格 IS NULL,"",REGEXP_REPLACE(视频风格,',',',')) AS style -- 视频风格
  334. -- ,情感倾向,是否有片尾引导,传播性判断,推测观众年龄段,推测观众性别,推测观众价值类型,推测观众用观众收入
  335. ,IF(推测观众用户价值点 IS NULL,"",REPLACE(REGEXP_REPLACE(推测观众用户价值点,'[\\[\\]\"]',''),",",",")) AS user_value -- 推测观众用户价值点
  336. ,CASE WHEN 时效性_有无时效 IS NULL THEN "0"
  337. WHEN 时效性_有无时效 = "有" THEN "1"
  338. WHEN 时效性_有无时效 = "无" THEN "0"
  339. ELSE "0"
  340. END AS if_timeliness -- 时效性_有无时效
  341. ,IF(时效性_具体时间 IS NULL,"",REGEXP_REPLACE(时效性_具体时间,',',',')) AS timeliness
  342. ,是否节日视频 AS is_fes
  343. ,IF(是否节日视频 = '是',节日名称, NULL ) AS fes_name
  344. ,IF(是否节日视频 = '是',具体时效, NULL ) AS fes_time
  345. ,ARRAY_JOIN(SLICE(ARRAY_DISTINCT(REGEXP_EXTRACT_ALL(推测观众地域,'上海|云南|北京|印度|台湾|吉林|四川|天津|宁夏|安徽|山东|山西|巴西|广东|广西|德国|新疆|日本|朝鲜|江苏|江西|河北|河南|法国|泰国|浙江|海南|湖北|湖南|澳门|甘肃|福建|缅甸|美国|英国|西藏|贵州|越南|辽宁|迪拜|重庆|陕西|青海|韩国|香港|俄罗斯|内蒙古|加拿大|新加坡|菲律宾|黑龙江|巴基斯坦|澳大利亚|马来西亚',0)),1,5),',') AS video_region
  346. ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAN_TEXT(video_title),"-",SUBSTRING_INDEX(title_duration,"-",-1)) ORDER BY dt DESC ) AS rank
  347. FROM loghubods.aitags_repeatdata
  348. WHERE dt >= "${dt-1}"
  349. )
  350. WHERE rank = 1
  351. ) te
  352. ON CONCAT(CLEAN_TEXT(ta.title),"-",CAST(ta.total_time AS STRING)) = CONCAT(CLEAN_TEXT(te.video_title),"-",SUBSTRING_INDEX(te.title_duration,"-",-1))
  353. LEFT JOIN (
  354. SELECT *
  355. FROM (
  356. SELECT videoid
  357. ,COALESCE(first_labels,"unknown") AS first_labels
  358. ,COALESCE(secondary_labels,"unknown") AS secondary_labels
  359. ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY first_labels DESC ) AS rn
  360. FROM loghubods.vid_festive_labels
  361. )
  362. WHERE rn = 1
  363. ) tf
  364. ON ta.id = tf.videoid
  365. LEFT JOIN (
  366. SELECT *
  367. FROM t_video_merge_cate
  368. ) tg ON ta.id = tg.vid
  369. left join t_video_merge_cate_total th
  370. ON ta.id = th.vid
  371. left join (SELECT DISTINCT video_id
  372. ,GET_JSON_OBJECT(attribute,'$.province') as province
  373. FROM (
  374. SELECT DISTINCT video_id
  375. ,attribute
  376. ,ROW_NUMBER() OVER (PARTITION BY video_id ORDER BY create_time ASC ) AS rank
  377. FROM videoods.flow_pool_level_video
  378. WHERE life_cycle_id IS NOT NULL
  379. )
  380. WHERE rank = 1
  381. and attribute is not null ) ti
  382. on ta.id = ti.video_id
  383. WHERE id IS NOT NULL
  384. AND ta.title IS NOT NULL
  385. AND ta.total_time IS NOT NULL
  386. AND (
  387. recommend_status = -6
  388. OR
  389. gmt_create_timestamp>(UNIX_TIMESTAMP(DATETIME(CURRENT_TIMESTAMP())) - 7 * 24 * 3600) * 1000
  390. )
  391. ORDER BY CAST(id AS BIGINT) DESC
  392. ;