-- Task: 01_视频基础信息_20241223 ID: 1017892287 Type: ODPS_SQL --@exclude_input=loghubods.operators_channel_spider_dt --@exclude_input=loghubods.operators_channel_dt --@exclude_input=videoods.dim_video --@exclude_input=loghubods.content_ai_tags_no_dt --@exclude_input=loghubods.tag_level_2_base --@exclude_input=loghubods.vid_festive_labels --@exclude_input=loghubods.operators_channel_spider_day --@exclude_input=loghubods.aitags_repeatdata --@exclude_input=loghubods.operators_channel_day --odps sql --********************************************************************-- --author:于卓异 --create time:2024-06-11 16:47:23 --********************************************************************-- CREATE TABLE IF NOT EXISTS loghubods.alg_vid_feature_basic_info ( vid STRING ,feature JSON ) PARTITIONED BY ( dt STRING COMMENT '日期:20240105' ,hh STRING COMMENT '小时:04' ) STORED AS ALIORC TBLPROPERTIES ('comment' = '视频基础特征') LIFECYCLE 90 ; SET odps.sql.python.version = cp37 ; INSERT OVERWRITE TABLE loghubods.alg_vid_feature_basic_info PARTITION (dt = '${dt}',hh = '${hh}') WITH t_video_merge_cate AS ( SELECT a.vid As vid ,merge_second_level_cate ,CASE -- 音乐相关 WHEN merge_second_level_cate REGEXP '祝福音乐|人生感悟音乐|民族异域音乐|亲情音乐|红歌老歌|音乐知识' THEN '音乐' -- 剧情/剧情演绎相关 WHEN merge_second_level_cate REGEXP '正能量剧情|对口型表演|快闪' THEN '剧情/剧情演绎' -- 游戏相关 WHEN merge_second_level_cate REGEXP '拟真游戏|麻将|棋牌' THEN '游戏' -- 随拍/颜值相关 WHEN merge_second_level_cate REGEXP '老年审美美女|老年审美帅哥' THEN '随拍/颜值' -- 舞蹈相关 WHEN merge_second_level_cate REGEXP '红歌老歌舞蹈|广场舞|舞蹈教程' THEN '舞蹈' -- 动物/萌宠相关 WHEN merge_second_level_cate REGEXP '宠物日常|动物表演|生动物' THEN '动物/萌宠' -- 三农相关 WHEN merge_second_level_cate REGEXP '农村生活|农业技术' THEN '三农' -- 科技/科技数码相关 WHEN merge_second_level_cate REGEXP '老年相关科技|未来科幻|国家科技力量' THEN '科技/科技数码' -- 财经相关 WHEN merge_second_level_cate REGEXP '保险|理财' THEN '财经' -- 母婴/母婴亲子相关 WHEN merge_second_level_cate REGEXP '亲子日常|K12教育' THEN '母婴/母婴亲子' -- 法律/科普/人文社科相关 WHEN merge_second_level_cate REGEXP '老年相关法律科普|知识科普|生活技巧科普' THEN '法律/科普/人文社科' -- 情感/情感心理相关 WHEN merge_second_level_cate REGEXP '怀念时光|人生忠告|迷信祝福|节日祝福|早中晚好' THEN '情感/情感心理' -- 职场/人文社科相关 WHEN merge_second_level_cate REGEXP '退休前|退休后' THEN '职场/人文社科' -- 教育/教育培训相关 WHEN merge_second_level_cate REGEXP '益智解密|老年教育' THEN '教育/教育培训' -- 摄影摄像相关 WHEN merge_second_level_cate REGEXP '风景实拍|动植物实拍|人像模特实拍|摄影教学' THEN '摄影摄像' -- 艺术/才艺技能相关 WHEN merge_second_level_cate REGEXP '名画赏析|杂技柔术|魔术|魔术特效|书法|绘画|木工|口技|大型集体艺术|戏曲戏剧|二人转|其他才艺' THEN '艺术/才艺技能' -- 美食相关 WHEN merge_second_level_cate REGEXP '美食测评|美食教程|吃播探店' THEN '美食' -- 旅行/旅游相关 WHEN merge_second_level_cate REGEXP '旅行记录|旅行攻略' THEN '旅行/旅游' -- 地域本地相关 WHEN merge_second_level_cate REGEXP '省份城市亮点|本地新闻|本地生活' THEN '地域本地' -- 时尚/美妆相关 WHEN merge_second_level_cate REGEXP '老年时尚|美妆护肤穿搭' THEN '时尚/美妆' -- 文化/人文社科相关 WHEN merge_second_level_cate REGEXP '传统文化|国际文化' THEN '文化/人文社科' -- 搞笑/休闲娱乐相关 WHEN merge_second_level_cate REGEXP '搞笑瞬间合集|搞笑段子' THEN '搞笑/休闲娱乐' -- 明星/名人相关 WHEN merge_second_level_cate REGEXP '历史名人|当代正能量人物|老明星' THEN '明星/名人' -- 综艺/影视综艺相关 WHEN merge_second_level_cate REGEXP '老年人上综艺|老年关心纪录片|老综艺影像|电影切片|电影解说|电视剧切片|电视剧解说' THEN '综艺/影视综艺' -- 体育/运动相关 WHEN merge_second_level_cate REGEXP '中国队比赛|老年运动' THEN '体育/运动' -- 医疗健康/长寿/健身相关 WHEN merge_second_level_cate REGEXP '健康知识|长寿知识|饮食健康' THEN '医疗健康/长寿/健身' -- 生活记录/生活相关 WHEN merge_second_level_cate REGEXP '健身操' THEN '生活记录/生活' -- 生活家居/家居家装相关 WHEN merge_second_level_cate REGEXP '老年生活|生活小妙招|园艺花艺' THEN '生活家居/家居家装' -- 时政社会相关 WHEN merge_second_level_cate REGEXP '民生政策|流行病疫情|社会风气|食品安全|贪污腐败|人财诈骗|核污染|惠民新闻|天气变化|国家力量|国际时政|他国政策' THEN '时政社会' -- 奇人异象相关 WHEN merge_second_level_cate REGEXP '惊奇事件|罕见画面' THEN '奇人异象' -- 历史相关 WHEN merge_second_level_cate REGEXP '中国战争史|中国党史|中国历史影像' THEN '历史' -- 军事相关 WHEN merge_second_level_cate REGEXP '国际军事|国内军事|国家统一' THEN '军事' ELSE b.first_level_cate END AS merge_first_level_cate FROM ( SELECT DISTINCT a.id AS vid ,CASE WHEN b.tag_level_2 IS NOT NULL THEN b.tag_level_2 ELSE c.tag_name_1 END AS merge_second_level_cate FROM ( SELECT DISTINCT id ,CONCAT(clear_title_signal(title),'-',total_time) AS title_duration FROM videoods.wx_video_per1h ) a LEFT JOIN ( SELECT * FROM loghubods.tag_level_2_base ) b ON a.title_duration = b.title_duration LEFT JOIN ( SELECT DISTINCT title_duration ,tags ,tag_name_1 ,tag_name_2 ,tag_name_3 FROM loghubods.content_ai_tags_no_dt ) c ON a.title_duration = c.title_duration ) a LEFT JOIN ( SELECT DISTINCT title_duration ,一级品类 AS first_level_cate ,videoid FROM ( SELECT CONCAT(CLEAR_TITLE_SIGNAL(title),'-',total_time) AS title_duration ,一级品类 ,videoid ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAR_TITLE_SIGNAL(title),'-',total_time) ORDER BY videoid DESC ) AS rank FROM ( SELECT DISTINCT videoid ,b.title ,b.total_time ,SUBSTRING_INDEX(words_1,'_',-1) AS 一级品类 FROM ( SELECT videoid ,words_1 FROM ( SELECT videoid ,words_1 FROM videoods.dim_video LATERAL VIEW EXPLODE(SPLIT(tags,',')) t AS words_1 ) WHERE words_1 REGEXP '一级品类_' ) a LEFT JOIN videoods.wx_video_per1h b ON a.videoid = b.id ) ) WHERE rank = 1 HAVING title_duration NOT REGEXP 'None-|null-' ) b ON a.vid = b.videoid ) ,t_video_merge_cate_total as (SELECT vid ,CONCAT_WS(',',topic,topic1,topic2,topic3) as ad FROM ( SELECT id AS vid ,CASE WHEN te.topic REGEXP '旅行|旅游' OR te.keywords REGEXP '旅行|旅游' OR tg.merge_first_level_cate REGEXP '旅行|旅游' OR tg.merge_second_level_cate REGEXP '旅行|旅游' OR te.cate2 REGEXP '旅行|旅游' OR te.cate2_list REGEXP '旅行|旅游' OR te.cate1_list REGEXP '旅行|旅游' THEN '旅游' ELSE '其他' END AS topic ,'其他' AS topic1 ,'其他' AS topic2 ,'其他' AS topic3 FROM videoods.wx_video_per1h ta LEFT JOIN ( SELECT * FROM ( SELECT video_id ,video_title ,title_duration ,IF(视频一级分类 IS NULL,"unknown",GET_CATE1(视频一级分类)) AS cate1_list -- 目前只有一个 ,IF(二级分类 IS NULL,"unknown",GET_CATE2_ALL(二级分类)) AS cate2_list ,IF(tag_1 IS NULL,"unknown",GET_CATE2_ONLY(tag_1)) AS cate2 ,COALESCE(内容选题,"") AS topic -- 几个字的话题总结 ,COALESCE(视频主题,"") AS theme -- 一小段话的主题总结 ,IF(视频关键词 IS NULL,"",REGEXP_REPLACE(视频关键词,'[\\[\\]\"]','')) AS keywords -- 视频关键词 ,IF(视频风格 IS NULL,"",REGEXP_REPLACE(视频风格,',',',')) AS style -- 视频风格 -- ,情感倾向,是否有片尾引导,传播性判断,推测观众年龄段,推测观众性别,推测观众价值类型,推测观众用观众收入 ,IF(推测观众用户价值点 IS NULL,"",REPLACE(REGEXP_REPLACE(推测观众用户价值点,'[\\[\\]\"]',''),",",",")) AS user_value -- 推测观众用户价值点 ,CASE WHEN 时效性_有无时效 IS NULL THEN "0" WHEN 时效性_有无时效 = "有" THEN "1" WHEN 时效性_有无时效 = "无" THEN "0" ELSE "0" END AS if_timeliness -- 时效性_有无时效 ,IF(时效性_具体时间 IS NULL,"",REGEXP_REPLACE(时效性_具体时间,',',',')) AS timeliness ,是否节日视频 AS is_fes ,IF(是否节日视频 = '是',节日名称,NULL) AS fes_name ,IF(是否节日视频 = '是',具体时效,NULL) AS fes_time ,ARRAY_JOIN( SLICE( ARRAY_DISTINCT( REGEXP_EXTRACT_ALL(推测观众地域,'上海|云南|北京|印度|台湾|吉林|四川|天津|宁夏|安徽|山东|山西|巴西|广东|广西|德国|新疆|日本|朝鲜|江苏|江西|河北|河南|法国|泰国|浙江|海南|湖北|湖南|澳门|甘肃|福建|缅甸|美国|英国|西藏|贵州|越南|辽宁|迪拜|重庆|陕西|青海|韩国|香港|俄罗斯|内蒙古|加拿大|新加坡|菲律宾|黑龙江|巴基斯坦|澳大利亚|马来西亚',0) ) ,1,5) ,',') AS video_region ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAN_TEXT(video_title),"-",SUBSTRING_INDEX(title_duration,"-",-1)) ORDER BY dt DESC ) AS rank FROM loghubods.aitags_repeatdata WHERE dt >= "${dt-1}" ) WHERE rank = 1 ) te ON CONCAT(CLEAN_TEXT(ta.title),"-",CAST(ta.total_time AS STRING)) = CONCAT(CLEAN_TEXT(te.video_title),"-",SUBSTRING_INDEX(te.title_duration,"-",-1)) LEFT JOIN ( SELECT * FROM t_video_merge_cate ) tg ON ta.id = tg.vid WHERE id IS NOT NULL AND ta.title IS NOT NULL AND ta.total_time IS NOT NULL AND ( recommend_status = -6 OR gmt_create_timestamp > ( UNIX_TIMESTAMP(DATETIME(CURRENT_TIMESTAMP())) - 7 * 24 * 3600 ) * 1000 ) ORDER BY CAST(id AS BIGINT) DESC ) ) SELECT id AS vid ,CASE WHEN te.rank IS NOT NULL THEN JSON_OBJECT( "title_time_w_h_unionid", CONCAT(CLEAN_TEXT(COALESCE(ta.title,"")),'-',COALESCE(ta.total_time,"0"),'-',COALESCE(ta.width,"0"),'-',COALESCE(ta.height,"0")), "title",title, "title_split",FENCI_STR2STR(title,3), "width",width, "height",height, "gmt_create_timestamp",gmt_create_timestamp, "size",size, "recommend_status",recommend_status, "total_time",total_time, "bit_rate",bit_rate, "vid_source",COALESCE(tb.vid_source,"unknown"), "channel",COALESCE(tc.channel,"unknown"), "cate1_list",te.cate1_list, "cate2_list",te.cate2_list, "cate2",te.cate2, "topic",te.topic, "theme",te.theme, "keywords",te.keywords, "style",te.style, "user_value",te.user_value, "if_timeliness",te.if_timeliness, "timeliness",te.timeliness, "festive_label1",TRIM(tf.first_labels), "festive_label2",TRIM(tf.secondary_labels), "merge_second_level_cate",TRIM(tg.merge_second_level_cate), "merge_first_level_cate",TRIM(tg.merge_first_level_cate), "is_fes", te.is_fes, "fes_time", te.fes_time, "fes_name",te.fes_name, "uid",ta.uid, "video_region",te.video_region, "ad",th.ad, "attribute_province",ti.province ) ELSE JSON_OBJECT( "title_time_w_h_unionid",CONCAT(CLEAN_TEXT(COALESCE(ta.title,"")),'-',COALESCE(ta.total_time,"0"),'-',COALESCE(ta.width,"0"),'-',COALESCE(ta.height,"0")), "title",title, "title_split",FENCI_STR2STR(title,3), "width",width, "height",height, "gmt_create_timestamp",gmt_create_timestamp, "size",size, "recommend_status",recommend_status, "total_time",total_time, "bit_rate",bit_rate, "vid_source",COALESCE(tb.vid_source,"unknown"), "channel",COALESCE(tc.channel,"unknown"), "festive_label1",TRIM(tf.first_labels), "festive_label2",TRIM(tf.secondary_labels), "merge_second_level_cate",TRIM(tg.merge_second_level_cate), "merge_first_level_cate",TRIM(tg.merge_first_level_cate), "is_fes", te.is_fes, "fes_time", te.fes_time, "fes_name", te.fes_name, "uid",ta.uid, "video_region",te.video_region, "ad",th.ad, "attribute_province",ti.province ) END AS feature FROM videoods.wx_video_per1h ta LEFT JOIN ( SELECT * FROM ( SELECT videoid AS vid ,type AS vid_source ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY type DESC ) AS rn FROM loghubods.operators_channel_dt WHERE dt = MAX_PT("loghubods.operators_channel_dt") ) WHERE rn = 1 ) tb ON ta.id = tb.vid LEFT JOIN ( SELECT * FROM ( SELECT videoid AS vid ,channel AS channel ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY channel ) AS rn FROM loghubods.operators_channel_spider_dt WHERE dt = MAX_PT("loghubods.operators_channel_spider_dt") ) WHERE rn = 1 ) tc ON ta.id = tc.vid LEFT JOIN ( SELECT *,CONCAT(cate1_list,cate2_list,cate2,topic,keywords) as cate_total FROM ( SELECT video_id ,video_title ,title_duration ,IF(视频一级分类 IS NULL,"unknown",GET_CATE1(视频一级分类)) AS cate1_list -- 目前只有一个 ,IF(二级分类 IS NULL,"unknown",GET_CATE2_ALL(二级分类)) AS cate2_list ,IF(tag_1 IS NULL,"unknown",GET_CATE2_ONLY(tag_1)) AS cate2 ,COALESCE(内容选题,"") AS topic -- 几个字的话题总结 ,COALESCE(视频主题,"") AS theme -- 一小段话的主题总结 ,IF(视频关键词 IS NULL,"",REGEXP_REPLACE(视频关键词,'[\\[\\]\"]','')) AS keywords -- 视频关键词 ,IF(视频风格 IS NULL,"",REGEXP_REPLACE(视频风格,',',',')) AS style -- 视频风格 -- ,情感倾向,是否有片尾引导,传播性判断,推测观众年龄段,推测观众性别,推测观众价值类型,推测观众用观众收入 ,IF(推测观众用户价值点 IS NULL,"",REPLACE(REGEXP_REPLACE(推测观众用户价值点,'[\\[\\]\"]',''),",",",")) AS user_value -- 推测观众用户价值点 ,CASE WHEN 时效性_有无时效 IS NULL THEN "0" WHEN 时效性_有无时效 = "有" THEN "1" WHEN 时效性_有无时效 = "无" THEN "0" ELSE "0" END AS if_timeliness -- 时效性_有无时效 ,IF(时效性_具体时间 IS NULL,"",REGEXP_REPLACE(时效性_具体时间,',',',')) AS timeliness ,是否节日视频 AS is_fes ,IF(是否节日视频 = '是',节日名称, NULL ) AS fes_name ,IF(是否节日视频 = '是',具体时效, NULL ) AS fes_time ,ARRAY_JOIN(SLICE(ARRAY_DISTINCT(REGEXP_EXTRACT_ALL(推测观众地域,'上海|云南|北京|印度|台湾|吉林|四川|天津|宁夏|安徽|山东|山西|巴西|广东|广西|德国|新疆|日本|朝鲜|江苏|江西|河北|河南|法国|泰国|浙江|海南|湖北|湖南|澳门|甘肃|福建|缅甸|美国|英国|西藏|贵州|越南|辽宁|迪拜|重庆|陕西|青海|韩国|香港|俄罗斯|内蒙古|加拿大|新加坡|菲律宾|黑龙江|巴基斯坦|澳大利亚|马来西亚',0)),1,5),',') AS video_region ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAN_TEXT(video_title),"-",SUBSTRING_INDEX(title_duration,"-",-1)) ORDER BY dt DESC ) AS rank FROM loghubods.aitags_repeatdata WHERE dt >= "${dt-1}" ) WHERE rank = 1 ) te ON CONCAT(CLEAN_TEXT(ta.title),"-",CAST(ta.total_time AS STRING)) = CONCAT(CLEAN_TEXT(te.video_title),"-",SUBSTRING_INDEX(te.title_duration,"-",-1)) LEFT JOIN ( SELECT * FROM ( SELECT videoid ,COALESCE(first_labels,"unknown") AS first_labels ,COALESCE(secondary_labels,"unknown") AS secondary_labels ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY first_labels DESC ) AS rn FROM loghubods.vid_festive_labels ) WHERE rn = 1 ) tf ON ta.id = tf.videoid LEFT JOIN ( SELECT * FROM t_video_merge_cate ) tg ON ta.id = tg.vid left join t_video_merge_cate_total th ON ta.id = th.vid left join (SELECT DISTINCT video_id ,GET_JSON_OBJECT(attribute,'$.province') as province FROM ( SELECT DISTINCT video_id ,attribute ,ROW_NUMBER() OVER (PARTITION BY video_id ORDER BY create_time ASC ) AS rank FROM videoods.flow_pool_level_video WHERE life_cycle_id IS NOT NULL ) WHERE rank = 1 and attribute is not null ) ti on ta.id = ti.video_id WHERE id IS NOT NULL AND ta.title IS NOT NULL AND ta.total_time IS NOT NULL AND ( recommend_status = -6 OR gmt_create_timestamp>(UNIX_TIMESTAMP(DATETIME(CURRENT_TIMESTAMP())) - 7 * 24 * 3600) * 1000 ) ORDER BY CAST(id AS BIGINT) DESC ;