| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398 |
- -- Task: 01_视频基础信息_20241223 ID: 1017892287 Type: ODPS_SQL
- --@exclude_input=loghubods.operators_channel_spider_dt
- --@exclude_input=loghubods.operators_channel_dt
- --@exclude_input=videoods.dim_video
- --@exclude_input=loghubods.content_ai_tags_no_dt
- --@exclude_input=loghubods.tag_level_2_base
- --@exclude_input=loghubods.vid_festive_labels
- --@exclude_input=loghubods.operators_channel_spider_day
- --@exclude_input=loghubods.aitags_repeatdata
- --@exclude_input=loghubods.operators_channel_day
- --odps sql
- --********************************************************************--
- --author:于卓异
- --create time:2024-06-11 16:47:23
- --********************************************************************--
- CREATE TABLE IF NOT EXISTS loghubods.alg_vid_feature_basic_info
- (
- vid STRING
- ,feature JSON
- )
- PARTITIONED BY
- (
- dt STRING COMMENT '日期:20240105'
- ,hh STRING COMMENT '小时:04'
- )
- STORED AS ALIORC
- TBLPROPERTIES ('comment' = '视频基础特征')
- LIFECYCLE 90
- ;
- SET odps.sql.python.version = cp37
- ;
- INSERT OVERWRITE TABLE loghubods.alg_vid_feature_basic_info PARTITION (dt = '${dt}',hh = '${hh}')
- WITH t_video_merge_cate AS (
- SELECT a.vid As vid
- ,merge_second_level_cate
- ,CASE
- -- 音乐相关
- WHEN merge_second_level_cate REGEXP '祝福音乐|人生感悟音乐|民族异域音乐|亲情音乐|红歌老歌|音乐知识' THEN '音乐'
- -- 剧情/剧情演绎相关
- WHEN merge_second_level_cate REGEXP '正能量剧情|对口型表演|快闪' THEN '剧情/剧情演绎'
- -- 游戏相关
- WHEN merge_second_level_cate REGEXP '拟真游戏|麻将|棋牌' THEN '游戏'
- -- 随拍/颜值相关
- WHEN merge_second_level_cate REGEXP '老年审美美女|老年审美帅哥' THEN '随拍/颜值'
- -- 舞蹈相关
- WHEN merge_second_level_cate REGEXP '红歌老歌舞蹈|广场舞|舞蹈教程' THEN '舞蹈'
- -- 动物/萌宠相关
- WHEN merge_second_level_cate REGEXP '宠物日常|动物表演|生动物' THEN '动物/萌宠'
- -- 三农相关
- WHEN merge_second_level_cate REGEXP '农村生活|农业技术' THEN '三农'
- -- 科技/科技数码相关
- WHEN merge_second_level_cate REGEXP '老年相关科技|未来科幻|国家科技力量' THEN '科技/科技数码'
- -- 财经相关
- WHEN merge_second_level_cate REGEXP '保险|理财' THEN '财经'
- -- 母婴/母婴亲子相关
- WHEN merge_second_level_cate REGEXP '亲子日常|K12教育' THEN '母婴/母婴亲子'
- -- 法律/科普/人文社科相关
- WHEN merge_second_level_cate REGEXP '老年相关法律科普|知识科普|生活技巧科普' THEN '法律/科普/人文社科'
- -- 情感/情感心理相关
- WHEN merge_second_level_cate REGEXP '怀念时光|人生忠告|迷信祝福|节日祝福|早中晚好' THEN '情感/情感心理'
- -- 职场/人文社科相关
- WHEN merge_second_level_cate REGEXP '退休前|退休后' THEN '职场/人文社科'
- -- 教育/教育培训相关
- WHEN merge_second_level_cate REGEXP '益智解密|老年教育' THEN '教育/教育培训'
- -- 摄影摄像相关
- WHEN merge_second_level_cate REGEXP '风景实拍|动植物实拍|人像模特实拍|摄影教学' THEN '摄影摄像'
- -- 艺术/才艺技能相关
- WHEN merge_second_level_cate REGEXP '名画赏析|杂技柔术|魔术|魔术特效|书法|绘画|木工|口技|大型集体艺术|戏曲戏剧|二人转|其他才艺' THEN '艺术/才艺技能'
- -- 美食相关
- WHEN merge_second_level_cate REGEXP '美食测评|美食教程|吃播探店' THEN '美食'
- -- 旅行/旅游相关
- WHEN merge_second_level_cate REGEXP '旅行记录|旅行攻略' THEN '旅行/旅游'
- -- 地域本地相关
- WHEN merge_second_level_cate REGEXP '省份城市亮点|本地新闻|本地生活' THEN '地域本地'
- -- 时尚/美妆相关
- WHEN merge_second_level_cate REGEXP '老年时尚|美妆护肤穿搭' THEN '时尚/美妆'
- -- 文化/人文社科相关
- WHEN merge_second_level_cate REGEXP '传统文化|国际文化' THEN '文化/人文社科'
- -- 搞笑/休闲娱乐相关
- WHEN merge_second_level_cate REGEXP '搞笑瞬间合集|搞笑段子' THEN '搞笑/休闲娱乐'
- -- 明星/名人相关
- WHEN merge_second_level_cate REGEXP '历史名人|当代正能量人物|老明星' THEN '明星/名人'
- -- 综艺/影视综艺相关
- WHEN merge_second_level_cate REGEXP '老年人上综艺|老年关心纪录片|老综艺影像|电影切片|电影解说|电视剧切片|电视剧解说' THEN '综艺/影视综艺'
- -- 体育/运动相关
- WHEN merge_second_level_cate REGEXP '中国队比赛|老年运动' THEN '体育/运动'
- -- 医疗健康/长寿/健身相关
- WHEN merge_second_level_cate REGEXP '健康知识|长寿知识|饮食健康' THEN '医疗健康/长寿/健身'
- -- 生活记录/生活相关
- WHEN merge_second_level_cate REGEXP '健身操' THEN '生活记录/生活'
- -- 生活家居/家居家装相关
- WHEN merge_second_level_cate REGEXP '老年生活|生活小妙招|园艺花艺' THEN '生活家居/家居家装'
- -- 时政社会相关
- WHEN merge_second_level_cate REGEXP '民生政策|流行病疫情|社会风气|食品安全|贪污腐败|人财诈骗|核污染|惠民新闻|天气变化|国家力量|国际时政|他国政策' THEN '时政社会'
- -- 奇人异象相关
- WHEN merge_second_level_cate REGEXP '惊奇事件|罕见画面' THEN '奇人异象'
- -- 历史相关
- WHEN merge_second_level_cate REGEXP '中国战争史|中国党史|中国历史影像' THEN '历史'
- -- 军事相关
- WHEN merge_second_level_cate REGEXP '国际军事|国内军事|国家统一' THEN '军事'
- ELSE b.first_level_cate
- END AS merge_first_level_cate
- FROM (
- SELECT DISTINCT a.id AS vid
- ,CASE WHEN b.tag_level_2 IS NOT NULL THEN b.tag_level_2
- ELSE c.tag_name_1
- END AS merge_second_level_cate
- FROM (
- SELECT DISTINCT id
- ,CONCAT(clear_title_signal(title),'-',total_time) AS title_duration
- FROM videoods.wx_video_per1h
- ) a
- LEFT JOIN (
- SELECT *
- FROM loghubods.tag_level_2_base
- ) b
- ON a.title_duration = b.title_duration
- LEFT JOIN (
- SELECT DISTINCT title_duration
- ,tags
- ,tag_name_1
- ,tag_name_2
- ,tag_name_3
- FROM loghubods.content_ai_tags_no_dt
- ) c
- ON a.title_duration = c.title_duration
- ) a
- LEFT JOIN (
- SELECT DISTINCT title_duration
- ,一级品类 AS first_level_cate
- ,videoid
- FROM (
- SELECT CONCAT(CLEAR_TITLE_SIGNAL(title),'-',total_time) AS title_duration
- ,一级品类
- ,videoid
- ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAR_TITLE_SIGNAL(title),'-',total_time) ORDER BY videoid DESC ) AS rank
- FROM (
- SELECT DISTINCT videoid
- ,b.title
- ,b.total_time
- ,SUBSTRING_INDEX(words_1,'_',-1) AS 一级品类
- FROM (
- SELECT videoid
- ,words_1
- FROM (
- SELECT videoid
- ,words_1
- FROM videoods.dim_video
- LATERAL VIEW EXPLODE(SPLIT(tags,',')) t AS words_1
- )
- WHERE words_1 REGEXP '一级品类_'
- ) a
- LEFT JOIN videoods.wx_video_per1h b
- ON a.videoid = b.id
- )
- )
- WHERE rank = 1
- HAVING title_duration NOT REGEXP 'None-|null-'
- ) b
- ON a.vid = b.videoid
- )
- ,t_video_merge_cate_total as
- (SELECT vid
- ,CONCAT_WS(',',topic,topic1,topic2,topic3) as ad
- FROM (
- SELECT id AS vid
- ,CASE WHEN te.topic REGEXP '旅行|旅游'
- OR te.keywords REGEXP '旅行|旅游'
- OR tg.merge_first_level_cate REGEXP '旅行|旅游'
- OR tg.merge_second_level_cate REGEXP '旅行|旅游'
- OR te.cate2 REGEXP '旅行|旅游'
- OR te.cate2_list REGEXP '旅行|旅游'
- OR te.cate1_list REGEXP '旅行|旅游' THEN '旅游'
- ELSE '其他'
- END AS topic
- ,'其他' AS topic1
- ,'其他' AS topic2
- ,'其他' AS topic3
- FROM videoods.wx_video_per1h ta
- LEFT JOIN (
- SELECT *
- FROM (
- SELECT video_id
- ,video_title
- ,title_duration
- ,IF(视频一级分类 IS NULL,"unknown",GET_CATE1(视频一级分类)) AS cate1_list -- 目前只有一个
- ,IF(二级分类 IS NULL,"unknown",GET_CATE2_ALL(二级分类)) AS cate2_list
- ,IF(tag_1 IS NULL,"unknown",GET_CATE2_ONLY(tag_1)) AS cate2
- ,COALESCE(内容选题,"") AS topic -- 几个字的话题总结
- ,COALESCE(视频主题,"") AS theme -- 一小段话的主题总结
- ,IF(视频关键词 IS NULL,"",REGEXP_REPLACE(视频关键词,'[\\[\\]\"]','')) AS keywords -- 视频关键词
- ,IF(视频风格 IS NULL,"",REGEXP_REPLACE(视频风格,',',',')) AS style -- 视频风格
- -- ,情感倾向,是否有片尾引导,传播性判断,推测观众年龄段,推测观众性别,推测观众价值类型,推测观众用观众收入
- ,IF(推测观众用户价值点 IS NULL,"",REPLACE(REGEXP_REPLACE(推测观众用户价值点,'[\\[\\]\"]',''),",",",")) AS user_value -- 推测观众用户价值点
- ,CASE WHEN 时效性_有无时效 IS NULL THEN "0"
- WHEN 时效性_有无时效 = "有" THEN "1"
- WHEN 时效性_有无时效 = "无" THEN "0"
- ELSE "0"
- END AS if_timeliness -- 时效性_有无时效
- ,IF(时效性_具体时间 IS NULL,"",REGEXP_REPLACE(时效性_具体时间,',',',')) AS timeliness
- ,是否节日视频 AS is_fes
- ,IF(是否节日视频 = '是',节日名称,NULL) AS fes_name
- ,IF(是否节日视频 = '是',具体时效,NULL) AS fes_time
- ,ARRAY_JOIN(
- SLICE(
- ARRAY_DISTINCT(
- REGEXP_EXTRACT_ALL(推测观众地域,'上海|云南|北京|印度|台湾|吉林|四川|天津|宁夏|安徽|山东|山西|巴西|广东|广西|德国|新疆|日本|朝鲜|江苏|江西|河北|河南|法国|泰国|浙江|海南|湖北|湖南|澳门|甘肃|福建|缅甸|美国|英国|西藏|贵州|越南|辽宁|迪拜|重庆|陕西|青海|韩国|香港|俄罗斯|内蒙古|加拿大|新加坡|菲律宾|黑龙江|巴基斯坦|澳大利亚|马来西亚',0)
- )
- ,1,5)
- ,',') AS video_region
- ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAN_TEXT(video_title),"-",SUBSTRING_INDEX(title_duration,"-",-1)) ORDER BY dt DESC ) AS rank
- FROM loghubods.aitags_repeatdata
- WHERE dt >= "${dt-1}"
- )
- WHERE rank = 1
- ) te
- ON CONCAT(CLEAN_TEXT(ta.title),"-",CAST(ta.total_time AS STRING)) = CONCAT(CLEAN_TEXT(te.video_title),"-",SUBSTRING_INDEX(te.title_duration,"-",-1))
- LEFT JOIN (
- SELECT *
- FROM t_video_merge_cate
- ) tg
- ON ta.id = tg.vid
- WHERE id IS NOT NULL
- AND ta.title IS NOT NULL
- AND ta.total_time IS NOT NULL
- AND (
- recommend_status = -6
- OR gmt_create_timestamp > (
- UNIX_TIMESTAMP(DATETIME(CURRENT_TIMESTAMP())) - 7 * 24 * 3600
- ) * 1000
- )
- ORDER BY CAST(id AS BIGINT) DESC
- ) )
- SELECT id AS vid
- ,CASE WHEN te.rank IS NOT NULL THEN JSON_OBJECT(
- "title_time_w_h_unionid", CONCAT(CLEAN_TEXT(COALESCE(ta.title,"")),'-',COALESCE(ta.total_time,"0"),'-',COALESCE(ta.width,"0"),'-',COALESCE(ta.height,"0")),
- "title",title,
- "title_split",FENCI_STR2STR(title,3),
- "width",width,
- "height",height,
- "gmt_create_timestamp",gmt_create_timestamp,
- "size",size,
- "recommend_status",recommend_status,
- "total_time",total_time,
- "bit_rate",bit_rate,
- "vid_source",COALESCE(tb.vid_source,"unknown"),
- "channel",COALESCE(tc.channel,"unknown"),
- "cate1_list",te.cate1_list,
- "cate2_list",te.cate2_list,
- "cate2",te.cate2,
- "topic",te.topic,
- "theme",te.theme,
- "keywords",te.keywords,
- "style",te.style,
- "user_value",te.user_value,
- "if_timeliness",te.if_timeliness,
- "timeliness",te.timeliness,
- "festive_label1",TRIM(tf.first_labels),
- "festive_label2",TRIM(tf.secondary_labels),
- "merge_second_level_cate",TRIM(tg.merge_second_level_cate),
- "merge_first_level_cate",TRIM(tg.merge_first_level_cate),
- "is_fes", te.is_fes,
- "fes_time", te.fes_time,
- "fes_name",te.fes_name,
- "uid",ta.uid,
- "video_region",te.video_region,
- "ad",th.ad,
- "attribute_province",ti.province
- )
- ELSE JSON_OBJECT(
- "title_time_w_h_unionid",CONCAT(CLEAN_TEXT(COALESCE(ta.title,"")),'-',COALESCE(ta.total_time,"0"),'-',COALESCE(ta.width,"0"),'-',COALESCE(ta.height,"0")),
- "title",title,
- "title_split",FENCI_STR2STR(title,3),
- "width",width,
- "height",height,
- "gmt_create_timestamp",gmt_create_timestamp,
- "size",size,
- "recommend_status",recommend_status,
- "total_time",total_time,
- "bit_rate",bit_rate,
- "vid_source",COALESCE(tb.vid_source,"unknown"),
- "channel",COALESCE(tc.channel,"unknown"),
- "festive_label1",TRIM(tf.first_labels),
- "festive_label2",TRIM(tf.secondary_labels),
- "merge_second_level_cate",TRIM(tg.merge_second_level_cate),
- "merge_first_level_cate",TRIM(tg.merge_first_level_cate),
- "is_fes", te.is_fes,
- "fes_time", te.fes_time,
- "fes_name", te.fes_name,
- "uid",ta.uid,
- "video_region",te.video_region,
- "ad",th.ad,
- "attribute_province",ti.province
- )
- END AS feature
- FROM videoods.wx_video_per1h ta
- LEFT JOIN (
- SELECT *
- FROM (
- SELECT videoid AS vid
- ,type AS vid_source
- ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY type DESC ) AS rn
- FROM loghubods.operators_channel_dt
- WHERE dt = MAX_PT("loghubods.operators_channel_dt")
- )
- WHERE rn = 1
- ) tb
- ON ta.id = tb.vid
- LEFT JOIN (
- SELECT *
- FROM (
- SELECT videoid AS vid
- ,channel AS channel
- ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY channel ) AS rn
- FROM loghubods.operators_channel_spider_dt
- WHERE dt = MAX_PT("loghubods.operators_channel_spider_dt")
- )
- WHERE rn = 1
- ) tc
- ON ta.id = tc.vid
- LEFT JOIN (
- SELECT *,CONCAT(cate1_list,cate2_list,cate2,topic,keywords) as cate_total
- FROM (
- SELECT video_id
- ,video_title
- ,title_duration
- ,IF(视频一级分类 IS NULL,"unknown",GET_CATE1(视频一级分类)) AS cate1_list -- 目前只有一个
- ,IF(二级分类 IS NULL,"unknown",GET_CATE2_ALL(二级分类)) AS cate2_list
- ,IF(tag_1 IS NULL,"unknown",GET_CATE2_ONLY(tag_1)) AS cate2
- ,COALESCE(内容选题,"") AS topic -- 几个字的话题总结
- ,COALESCE(视频主题,"") AS theme -- 一小段话的主题总结
- ,IF(视频关键词 IS NULL,"",REGEXP_REPLACE(视频关键词,'[\\[\\]\"]','')) AS keywords -- 视频关键词
- ,IF(视频风格 IS NULL,"",REGEXP_REPLACE(视频风格,',',',')) AS style -- 视频风格
- -- ,情感倾向,是否有片尾引导,传播性判断,推测观众年龄段,推测观众性别,推测观众价值类型,推测观众用观众收入
- ,IF(推测观众用户价值点 IS NULL,"",REPLACE(REGEXP_REPLACE(推测观众用户价值点,'[\\[\\]\"]',''),",",",")) AS user_value -- 推测观众用户价值点
- ,CASE WHEN 时效性_有无时效 IS NULL THEN "0"
- WHEN 时效性_有无时效 = "有" THEN "1"
- WHEN 时效性_有无时效 = "无" THEN "0"
- ELSE "0"
- END AS if_timeliness -- 时效性_有无时效
- ,IF(时效性_具体时间 IS NULL,"",REGEXP_REPLACE(时效性_具体时间,',',',')) AS timeliness
- ,是否节日视频 AS is_fes
- ,IF(是否节日视频 = '是',节日名称, NULL ) AS fes_name
- ,IF(是否节日视频 = '是',具体时效, NULL ) AS fes_time
- ,ARRAY_JOIN(SLICE(ARRAY_DISTINCT(REGEXP_EXTRACT_ALL(推测观众地域,'上海|云南|北京|印度|台湾|吉林|四川|天津|宁夏|安徽|山东|山西|巴西|广东|广西|德国|新疆|日本|朝鲜|江苏|江西|河北|河南|法国|泰国|浙江|海南|湖北|湖南|澳门|甘肃|福建|缅甸|美国|英国|西藏|贵州|越南|辽宁|迪拜|重庆|陕西|青海|韩国|香港|俄罗斯|内蒙古|加拿大|新加坡|菲律宾|黑龙江|巴基斯坦|澳大利亚|马来西亚',0)),1,5),',') AS video_region
- ,ROW_NUMBER() OVER (PARTITION BY CONCAT(CLEAN_TEXT(video_title),"-",SUBSTRING_INDEX(title_duration,"-",-1)) ORDER BY dt DESC ) AS rank
- FROM loghubods.aitags_repeatdata
- WHERE dt >= "${dt-1}"
- )
- WHERE rank = 1
- ) te
- ON CONCAT(CLEAN_TEXT(ta.title),"-",CAST(ta.total_time AS STRING)) = CONCAT(CLEAN_TEXT(te.video_title),"-",SUBSTRING_INDEX(te.title_duration,"-",-1))
- LEFT JOIN (
- SELECT *
- FROM (
- SELECT videoid
- ,COALESCE(first_labels,"unknown") AS first_labels
- ,COALESCE(secondary_labels,"unknown") AS secondary_labels
- ,ROW_NUMBER() OVER (PARTITION BY videoid ORDER BY first_labels DESC ) AS rn
- FROM loghubods.vid_festive_labels
- )
- WHERE rn = 1
- ) tf
- ON ta.id = tf.videoid
- LEFT JOIN (
- SELECT *
- FROM t_video_merge_cate
- ) tg ON ta.id = tg.vid
- left join t_video_merge_cate_total th
- ON ta.id = th.vid
- left join (SELECT DISTINCT video_id
- ,GET_JSON_OBJECT(attribute,'$.province') as province
- FROM (
- SELECT DISTINCT video_id
- ,attribute
- ,ROW_NUMBER() OVER (PARTITION BY video_id ORDER BY create_time ASC ) AS rank
- FROM videoods.flow_pool_level_video
- WHERE life_cycle_id IS NOT NULL
- )
- WHERE rank = 1
- and attribute is not null ) ti
- on ta.id = ti.video_id
- WHERE id IS NOT NULL
- AND ta.title IS NOT NULL
- AND ta.total_time IS NOT NULL
- AND (
- recommend_status = -6
- OR
- gmt_create_timestamp>(UNIX_TIMESTAMP(DATETIME(CURRENT_TIMESTAMP())) - 7 * 24 * 3600) * 1000
- )
- ORDER BY CAST(id AS BIGINT) DESC
- ;
|