Bläddra i källkod

feat: 新增 base_v5/v5_v1 尾号实验 SQL(追加模型预估/COPC/特征统计),更新 fetch_daily 及 feishu 模块

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
yangxiaohui 1 vecka sedan
förälder
incheckning
cf7454277a

+ 77 - 34
fetch_daily.py

@@ -460,56 +460,82 @@ def upload_to_feishu(csv_file, sheet_token, sheet_id=None, sort_spec="dt:desc",
         'Authorization': f'Bearer {access_token}'
     }
 
-    # 第1步:删除旧数据行(保留第1行表头 + 第2行样式模板),分批删除
-    if current_rows > 2:
-        print(f"清理旧数据({current_rows - 2}行)...")
-        rows_to_delete = current_rows - 2
+    # 判断是否有模板行(第2行)
+    has_template = current_rows >= 2
+    data_start = 3 if has_template else 2
+    keep_rows = 2 if has_template else 1
+
+    # 第1步:删除旧数据行(保留表头 + 模板行(如有)),分批删除
+    if current_rows > keep_rows:
+        rows_to_delete = current_rows - keep_rows
+        print(f"清理旧数据({rows_to_delete}行)...")
         delete_batch = 5000
         while rows_to_delete > 0:
-            # 每次从第3行开始删除,删除后行号会自动调整
             batch = min(rows_to_delete, delete_batch)
             try:
-                client.delete_rows(access_token, sheet_token, sheet_id, 3, 2 + batch)
+                client.delete_rows(access_token, sheet_token, sheet_id, data_start, data_start - 1 + batch)
                 rows_to_delete -= batch
                 if rows_to_delete > 0:
-                    print(f"  已删除 {current_rows - 2 - rows_to_delete}/{current_rows - 2}")
+                    print(f"  已删除 {current_rows - keep_rows - rows_to_delete}/{current_rows - keep_rows}")
             except Exception as e:
                 print(f"  清理失败: {e}")
                 break
 
-    # 第2步:扩展表格容量(insert 不会自动扩展)
-    # 删除后当前只有2行(表头+模板),需要扩展到 2 + total_rows 行
-    add_url = f"{LARK_HOST}/open-apis/sheets/v2/spreadsheets/{sheet_token}/dimension_range"
-    expand_batch = 5000
-    remaining = total_rows
-    expanded = 0
-    while remaining > 0:
-        chunk = min(remaining, expand_batch)
-        add_payload = {
-            "dimension": {
-                "sheetId": sheet_id,
-                "majorDimension": "ROWS",
-                "length": chunk
+    # 第2步:准备空行
+    if has_template:
+        # 有模板行:先扩展占位行(使 endIndex 不超过 sheetMaxRowCount),再 insert 继承样式
+        insert_batch = 5000
+        remaining = total_rows
+        inserted = 0
+        while remaining > 0:
+            chunk = min(remaining, insert_batch)
+            try:
+                # 先扩展占位行(dimension_range POST 无 endIndex 限制)
+                client.append_empty_rows(access_token, sheet_token, sheet_id, chunk)
+                # 再 insert 带样式的行(此时 sheet 行数已足够大)
+                client.insert_rows_before(access_token, sheet_token, sheet_id,
+                                          data_start + inserted, chunk,
+                                          inherit_style="BEFORE")
+                inserted += chunk
+                remaining -= chunk
+            except Exception as e:
+                print(f"  插入行失败(已插入{inserted}): {e}")
+                break
+        if inserted > 0:
+            print(f"插入行(继承模板样式): +{inserted} 行")
+    else:
+        # 无模板行:用 dimension_range POST 扩展(无样式继承)
+        add_url = f"{LARK_HOST}/open-apis/sheets/v2/spreadsheets/{sheet_token}/dimension_range"
+        expand_batch = 5000
+        remaining = total_rows
+        expanded = 0
+        while remaining > 0:
+            chunk = min(remaining, expand_batch)
+            add_payload = {
+                "dimension": {
+                    "sheetId": sheet_id,
+                    "majorDimension": "ROWS",
+                    "length": chunk
+                }
             }
-        }
-        try:
-            request("POST", add_url, headers, add_payload)
-            expanded += chunk
-            remaining -= chunk
-        except Exception as e:
-            print(f"  扩展容量失败(已扩展{expanded}): {e}")
-            break
-    if expanded > 0:
-        print(f"扩展容量: +{expanded} 行")
+            try:
+                request("POST", add_url, headers, add_payload)
+                expanded += chunk
+                remaining -= chunk
+            except Exception as e:
+                print(f"  扩展容量失败(已扩展{expanded}): {e}")
+                break
+        if expanded > 0:
+            print(f"扩展容量: +{expanded} 行")
 
-    # 第3步:分批写入数据到扩展的空行(不再 insert,避免 expand+insert 双重加行超 cell 上限)
+    # 第3步:分批写入数据
     print(f"写入 {total_rows} 行...")
     batches = [converted_rows[i:i + batch_size] for i in range(0, total_rows, batch_size)]
     processed = 0
 
     for i, batch in enumerate(batches):
         batch_count = len(batch)
-        start_row = 3 + i * batch_size  # 从第3行开始,顺序写入
+        start_row = data_start + i * batch_size
 
         # 写入数据(飞书单次最多100列,需按列分批)
         col_batch = 100
@@ -528,14 +554,31 @@ def upload_to_feishu(csv_file, sheet_token, sheet_id=None, sort_spec="dt:desc",
         processed += batch_count
         print(f"  处理: {processed}/{total_rows}")
 
-    # 第5步:删除模板行(第2行),仅当初始存在模板行时
-    if current_rows >= 2:
+    # 第4步:删除模板行(第2行),仅当有模板行时
+    if has_template:
         print(f"删除模板行...")
         try:
             client.delete_rows(access_token, sheet_token, sheet_id, 2, 2)
         except Exception as e:
             print(f"  删除模板行失败: {e}")
 
+    # 第5步:删除占位行(在数据行之后的多余空行),分批删除(每批≤5000行)
+    if has_template and total_rows > 0:
+        try:
+            sheet_props_final = client.get_sheet_properties(access_token, sheet_token, sheet_id)
+            if sheet_props_final and sheet_props_final['row_count'] > 1 + total_rows:
+                rows_to_clean = sheet_props_final['row_count'] - (1 + total_rows)
+                clean_start = 1 + total_rows + 1  # 表头(1) + 数据(total_rows) + 第一个占位行
+                print(f"清理占位行({rows_to_clean}行)...")
+                delete_batch = 5000
+                while rows_to_clean > 0:
+                    batch = min(rows_to_clean, delete_batch)
+                    client.delete_rows(access_token, sheet_token, sheet_id,
+                                       clean_start, clean_start - 1 + batch)
+                    rows_to_clean -= batch
+        except Exception as e:
+            print(f"  清理占位行失败: {e}")
+
     print(f"飞书上传完成: {sheet_token}")
 
 

+ 37 - 31
lib/feishu.py

@@ -549,19 +549,31 @@ class Client(object):
             print(f"插入数据到第{row}行失败: {e}")
             return None
 
-    def insert_rows_before(self, access_token, doctoken, sheetid, row_index, count=1):
+    def insert_rows_before(self, access_token, doctoken, sheetid, row_index, count=1, inherit_style="BEFORE"):
         """
-        在指定行前插入新行(基于飞书官方API)
-        
+        在指定行前插入新行(基于飞书 insert_dimension_range API)
+
+        注意: insert_dimension_range 要求 endIndex <= 当前 sheet 行数。
+        如果 sheet 行数不够(例如删除旧数据后只剩 2 行,却要插入 5000 行),
+        需先调用 append_empty_rows 扩展占位行,再调用本方法插入带样式的行。
+        典型用法:
+            client.append_empty_rows(token, doc, sheet, count)  # 先扩展
+            client.insert_rows_before(token, doc, sheet, row, count)  # 再插入(继承样式)
+            # 写完数据后删除多余占位行
+
         Args:
             access_token: 访问令牌
             doctoken: 表格token
             sheetid: 工作表ID
             row_index: 插入位置的行号(从1开始,在此行前插入)
             count: 插入行数(默认1行)
-            
+            inherit_style: 样式继承方向,"BEFORE"/"AFTER";传 None 时请求中不携带 inheritStyle 字段
+
         Returns:
             操作结果
+
+        Raises:
+            Exception: 插入失败时抛出,由调用方处理
         """
         # 先获取工作表信息,检查当前行数
         sheet_props = self.get_sheet_properties(access_token, doctoken, sheetid)
@@ -571,50 +583,40 @@ class Client(object):
         else:
             current_row_count = sheet_props['row_count']
             print(f"当前工作表行数: {current_row_count}")
-        
-        # 如果要插入的位置超过了当前行数,使用追加模式
-        if row_index > current_row_count:
-            print(f"插入位置({row_index})超过当前行数({current_row_count}),使用追加模式")
-            # 使用追加方式在末尾添加行
+
+        # 不能在不存在的行前面插入(允许在末尾+1位置插入);超出时改用 append_empty_rows 在末尾追加(不继承样式)
+        if row_index > current_row_count + 1:
+            print(f"插入位置({row_index})超过当前行数+1({current_row_count + 1}),使用追加模式")
             return self.append_empty_rows(access_token, doctoken, sheetid, count)
-        
+
         url = f"{self._host}/open-apis/sheets/v2/spreadsheets/{doctoken}/insert_dimension_range"
         headers = {
             'Content-Type': 'application/json; charset=utf-8',
             'Authorization': f'Bearer {access_token}'
         }
-        
+
         # 转换为0基索引:row_index=3表示第3行,对应startIndex=2
         start_index = row_index - 1  # 从0开始计数
         end_index = start_index + count  # 结束位置(不包含)
-        
-        # 确保 endIndex 不超过当前工作表的行数限制
-        if end_index > current_row_count:
-            print(f"警告:计算的endIndex({end_index})超过当前行数({current_row_count}),调整为追加模式")
-            return self.append_empty_rows(access_token, doctoken, sheetid, count)
-        
-        # 智能选择继承样式:插入第2行时继承后面的数据行样式,其他情况继承前面的样式
-        inherit_style = "AFTER" if row_index == 2 else "BEFORE"
-        
+
         payload = {
             "dimension": {
                 "sheetId": sheetid,
                 "majorDimension": "ROWS",
                 "startIndex": start_index,  # 从0开始计数
                 "endIndex": end_index  # 结束位置(不包含此行)
-            },
-            "inheritStyle": inherit_style  # 智能继承样式
+            }
         }
-        
+        if inherit_style is not None:
+            payload["inheritStyle"] = inherit_style
+
         try:
             resp = request("POST", url, headers, payload)
             print(f"在第{row_index}行前成功插入{count}行(startIndex={start_index}, endIndex={end_index}, inheritStyle={inherit_style})")
             return resp
         except Exception as e:
             print(f"在第{row_index}行前插入{count}行失败: {e}")
-            # 如果插入失败,尝试追加模式
-            print("尝试使用追加模式...")
-            return self.append_empty_rows(access_token, doctoken, sheetid, count)
+            raise  # 让调用方决定如何处理
 
     def insert_row_with_images(self, access_token, doctoken, sheetid, row, values, compress_image=True, grid_width=None, grid_height=None, border_width=3, border_color=(200, 200, 200)):
         """
@@ -952,8 +954,8 @@ class Client(object):
             "dimension": {
                 "sheetId": sheetid,
                 "majorDimension": "ROWS",
-                "startIndex": start_row,  # 从1开始计数,包含
-                "endIndex": end_row       # 从1开始计数,包含
+                "startIndex": start_row,  # 1-based inclusive
+                "endIndex": end_row       # 1-based inclusive
             }
         }
         
@@ -981,14 +983,18 @@ class Client(object):
 
     def append_empty_rows(self, access_token, doctoken, sheetid, count=1):
         """
-        在工作表末尾追加空行
-        
+        在工作表末尾追加空行(不继承样式)
+
+        常与 insert_rows_before 配合使用:先用本方法扩展 sheet 行数(绕过
+        insert_dimension_range 的 endIndex <= sheetMaxRowCount 限制),
+        再用 insert_rows_before 插入带样式的行。
+
         Args:
             access_token: 访问令牌
             doctoken: 表格token
             sheetid: 工作表ID
             count: 追加行数(默认1行)
-            
+
         Returns:
             操作结果
         """

+ 7 - 10
tasks/00_尾号实验/base_v3.sql

@@ -1,17 +1,14 @@
 WITH t_abmap AS
 (
-    SELECT "0" AS suffix, "实验组:ros损失函数优化" AS abcode
-    UNION ALL SELECT "5", "实验组:ros损失函数优化"
-    UNION ALL SELECT "f", "实验组:ros损失函数优化"
-    UNION ALL SELECT "4", "实验组:c1_rovn & 去掉vor实验"
-    UNION ALL SELECT "6", "实验组:c1_rovn & 去掉vor实验"
-    UNION ALL SELECT "7", "实验组:c1_rovn & 去掉vor实验"
-    UNION ALL SELECT "8", "实验组:c1_rovn"
-    UNION ALL SELECT "9", "实验组:c1_rovn"
-    UNION ALL SELECT "e", "实验组:c1_rovn"
+    SELECT "c" AS suffix, "前基线" AS abcode
+    UNION ALL SELECT "e", "实验组:解构特征排序str模型&召回"
+    UNION ALL SELECT "f", "实验组:解构特征排序str模型&召回"
+    UNION ALL SELECT "5", "实验组:解构特征排序str模型"
+    UNION ALL SELECT "d", "实验组:解构特征排序str模型"
+    UNION ALL SELECT "6", "实验组:bn_ros新损失函数"
+    UNION ALL SELECT "7", "实验组:bn_ros新损失函数"
     UNION ALL SELECT "a", "对照组"
     UNION ALL SELECT "b", "对照组"
-    UNION ALL SELECT "c", "对照组"
 )
 ,t_base AS
 (

+ 10 - 10
tasks/00_尾号实验/base_v4_v1.sql

@@ -1,16 +1,16 @@
 WITH t_abmap AS
 (
-    SELECT "3" AS suffix, "实验组:ros损失函数优化" AS abcode
-    UNION ALL SELECT "4", "实验组:c1_rovn & 去掉vor实验"
-    UNION ALL SELECT "5", "实验组:c1_rovn"
-    UNION ALL SELECT "d", "实验组:c1_rovn"
-    UNION ALL SELECT "e", "实验组:c1_rovn"
-    UNION ALL SELECT "f", "实验组:c1_rovn"
-    UNION ALL SELECT "6", "实验组:dn_rovn"
-    UNION ALL SELECT "7", "实验组:cn_rovn"
+    SELECT "c" AS suffix, "前基线" AS abcode
+    UNION ALL SELECT "e", "实验组:解构特征排序str模型&召回"
+    UNION ALL SELECT "f", "实验组:解构特征排序str模型&召回"
+    UNION ALL SELECT "5", "实验组:解构特征排序str模型"
+    UNION ALL SELECT "d", "实验组:解构特征排序str模型"
+    UNION ALL SELECT "3", "实验组:bn_ros新损失函数"
+    UNION ALL SELECT "4", "实验组:bn_ros新损失函数"
+    UNION ALL SELECT "6", "实验组:cn_rov"
+    UNION ALL SELECT "7", "实验组:cn_rov"
     UNION ALL SELECT "a", "对照组"
     UNION ALL SELECT "b", "对照组"
-    UNION ALL SELECT "c", "对照组"
 )
 ,t_base AS
 (
@@ -197,7 +197,7 @@ WITH t_abmap AS
                 -- FROM    loghubods.useractive_log_per5min
                 -- WHERE   dt BETWEEN CONCAT("${dt}","000000") AND CONCAT("${dt}","235500")
                 AND     apptype IN ("0")
-                AND     GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003') IN ("ab0","ab1","ab2","ab3","ab4","ab8","ab9")
+                AND     GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003') IN ("ab0","ab1","ab2","ab3","ab4","ab5", "ab6", "ab7", "ab8","ab9")
                 AND     GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003') NOT IN ("ab100")
             ) sub
     LEFT JOIN t_abmap m

+ 6 - 0
tasks/00_尾号实验/base_v5.json

@@ -0,0 +1,6 @@
+{
+  "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+  "sheet_id": "FG7sdV",
+  "sort": "dt:desc",
+  "cols": null
+}

+ 464 - 0
tasks/00_尾号实验/base_v5.sql

@@ -0,0 +1,464 @@
+-- base_v5:基于 base_v3 结构,切换 sample_all 表,追加模型预估 / COPC / 特征统计
+WITH t_abmap AS
+(
+    SELECT "c" AS suffix, "前基线" AS abcode
+    UNION ALL SELECT "e", "实验组:解构特征排序str模型&召回"
+    UNION ALL SELECT "f", "实验组:解构特征排序str模型&召回"
+    UNION ALL SELECT "5", "实验组:解构特征排序str模型"
+    UNION ALL SELECT "d", "实验组:解构特征排序str模型"
+    UNION ALL SELECT "6", "实验组:bn_ros新损失函数"
+    UNION ALL SELECT "7", "实验组:bn_ros新损失函数"
+    UNION ALL SELECT "a", "对照组"
+    UNION ALL SELECT "b", "对照组"
+)
+,t_base AS
+(
+    SELECT  sub.*
+            ,COALESCE(m.abcode,"other") AS abcode
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,SUBSTR(GET_JSON_OBJECT(extend,'$.rootsessionid'),LENGTH(GET_JSON_OBJECT(extend,'$.rootsessionid')),1) AS suffix
+                        ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                                WHEN page IN ("回流页","其他") THEN "非推荐"
+                                ELSE "其他"
+                        END AS page
+                        ,a.mid
+                        ,a.vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,flowpool
+                        -- ===== 模型预估值 =====
+                        ,CAST(GET_JSON_OBJECT(REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\",""),'$.fmRov') AS DOUBLE) AS a_str_pred
+                        ,1.22 * pow(CAST(GET_JSON_OBJECT(REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\",""),'$.NorXGBScore') AS DOUBLE), 1.15) AS bn_ros_pred
+                        -- ===== 模型打分 =====
+                        ,CAST(GET_JSON_OBJECT(extend_alg,'$.sortScore') AS DOUBLE) AS sortScore
+                        ,CAST(GET_JSON_OBJECT(extend_alg,'$.rovScore') AS DOUBLE) AS rovScore
+                        -- ===== b1 视频全局历史 =====
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.exp_168h') AS DOUBLE) AS b1_exp_168h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.return_1_uv_1h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b1_feature,'$.exp_1h') AS DOUBLE), 0) AS b1_rovn_1h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.return_1_uv_24h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b1_feature,'$.exp_24h') AS DOUBLE), 0) AS b1_rovn_24h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.rovn_168h') AS DOUBLE) AS b1_rovn_168h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.is_share_168h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b1_feature,'$.exp_168h') AS DOUBLE), 0) AS b1_str_168h
+                        -- ===== b2 推荐场景历史 =====
+                        ,CAST(GET_JSON_OBJECT(b2_feature,'$.return_n_uv_168h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b2_feature,'$.exp_168h') AS DOUBLE), 0) AS b2_rovn_168h
+                        ,CAST(GET_JSON_OBJECT(b2_feature,'$.exp_168h') AS DOUBLE) AS b2_exp_168h
+                        -- ===== b12 超长期历史 =====
+                        ,CAST(GET_JSON_OBJECT(b12_feature,'$.return_n_uv_30d') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b12_feature,'$.exp_30d') AS DOUBLE), 0) AS b12_rovn_30d
+                        ,CAST(GET_JSON_OBJECT(b12_feature,'$.exp_30d') AS DOUBLE) AS b12_exp_30d
+                        -- ===== c1 用户全局特征 =====
+                        ,CAST(GET_JSON_OBJECT(c1_feature,'$.rovn_168h') AS DOUBLE) AS c1_rovn_168h
+                        ,CAST(GET_JSON_OBJECT(c1_feature,'$.exp_168h') AS DOUBLE) AS c1_exp_168h
+                        -- ===== b8 类目基线 =====
+                        ,CAST(GET_JSON_OBJECT(b8_feature,'$.return_n_uv_168h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b8_feature,'$.exp_168h') AS DOUBLE), 0) AS b8_rovn_168h
+                        -- ===== v1 视频元信息 =====
+                        ,GET_JSON_OBJECT(v1_feature,'$.channel') AS channel
+                        ,GET_JSON_OBJECT(v1_feature,'$.merge_first_level_cate') AS merge_cate1
+                        ,CAST(GET_JSON_OBJECT(v1_feature,'$.total_time') AS DOUBLE) AS total_time
+                        -- ===== c1/cn 链路 =====
+                        ,cc.cn
+                        ,cc.c1
+                        -- ===== d1/dn 链路 =====
+                        ,dd.dn
+                        ,dd.d1
+                FROM    loghubods.dwd_recsys_alg_sample_all_20250212 a
+                LEFT JOIN   (
+                                -- c1/cn:分享后被点击的回流 UV
+                                SELECT  a.machinecode AS mid
+                                        ,a.subsessionid
+                                        ,a.videoid AS vid
+                                        ,COUNT(DISTINCT CASE WHEN b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS cn
+                                        ,COUNT(DISTINCT CASE WHEN b2.sharedepth = 1 AND b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS c1
+                                FROM    (
+                                            SELECT  DISTINCT machinecode
+                                                    ,shareobjectid AS videoid
+                                                    ,recomTraceId
+                                                    ,subsessionid
+                                                    ,sharedepth
+                                                    ,shareid
+                                            FROM    loghubods.user_share_log
+                                            WHERE   dt = '${dt}'
+                                            AND     topic = 'share'
+                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                        ) a
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b
+                                ON      a.shareid = b.rootshareid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,shareobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'share'
+                                                AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                            ) b1
+                                ON      b.machinecode = b1.machinecode
+                                AND     b.subsessionid = b1.subsessionid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b2
+                                ON      b1.shareid = b2.rootshareid
+                                GROUP BY a.machinecode
+                                         ,a.subsessionid
+                                         ,a.videoid
+                            ) cc
+                ON      a.mid = cc.mid
+                AND     a.subsessionid = cc.subsessionid
+                AND     a.vid = cc.vid
+                LEFT JOIN   (
+                                -- d1/dn:下一条视频带来的回流
+                                SELECT  *
+                                        ,LAG(回流,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS dn
+                                        ,LAG(回流1,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS d1
+                                FROM    (
+                                            SELECT  a.mid AS mid
+                                                    ,a.subsessionid
+                                                    ,a.videoid AS vid
+                                                    ,COUNT(DISTINCT b.shareid) AS 分享次数
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode THEN c.machinecode END) AS 回流
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode AND c.sharedepth = 1 THEN c.machinecode END) AS 回流1
+                                                    ,ROW_NUMBER() OVER (PARTITION BY a.subsessionid ORDER BY a.logtimestamp ASC) AS rn
+                                            FROM    (
+                                                        SELECT  *
+                                                        FROM    (
+                                                                    SELECT  DISTINCT mid
+                                                                            ,subsessionid
+                                                                            ,videoid
+                                                                            ,logtimestamp
+                                                                            ,ROW_NUMBER() OVER (PARTITION BY mid,subsessionid,videoid ORDER BY logtimestamp ASC) AS rn
+                                                                    FROM    loghubods.video_action_log_rp
+                                                                    WHERE   dt = '${dt}'
+                                                                    AND     businesstype = 'videoView'
+                                                                    AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                                )
+                                                        WHERE   rn = 1
+                                                    ) a
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,shareobjectid AS videoid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,shareid
+                                                                    ,clienttimestamp
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'share'
+                                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                        ) b
+                                            ON      a.mid = b.machinecode
+                                            AND     a.subsessionid = b.subsessionid
+                                            AND     a.videoid = b.videoid
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,clickobjectid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,rootshareid
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'click'
+                                                        ) c
+                                            ON      b.shareid = c.rootshareid
+                                            GROUP BY a.mid
+                                                     ,a.subsessionid
+                                                     ,a.videoid
+                                                     ,a.logtimestamp
+                                        )
+                            ) dd
+                ON      a.mid = dd.mid
+                AND     a.subsessionid = dd.subsessionid
+                AND     a.vid = dd.vid
+                WHERE   dt="${dt}"
+                AND     apptype IN ("4")
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+                AND     abcode NOT IN ("ab100")
+            ) sub
+    LEFT JOIN t_abmap m
+    ON      sub.apptype = "4"
+    AND     sub.suffix = m.suffix
+)
+-- dau2:按单尾号聚合
+,t_dau2_bucket AS
+(
+    SELECT  SUBSTR(sub.dt,1,8) AS dt
+            ,sub.apptype
+            ,COALESCE(m.abcode,"other") AS abcode
+            ,sub.suffix
+            ,COUNT(DISTINCT sub.machinecode) AS dau2
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,machinecode
+                        ,SUBSTR(GET_JSON_OBJECT(extparams,'$.rootSessionId'),LENGTH(GET_JSON_OBJECT(extparams,'$.rootSessionId')),1) AS suffix
+                FROM    loghubods.useractive_log
+                WHERE   dt="${dt}"
+                AND     apptype IN ("4")
+            ) sub
+    LEFT JOIN t_abmap m
+    ON      sub.apptype = "4"
+    AND     sub.suffix = m.suffix
+    GROUP BY SUBSTR(sub.dt,1,8)
+             ,sub.apptype
+             ,COALESCE(m.abcode,"other")
+             ,sub.suffix
+)
+-- dau2:按实验组求尾号均值
+,t_dau2 AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,AVG(dau2) AS dau2
+    FROM    t_dau2_bucket
+    GROUP BY dt
+             ,apptype
+             ,abcode
+)
+-- 按单尾号聚合(尾号内 UV 去重)
+,t_bucket AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            -- ===== 原有业务效率 =====
+            ,COALESCE(COUNT(1) / COUNT(DISTINCT mid),0) AS exp_per_dau
+            ,COALESCE(SUM(is_share) / COUNT(1),0) AS str_one
+            ,COALESCE(SUM(return_n_uv) / SUM(is_share),0) AS ros_one
+            ,COALESCE(SUM(share_cnt) / COUNT(1),0) AS str
+            ,COALESCE(SUM(return_n_uv) / SUM(share_cnt),0) AS ros
+            ,COALESCE(SUM(is_return_1) / COUNT(1),0) AS str_plus
+            ,COALESCE(SUM(return_n_uv) / SUM(is_return_1),0) AS ros_minus
+            ,COALESCE(SUM(return_n_uv) / COUNT(1),0) AS bn_rov
+            ,COALESCE(SUM(c1) / COUNT(1),0) AS c1_rov
+            ,COALESCE(SUM(cn) / COUNT(1),0) AS cn_rov
+            ,COALESCE(SUM(d1) / COUNT(1),0) AS d1_rov
+            ,COALESCE(SUM(dn) / COUNT(1),0) AS dn_rov
+            ,COALESCE(SUM(new_exposure_cnt) / COUNT(1),0) AS vovh24
+            -- ===== 模型对应真实值 =====
+            ,COALESCE(SUM(is_return_noself) / COUNT(1),0) AS a_strx
+            ,COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself),0),0) AS bn_rosx
+            ,COALESCE(SUM(return_n_uv_noself) / COUNT(1),0) AS bn_rovx
+            -- ===== 模型预估值 =====
+            ,ROUND(AVG(a_str_pred),6) AS a_strx_pred
+            ,ROUND(AVG(CASE WHEN is_return_noself = 1 THEN bn_ros_pred END),6) AS bn_rosx_pred
+            ,ROUND(AVG(a_str_pred * bn_ros_pred),6) AS bn_rovx_pred
+            -- ===== COPC =====
+            ,ROUND((SUM(is_return_noself) / COUNT(1)) / NULLIF(AVG(a_str_pred), 0), 4) AS a_strx_copc
+            ,ROUND(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself),0),0) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN bn_ros_pred END), 0), 4) AS bn_rosx_copc
+            ,ROUND((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(a_str_pred * bn_ros_pred), 0), 4) AS bn_rovx_copc
+            -- ===== 模型打分 =====
+            ,ROUND(AVG(sortScore),4) AS avg_sortScore
+            ,ROUND(AVG(rovScore),4) AS avg_rovScore
+            -- ===== 特征均值 =====
+            ,ROUND(AVG(b1_exp_168h),0) AS b1_exp_168h
+            ,ROUND(AVG(b1_rovn_1h),6) AS b1_rovn_1h
+            ,ROUND(AVG(b1_rovn_24h),6) AS b1_rovn_24h
+            ,ROUND(AVG(b1_rovn_168h),6) AS b1_rovn_168h
+            ,ROUND(AVG(b1_str_168h),6) AS b1_str_168h
+            ,ROUND(AVG(b2_rovn_168h),6) AS b2_rovn_168h
+            ,ROUND(AVG(b2_exp_168h),0) AS b2_exp_168h
+            ,ROUND(AVG(b12_rovn_30d),6) AS b12_rovn_30d
+            ,ROUND(AVG(b12_exp_30d),0) AS b12_exp_30d
+            ,ROUND(AVG(c1_rovn_168h),6) AS avg_c1_rovn_168h
+            ,ROUND(AVG(c1_exp_168h),0) AS avg_c1_exp_168h
+            ,ROUND(AVG(b8_rovn_168h),6) AS b8_rovn_168h
+            ,ROUND(AVG(total_time),0) AS avg_total_time
+            -- ===== 原有计数 =====
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+            ,COALESCE(SUM(is_return_noself),0) AS is_return_noself
+            ,COALESCE(SUM(cn),0) AS cn
+            ,COALESCE(SUM(c1),0) AS c1
+            ,COALESCE(SUM(dn),0) AS dn
+            ,COALESCE(SUM(d1),0) AS d1
+    FROM    t_base
+    WHERE   page = "推荐"
+    AND     abcode != "other"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+)
+-- Experiment-group roll-up: each metric is the plain average of the per-suffix values in t_bucket.
+,t_metrics AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            -- ===== original business-efficiency metrics =====
+            ,ROUND(AVG(exp_per_dau), 2) AS exp_per_dau
+            ,ROUND(AVG(str_one), 6) AS str_one
+            ,ROUND(AVG(ros_one), 6) AS ros_one
+            ,ROUND(AVG(str), 6) AS str
+            ,ROUND(AVG(ros), 6) AS ros
+            ,ROUND(AVG(str_plus), 6) AS str_plus
+            ,ROUND(AVG(ros_minus), 6) AS ros_minus
+            ,ROUND(AVG(bn_rov), 6) AS bn_rov
+            ,ROUND(AVG(c1_rov), 6) AS c1_rov
+            ,ROUND(AVG(cn_rov), 6) AS cn_rov
+            ,ROUND(AVG(d1_rov), 6) AS d1_rov
+            ,ROUND(AVG(dn_rov), 6) AS dn_rov
+            ,ROUND(AVG(vovh24), 6) AS vovh24
+            -- ===== observed values matching the model targets =====
+            ,ROUND(AVG(a_strx), 6) AS a_strx
+            ,ROUND(AVG(bn_rosx), 6) AS bn_rosx
+            ,ROUND(AVG(bn_rovx), 6) AS bn_rovx
+            -- ===== model predictions =====
+            ,ROUND(AVG(a_strx_pred), 6) AS a_strx_pred
+            ,ROUND(AVG(bn_rosx_pred), 6) AS bn_rosx_pred
+            ,ROUND(AVG(bn_rovx_pred), 6) AS bn_rovx_pred
+            -- ===== COPC (average of the per-suffix ratios, not a ratio of sums) =====
+            ,ROUND(AVG(a_strx_copc), 4) AS a_strx_copc
+            ,ROUND(AVG(bn_rosx_copc), 4) AS bn_rosx_copc
+            ,ROUND(AVG(bn_rovx_copc), 4) AS bn_rovx_copc
+            -- ===== model scores =====
+            ,ROUND(AVG(avg_sortScore), 4) AS avg_sortScore
+            ,ROUND(AVG(avg_rovScore), 4) AS avg_rovScore
+            -- ===== feature means =====
+            ,ROUND(AVG(b1_exp_168h), 0) AS b1_exp_168h
+            ,ROUND(AVG(b1_rovn_1h), 6) AS b1_rovn_1h
+            ,ROUND(AVG(b1_rovn_24h), 6) AS b1_rovn_24h
+            ,ROUND(AVG(b1_rovn_168h), 6) AS b1_rovn_168h
+            ,ROUND(AVG(b1_str_168h), 6) AS b1_str_168h
+            ,ROUND(AVG(b2_rovn_168h), 6) AS b2_rovn_168h
+            ,ROUND(AVG(b2_exp_168h), 0) AS b2_exp_168h
+            ,ROUND(AVG(b12_rovn_30d), 6) AS b12_rovn_30d
+            ,ROUND(AVG(b12_exp_30d), 0) AS b12_exp_30d
+            ,ROUND(AVG(avg_c1_rovn_168h), 6) AS avg_c1_rovn_168h
+            ,ROUND(AVG(avg_c1_exp_168h), 0) AS avg_c1_exp_168h
+            ,ROUND(AVG(b8_rovn_168h), 6) AS b8_rovn_168h
+            ,ROUND(AVG(avg_total_time), 0) AS avg_total_time
+            -- ===== original counts (per-suffix averages, left unrounded) =====
+            ,AVG(dau) AS dau
+            ,AVG(exp) AS exp
+            ,AVG(is_share) AS is_share
+            ,AVG(share_cnt) AS share_cnt
+            ,AVG(is_return_1) AS is_return_1
+            ,AVG(return_n_uv) AS return_n_uv
+            ,AVG(viewh24) AS viewh24
+            ,AVG(return_n_uv_noself) AS return_n_uv_noself
+            ,AVG(is_return_noself) AS is_return_noself
+            ,AVG(cn) AS cn
+            ,AVG(c1) AS c1
+            ,AVG(dn) AS dn
+            ,AVG(d1) AS d1
+            -- comma-joined list of the distinct suffixes that fed this group
+            ,WM_CONCAT(DISTINCT ',', suffix) AS suffix
+    FROM    t_bucket
+    GROUP BY dt, apptype, abcode
+)
+-- Final result: one row per (dt, apptype, abcode) from t_metrics, with dau2 attached from t_dau2.
+SELECT  tm.dt
+        ,tm.apptype
+        ,tm.abcode
+        ,tm.suffix
+        -- original business-efficiency metrics
+        ,tm.exp_per_dau
+        ,tm.str_one
+        ,tm.ros_one
+        ,tm.str
+        ,tm.ros
+        ,tm.str_plus
+        ,tm.ros_minus
+        ,tm.bn_rov
+        ,tm.c1_rov
+        ,tm.cn_rov
+        ,tm.d1_rov
+        ,tm.dn_rov
+        ,tm.vovh24
+        -- observed values matching the model targets
+        ,tm.a_strx
+        ,tm.bn_rosx
+        ,tm.bn_rovx
+        -- model predictions
+        ,tm.a_strx_pred
+        ,tm.bn_rosx_pred
+        ,tm.bn_rovx_pred
+        -- COPC
+        ,tm.a_strx_copc
+        ,tm.bn_rosx_copc
+        ,tm.bn_rovx_copc
+        -- model scores
+        ,tm.avg_sortScore
+        ,tm.avg_rovScore
+        -- features
+        ,tm.b1_exp_168h
+        ,tm.b1_rovn_1h
+        ,tm.b1_rovn_24h
+        ,tm.b1_rovn_168h
+        ,tm.b1_str_168h
+        ,tm.b2_rovn_168h
+        ,tm.b2_exp_168h
+        ,tm.b12_rovn_30d
+        ,tm.b12_exp_30d
+        ,tm.avg_c1_rovn_168h
+        ,tm.avg_c1_exp_168h
+        ,tm.b8_rovn_168h
+        ,tm.avg_total_time
+        -- counts
+        ,tm.dau
+        ,tm.exp
+        ,tm.is_share
+        ,tm.share_cnt
+        ,tm.is_return_1
+        ,tm.return_n_uv
+        ,tm.viewh24
+        ,tm.return_n_uv_noself
+        ,tm.is_return_noself
+        ,tm.cn
+        ,tm.c1
+        ,tm.dn
+        ,tm.d1
+        ,du.dau2
+FROM    t_metrics tm
+LEFT JOIN t_dau2 du
+ON      tm.dt = du.dt
+AND     tm.apptype = du.apptype
+AND     tm.abcode = du.abcode
+ORDER BY tm.dt DESC, tm.apptype, tm.abcode
+;

+ 6 - 0
tasks/00_尾号实验/base_v5_v1.json

@@ -0,0 +1,6 @@
+{
+  "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+  "sheet_id": "iLCYB8",
+  "sort": "dt:desc",
+  "cols": null
+}

+ 465 - 0
tasks/00_尾号实验/base_v5_v1.sql

@@ -0,0 +1,465 @@
+-- base_v5_v1: same structure as base_v4_v1 (apptype=0), with model predictions / COPC / feature statistics appended.
+-- t_abmap: lookup from the suffix character to the experiment-group label.
+WITH t_abmap AS
+(
+    SELECT  suffix
+            ,abcode
+    FROM    VALUES
+            ("c", "前基线")
+            ,("e", "实验组:解构特征排序str模型&召回")
+            ,("f", "实验组:解构特征排序str模型&召回")
+            ,("5", "实验组:解构特征排序str模型")
+            ,("d", "实验组:解构特征排序str模型")
+            ,("3", "实验组:bn_ros新损失函数")
+            ,("4", "实验组:bn_ros新损失函数")
+            ,("6", "实验组:cn_rov")
+            ,("7", "实验组:cn_rov")
+            ,("a", "对照组")
+            ,("b", "对照组")
+            suffix_map (suffix, abcode)
+)
+-- t_base: sample-table rows for '${dt}' (apptype "0"), each enriched with
+--   * suffix  - the last character of extend.rootsessionid,
+--   * page    - the raw page value collapsed into "推荐" / "非推荐" / "其他",
+--   * model predictions, model scores and feature values parsed out of JSON columns,
+--   * cn/c1 (from subquery cc) and dn/d1 (from subquery dd),
+--   * abcode  - the experiment-group label looked up in t_abmap by suffix ("other" when unmapped).
+,t_base AS
+(
+    SELECT  sub.*
+            ,COALESCE(m.abcode,"other") AS abcode
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        -- SUBSTR(x, LENGTH(x), 1) keeps only the last character of rootsessionid
+                        ,SUBSTR(GET_JSON_OBJECT(extend,'$.rootsessionid'),LENGTH(GET_JSON_OBJECT(extend,'$.rootsessionid')),1) AS suffix
+                        -- the WHERE clause below only admits the six page values listed here, so the ELSE branch is a fallback
+                        ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                                WHEN page IN ("回流页","其他") THEN "非推荐"
+                                ELSE "其他"
+                        END AS page
+                        ,a.mid
+                        ,a.vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,flowpool
+                        -- ===== model predictions =====
+                        -- scoresMap is read as a JSON string, backslashes are stripped, then it is parsed again for the score key
+                        ,CAST(GET_JSON_OBJECT(REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\",""),'$.fmRov') AS DOUBLE) AS a_str_pred
+                        -- NOTE(review): 1.22 and 1.15 look like calibration constants for NorXGBScore; their origin is not visible here - confirm
+                        ,1.22 * pow(CAST(GET_JSON_OBJECT(REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\",""),'$.NorXGBScore') AS DOUBLE), 1.15) AS bn_ros_pred
+                        -- ===== model scores =====
+                        ,CAST(GET_JSON_OBJECT(extend_alg,'$.sortScore') AS DOUBLE) AS sortScore
+                        ,CAST(GET_JSON_OBJECT(extend_alg,'$.rovScore') AS DOUBLE) AS rovScore
+                        -- ===== b1: video global history =====
+                        -- ratio features use NULLIF(denominator, 0), so a zero denominator gives NULL instead of a division by zero
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.exp_168h') AS DOUBLE) AS b1_exp_168h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.return_1_uv_1h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b1_feature,'$.exp_1h') AS DOUBLE), 0) AS b1_rovn_1h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.return_1_uv_24h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b1_feature,'$.exp_24h') AS DOUBLE), 0) AS b1_rovn_24h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.rovn_168h') AS DOUBLE) AS b1_rovn_168h
+                        ,CAST(GET_JSON_OBJECT(b1_feature,'$.is_share_168h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b1_feature,'$.exp_168h') AS DOUBLE), 0) AS b1_str_168h
+                        -- ===== b2: recommendation-scene history =====
+                        ,CAST(GET_JSON_OBJECT(b2_feature,'$.return_n_uv_168h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b2_feature,'$.exp_168h') AS DOUBLE), 0) AS b2_rovn_168h
+                        ,CAST(GET_JSON_OBJECT(b2_feature,'$.exp_168h') AS DOUBLE) AS b2_exp_168h
+                        -- ===== b12: very-long-term history =====
+                        ,CAST(GET_JSON_OBJECT(b12_feature,'$.return_n_uv_30d') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b12_feature,'$.exp_30d') AS DOUBLE), 0) AS b12_rovn_30d
+                        ,CAST(GET_JSON_OBJECT(b12_feature,'$.exp_30d') AS DOUBLE) AS b12_exp_30d
+                        -- ===== c1: user global features =====
+                        ,CAST(GET_JSON_OBJECT(c1_feature,'$.rovn_168h') AS DOUBLE) AS c1_rovn_168h
+                        ,CAST(GET_JSON_OBJECT(c1_feature,'$.exp_168h') AS DOUBLE) AS c1_exp_168h
+                        -- ===== b8: category baseline =====
+                        ,CAST(GET_JSON_OBJECT(b8_feature,'$.return_n_uv_168h') AS DOUBLE)
+                            / NULLIF(CAST(GET_JSON_OBJECT(b8_feature,'$.exp_168h') AS DOUBLE), 0) AS b8_rovn_168h
+                        -- ===== v1: video meta information =====
+                        ,GET_JSON_OBJECT(v1_feature,'$.channel') AS channel
+                        ,GET_JSON_OBJECT(v1_feature,'$.merge_first_level_cate') AS merge_cate1
+                        ,CAST(GET_JSON_OBJECT(v1_feature,'$.total_time') AS DOUBLE) AS total_time
+                        -- ===== c1/cn chain (from subquery cc) =====
+                        ,cc.cn
+                        ,cc.c1
+                        -- ===== d1/dn chain (from subquery dd) =====
+                        ,dd.dn
+                        ,dd.d1
+                FROM    loghubods.dwd_recsys_alg_sample_all_20250212 a
+                -- cc: one row per (machinecode, subsessionid, shared videoid) of a share event.
+                --   a  = share events on the listed page sources
+                --   b  = click events whose rootshareid is a's shareid
+                --   b1 = share events made by the same machinecode + subsessionid as click b
+                --   b2 = click events whose rootshareid is b1's shareid
+                --   cn = distinct b2 machinecodes different from the b1 sharer; c1 = the same restricted to b2.sharedepth = 1
+                LEFT JOIN   (
+                                SELECT  a.machinecode AS mid
+                                        ,a.subsessionid
+                                        ,a.videoid AS vid
+                                        ,COUNT(DISTINCT CASE WHEN b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS cn
+                                        ,COUNT(DISTINCT CASE WHEN b2.sharedepth = 1 AND b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS c1
+                                FROM    (
+                                            SELECT  DISTINCT machinecode
+                                                    ,shareobjectid AS videoid
+                                                    ,recomTraceId
+                                                    ,subsessionid
+                                                    ,sharedepth
+                                                    ,shareid
+                                            FROM    loghubods.user_share_log
+                                            WHERE   dt = '${dt}'
+                                            AND     topic = 'share'
+                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                        ) a
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b
+                                ON      a.shareid = b.rootshareid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,shareobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'share'
+                                                AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                            ) b1
+                                ON      b.machinecode = b1.machinecode
+                                AND     b.subsessionid = b1.subsessionid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b2
+                                ON      b1.shareid = b2.rootshareid
+                                GROUP BY a.machinecode
+                                         ,a.subsessionid
+                                         ,a.videoid
+                            ) cc
+                ON      a.mid = cc.mid
+                AND     a.subsessionid = cc.subsessionid
+                AND     a.vid = cc.vid
+                -- dd: one row per first view of a (mid, subsessionid, videoid).
+                --   The inner query counts, for that view, the distinct shares of the video in the same subsession
+                --   and the distinct clickers on those shares other than the sharer (all depths / sharedepth = 1 only).
+                --   LAG(..., 1, 0) ordered by rn DESC reads the row with the next-higher rn, i.e. the view that
+                --   follows in logtimestamp order, so dn/d1 are the following view's counts (0 for the last view).
+                --   NOTE(review): rn is partitioned by subsessionid only while LAG partitions by (mid, subsessionid);
+                --   the relative order inside a LAG partition is still by logtimestamp.
+                LEFT JOIN   (
+                                SELECT  *
+                                        ,LAG(回流,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS dn
+                                        ,LAG(回流1,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS d1
+                                FROM    (
+                                            SELECT  a.mid AS mid
+                                                    ,a.subsessionid
+                                                    ,a.videoid AS vid
+                                                    ,COUNT(DISTINCT b.shareid) AS 分享次数
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode THEN c.machinecode END) AS 回流
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode AND c.sharedepth = 1 THEN c.machinecode END) AS 回流1
+                                                    ,ROW_NUMBER() OVER (PARTITION BY a.subsessionid ORDER BY a.logtimestamp ASC) AS rn
+                                            FROM    (
+                                                        -- keep only the earliest videoView row per (mid, subsessionid, videoid)
+                                                        SELECT  *
+                                                        FROM    (
+                                                                    SELECT  DISTINCT mid
+                                                                            ,subsessionid
+                                                                            ,videoid
+                                                                            ,logtimestamp
+                                                                            ,ROW_NUMBER() OVER (PARTITION BY mid,subsessionid,videoid ORDER BY logtimestamp ASC) AS rn
+                                                                    FROM    loghubods.video_action_log_rp
+                                                                    WHERE   dt = '${dt}'
+                                                                    AND     businesstype = 'videoView'
+                                                                    AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                                )
+                                                        WHERE   rn = 1
+                                                    ) a
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,shareobjectid AS videoid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,shareid
+                                                                    ,clienttimestamp
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'share'
+                                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                        ) b
+                                            ON      a.mid = b.machinecode
+                                            AND     a.subsessionid = b.subsessionid
+                                            AND     a.videoid = b.videoid
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,clickobjectid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,rootshareid
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'click'
+                                                        ) c
+                                            ON      b.shareid = c.rootshareid
+                                            GROUP BY a.mid
+                                                     ,a.subsessionid
+                                                     ,a.videoid
+                                                     ,a.logtimestamp
+                                        )
+                            ) dd
+                ON      a.mid = dd.mid
+                AND     a.subsessionid = dd.subsessionid
+                AND     a.vid = dd.vid
+                WHERE   dt="${dt}"
+                AND     apptype IN ("0")
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+                -- NOTE(review): this list omits ab5/ab6/ab7, which the t_dau2_bucket CTE in this file does include - confirm intended
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab8","ab9")
+                -- redundant with the IN list above ("ab100" is not in it); kept as written
+                AND     abcode NOT IN ("ab100")
+            ) sub
+    -- map the suffix to its experiment group; the mapping is only applied when apptype is "0"
+    LEFT JOIN t_abmap m
+    ON      sub.apptype = "0"
+    AND     sub.suffix = m.suffix
+)
+-- dau2 per single suffix: distinct machinecode count from useractive_log,
+-- grouped by (8-char dt prefix, apptype, experiment group, suffix).
+-- NOTE(review): the ab_test003 list here also contains ab5/ab6/ab7, which the t_base filter omits - confirm intended.
+,t_dau2_bucket AS
+(
+    SELECT  SUBSTR(act.dt, 1, 8) AS dt
+            ,act.apptype
+            ,COALESCE(grp.abcode, "other") AS abcode
+            ,act.suffix
+            ,COUNT(DISTINCT act.machinecode) AS dau2
+    FROM    (
+                -- suffix = last character of extparams.rootSessionId
+                SELECT  dt
+                        ,apptype
+                        ,machinecode
+                        ,SUBSTR(GET_JSON_OBJECT(extparams, '$.rootSessionId'), LENGTH(GET_JSON_OBJECT(extparams, '$.rootSessionId')), 1) AS suffix
+                FROM    loghubods.useractive_log
+                WHERE   dt = "${dt}"
+                AND     apptype IN ("0")
+                AND     GET_JSON_OBJECT(extparams, '$.eventInfos.ab_test003') IN ("ab0", "ab1", "ab2", "ab3", "ab4", "ab5", "ab6", "ab7", "ab8", "ab9")
+                AND     GET_JSON_OBJECT(extparams, '$.eventInfos.ab_test003') NOT IN ("ab100")
+            ) act
+    LEFT JOIN t_abmap grp
+    ON      act.apptype = "0"
+    AND     act.suffix = grp.suffix
+    GROUP BY SUBSTR(act.dt, 1, 8)
+             ,act.apptype
+             ,COALESCE(grp.abcode, "other")
+             ,act.suffix
+)
+-- dau2 per experiment group: plain average of the per-suffix dau2 values.
+,t_dau2 AS
+(
+    SELECT  bkt.dt
+            ,bkt.apptype
+            ,bkt.abcode
+            ,AVG(bkt.dau2) AS dau2
+    FROM    t_dau2_bucket bkt
+    GROUP BY bkt.dt, bkt.apptype, bkt.abcode
+)
+-- Per-suffix aggregation of the "推荐" rows of t_base (UVs are de-duplicated within each suffix).
+-- Ratios whose denominator is an aggregate that can be 0 (COUNT(DISTINCT mid), SUM(is_share),
+-- SUM(share_cnt), SUM(is_return_1), SUM(is_return_noself)) are guarded with NULLIF(..., 0), so an
+-- empty denominator yields 0 via COALESCE instead of relying on the engine's divide-by-zero behaviour.
+-- COUNT(1) needs no guard: every group produced by GROUP BY holds at least one row.
+,t_bucket AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            -- ===== original business-efficiency metrics =====
+            ,COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid),0),0) AS exp_per_dau
+            ,COALESCE(SUM(is_share) / COUNT(1),0) AS str_one
+            ,COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share),0),0) AS ros_one
+            ,COALESCE(SUM(share_cnt) / COUNT(1),0) AS str
+            ,COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt),0),0) AS ros
+            ,COALESCE(SUM(is_return_1) / COUNT(1),0) AS str_plus
+            ,COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1),0),0) AS ros_minus
+            ,COALESCE(SUM(return_n_uv) / COUNT(1),0) AS bn_rov
+            ,COALESCE(SUM(c1) / COUNT(1),0) AS c1_rov
+            ,COALESCE(SUM(cn) / COUNT(1),0) AS cn_rov
+            ,COALESCE(SUM(d1) / COUNT(1),0) AS d1_rov
+            ,COALESCE(SUM(dn) / COUNT(1),0) AS dn_rov
+            ,COALESCE(SUM(new_exposure_cnt) / COUNT(1),0) AS vovh24
+            -- ===== observed values matching the model targets =====
+            ,COALESCE(SUM(is_return_noself) / COUNT(1),0) AS a_strx
+            ,COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself),0),0) AS bn_rosx
+            ,COALESCE(SUM(return_n_uv_noself) / COUNT(1),0) AS bn_rovx
+            -- ===== model predictions =====
+            ,ROUND(AVG(a_str_pred),6) AS a_strx_pred
+            -- bn_ros prediction is averaged only over rows with is_return_noself = 1, matching bn_rosx's denominator
+            ,ROUND(AVG(CASE WHEN is_return_noself = 1 THEN bn_ros_pred END),6) AS bn_rosx_pred
+            ,ROUND(AVG(a_str_pred * bn_ros_pred),6) AS bn_rovx_pred
+            -- ===== COPC: observed value divided by predicted value (NULL when the prediction average is 0 or NULL) =====
+            -- NOTE(review): AVG skips rows whose prediction is NULL while COUNT(1) counts every row - confirm this is acceptable
+            ,ROUND((SUM(is_return_noself) / COUNT(1)) / NULLIF(AVG(a_str_pred), 0), 4) AS a_strx_copc
+            ,ROUND(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself),0),0) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN bn_ros_pred END), 0), 4) AS bn_rosx_copc
+            ,ROUND((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(a_str_pred * bn_ros_pred), 0), 4) AS bn_rovx_copc
+            -- ===== model scores =====
+            ,ROUND(AVG(sortScore),4) AS avg_sortScore
+            ,ROUND(AVG(rovScore),4) AS avg_rovScore
+            -- ===== feature means =====
+            ,ROUND(AVG(b1_exp_168h),0) AS b1_exp_168h
+            ,ROUND(AVG(b1_rovn_1h),6) AS b1_rovn_1h
+            ,ROUND(AVG(b1_rovn_24h),6) AS b1_rovn_24h
+            ,ROUND(AVG(b1_rovn_168h),6) AS b1_rovn_168h
+            ,ROUND(AVG(b1_str_168h),6) AS b1_str_168h
+            ,ROUND(AVG(b2_rovn_168h),6) AS b2_rovn_168h
+            ,ROUND(AVG(b2_exp_168h),0) AS b2_exp_168h
+            ,ROUND(AVG(b12_rovn_30d),6) AS b12_rovn_30d
+            ,ROUND(AVG(b12_exp_30d),0) AS b12_exp_30d
+            ,ROUND(AVG(c1_rovn_168h),6) AS avg_c1_rovn_168h
+            ,ROUND(AVG(c1_exp_168h),0) AS avg_c1_exp_168h
+            ,ROUND(AVG(b8_rovn_168h),6) AS b8_rovn_168h
+            ,ROUND(AVG(total_time),0) AS avg_total_time
+            -- ===== original counts =====
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+            ,COALESCE(SUM(is_return_noself),0) AS is_return_noself
+            ,COALESCE(SUM(cn),0) AS cn
+            ,COALESCE(SUM(c1),0) AS c1
+            ,COALESCE(SUM(dn),0) AS dn
+            ,COALESCE(SUM(d1),0) AS d1
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+)
+-- Experiment-group roll-up: each metric is the plain average of the per-suffix values in t_bucket.
+,t_metrics AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            -- ===== original business-efficiency metrics =====
+            ,ROUND(AVG(exp_per_dau), 2) AS exp_per_dau
+            ,ROUND(AVG(str_one), 6) AS str_one
+            ,ROUND(AVG(ros_one), 6) AS ros_one
+            ,ROUND(AVG(str), 6) AS str
+            ,ROUND(AVG(ros), 6) AS ros
+            ,ROUND(AVG(str_plus), 6) AS str_plus
+            ,ROUND(AVG(ros_minus), 6) AS ros_minus
+            ,ROUND(AVG(bn_rov), 6) AS bn_rov
+            ,ROUND(AVG(c1_rov), 6) AS c1_rov
+            ,ROUND(AVG(cn_rov), 6) AS cn_rov
+            ,ROUND(AVG(d1_rov), 6) AS d1_rov
+            ,ROUND(AVG(dn_rov), 6) AS dn_rov
+            ,ROUND(AVG(vovh24), 6) AS vovh24
+            -- ===== observed values matching the model targets =====
+            ,ROUND(AVG(a_strx), 6) AS a_strx
+            ,ROUND(AVG(bn_rosx), 6) AS bn_rosx
+            ,ROUND(AVG(bn_rovx), 6) AS bn_rovx
+            -- ===== model predictions =====
+            ,ROUND(AVG(a_strx_pred), 6) AS a_strx_pred
+            ,ROUND(AVG(bn_rosx_pred), 6) AS bn_rosx_pred
+            ,ROUND(AVG(bn_rovx_pred), 6) AS bn_rovx_pred
+            -- ===== COPC (average of the per-suffix ratios, not a ratio of sums) =====
+            ,ROUND(AVG(a_strx_copc), 4) AS a_strx_copc
+            ,ROUND(AVG(bn_rosx_copc), 4) AS bn_rosx_copc
+            ,ROUND(AVG(bn_rovx_copc), 4) AS bn_rovx_copc
+            -- ===== model scores =====
+            ,ROUND(AVG(avg_sortScore), 4) AS avg_sortScore
+            ,ROUND(AVG(avg_rovScore), 4) AS avg_rovScore
+            -- ===== feature means =====
+            ,ROUND(AVG(b1_exp_168h), 0) AS b1_exp_168h
+            ,ROUND(AVG(b1_rovn_1h), 6) AS b1_rovn_1h
+            ,ROUND(AVG(b1_rovn_24h), 6) AS b1_rovn_24h
+            ,ROUND(AVG(b1_rovn_168h), 6) AS b1_rovn_168h
+            ,ROUND(AVG(b1_str_168h), 6) AS b1_str_168h
+            ,ROUND(AVG(b2_rovn_168h), 6) AS b2_rovn_168h
+            ,ROUND(AVG(b2_exp_168h), 0) AS b2_exp_168h
+            ,ROUND(AVG(b12_rovn_30d), 6) AS b12_rovn_30d
+            ,ROUND(AVG(b12_exp_30d), 0) AS b12_exp_30d
+            ,ROUND(AVG(avg_c1_rovn_168h), 6) AS avg_c1_rovn_168h
+            ,ROUND(AVG(avg_c1_exp_168h), 0) AS avg_c1_exp_168h
+            ,ROUND(AVG(b8_rovn_168h), 6) AS b8_rovn_168h
+            ,ROUND(AVG(avg_total_time), 0) AS avg_total_time
+            -- ===== original counts (per-suffix averages, left unrounded) =====
+            ,AVG(dau) AS dau
+            ,AVG(exp) AS exp
+            ,AVG(is_share) AS is_share
+            ,AVG(share_cnt) AS share_cnt
+            ,AVG(is_return_1) AS is_return_1
+            ,AVG(return_n_uv) AS return_n_uv
+            ,AVG(viewh24) AS viewh24
+            ,AVG(return_n_uv_noself) AS return_n_uv_noself
+            ,AVG(is_return_noself) AS is_return_noself
+            ,AVG(cn) AS cn
+            ,AVG(c1) AS c1
+            ,AVG(dn) AS dn
+            ,AVG(d1) AS d1
+            -- comma-joined list of the distinct suffixes that fed this group
+            ,WM_CONCAT(DISTINCT ',', suffix) AS suffix
+    FROM    t_bucket
+    GROUP BY dt, apptype, abcode
+)
+-- Final result: one row per (dt, apptype, abcode) from t_metrics, with dau2 attached from t_dau2.
+SELECT  tm.dt
+        ,tm.apptype
+        ,tm.abcode
+        ,tm.suffix
+        -- original business-efficiency metrics
+        ,tm.exp_per_dau
+        ,tm.str_one
+        ,tm.ros_one
+        ,tm.str
+        ,tm.ros
+        ,tm.str_plus
+        ,tm.ros_minus
+        ,tm.bn_rov
+        ,tm.c1_rov
+        ,tm.cn_rov
+        ,tm.d1_rov
+        ,tm.dn_rov
+        ,tm.vovh24
+        -- observed values matching the model targets
+        ,tm.a_strx
+        ,tm.bn_rosx
+        ,tm.bn_rovx
+        -- model predictions
+        ,tm.a_strx_pred
+        ,tm.bn_rosx_pred
+        ,tm.bn_rovx_pred
+        -- COPC
+        ,tm.a_strx_copc
+        ,tm.bn_rosx_copc
+        ,tm.bn_rovx_copc
+        -- model scores
+        ,tm.avg_sortScore
+        ,tm.avg_rovScore
+        -- features
+        ,tm.b1_exp_168h
+        ,tm.b1_rovn_1h
+        ,tm.b1_rovn_24h
+        ,tm.b1_rovn_168h
+        ,tm.b1_str_168h
+        ,tm.b2_rovn_168h
+        ,tm.b2_exp_168h
+        ,tm.b12_rovn_30d
+        ,tm.b12_exp_30d
+        ,tm.avg_c1_rovn_168h
+        ,tm.avg_c1_exp_168h
+        ,tm.b8_rovn_168h
+        ,tm.avg_total_time
+        -- counts
+        ,tm.dau
+        ,tm.exp
+        ,tm.is_share
+        ,tm.share_cnt
+        ,tm.is_return_1
+        ,tm.return_n_uv
+        ,tm.viewh24
+        ,tm.return_n_uv_noself
+        ,tm.is_return_noself
+        ,tm.cn
+        ,tm.c1
+        ,tm.dn
+        ,tm.d1
+        ,du.dau2
+FROM    t_metrics tm
+LEFT JOIN t_dau2 du
+ON      tm.dt = du.dt
+AND     tm.apptype = du.apptype
+AND     tm.abcode = du.abcode
+ORDER BY tm.dt DESC, tm.apptype, tm.abcode
+;

BIN
tasks/archive/人群品类曝光分析/.DS_Store


BIN
tasks/archive/品类再分享分析/.DS_Store


BIN
tasks/archive/品类命中分析/.DS_Store


BIN
tasks/archive/渠道效果分析/.DS_Store


BIN
tasks/archive/素材视频内容分析/.DS_Store