Просмотр исходного кода

需求接入先验场景需求

wangyunpeng 1 день назад
Родитель
Сommit
b7e7cf3ae1

+ 158 - 0
core/src/main/java/com/tzld/videoVector/job/ChannelDemandMatchJob.java

@@ -13,9 +13,11 @@ import com.tzld.videoVector.model.po.pgVector.ChannelDemandMatchResultExample;
 import com.tzld.videoVector.model.vo.RecallVideoScoreVO;
 import com.tzld.videoVector.service.VideoSearchService;
 import com.tzld.videoVector.util.OdpsUtil;
+import com.tzld.videoVector.util.VectorUtils;
 import com.xxl.job.core.biz.model.ReturnT;
 import com.xxl.job.core.handler.annotation.XxlJob;
 import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Component;
 import org.springframework.util.CollectionUtils;
 import org.springframework.util.StringUtils;
@@ -58,6 +60,23 @@ public class ChannelDemandMatchJob {
      */
     private static final int MATCH_THREAD_POOL_SIZE = 5;
 
+    /**
+     * 先验需求-场景 数据源策略标识
+     */
+    private static final String DIMENSION_STAT_STRATEGY = "先验需求-场景";
+
+    /**
+     * 先验需求-场景 数据源的ROV阈值 (默认3%)
+     */
+    @Value("${channel.demand.dimension-stat.min-rov:0.03}")
+    private double dimensionStatMinRov;
+
+    /**
+     * 先验需求-场景 数据源的UV占比阈值 (默认0.2%)
+     */
+    @Value("${channel.demand.dimension-stat.min-uv-ratio:0.002}")
+    private double dimensionStatMinUvRatio;
+
     /**
      * 多渠道配置并发执行线程池
      */
@@ -144,6 +163,12 @@ public class ChannelDemandMatchJob {
         // 1. 先清理该渠道+日期的历史数据(支持重跑)
         deleteExistingResults(config.getId(), dt);
 
+        // 如果是先验需求-场景数据源,走独立的处理逻辑(不需要向量召回)
+        if (DIMENSION_STAT_STRATEGY.equals(config.getDemandStrategy())) {
+            processDimensionStatSource(config, dt, totalDemands, totalMatched, totalFailed);
+            return;
+        }
+
         // 2. 构造ODPS SQL并查询需求数据
         String sql = buildDemandSql(config, dt, minUv, minRov);
         log.info("查询ODPS需求, 渠道: {}, sql长度: {}", channelName, sql.length());
@@ -396,6 +421,139 @@ public class ChannelDemandMatchJob {
         return StringUtils.hasText(value) && !"-".equals(value.trim());
     }
 
+    /**
+     * 处理先验需求-场景数据源:从dwd_channel_element_dimension_stat查询,直接入库(不需向量召回)
+     */
+    private void processDimensionStatSource(ChannelDemandMatchConfig config, String dt,
+                                            AtomicInteger totalDemands, AtomicInteger totalMatched, AtomicInteger totalFailed) {
+        log.info("开始处理先验需求-场景数据源, configId={}, dt={}", config.getId(), dt);
+
+        String sql = buildDimensionStatSql(config, dt);
+        log.info("先验需求-场景ODPS SQL长度: {}", sql.length());
+
+        List<ChannelDemandMatchResult> results = new ArrayList<>();
+
+        OdpsUtil.getOdpsDataStream(sql, record -> {
+            try {
+                ChannelDemandMatchResult result = new ChannelDemandMatchResult();
+                result.setConfigId(config.getId());
+                result.setDt(dt);
+                result.setDemandStrategy(DIMENSION_STAT_STRATEGY);
+
+                // 需求维度字段
+                result.setChannelName(record.getString("渠道类"));
+                result.setCrowdSegment(record.getString("人群细分"));
+                result.setChannelLevel3(record.getString("三级渠道"));
+                result.setDimension(record.getString("维度"));
+                result.setStandardElement(record.getString("标准化元素"));
+                result.setCategoryName(record.getString("分类名称"));
+
+                // 统计指标(总访问uv/总uv占比 被 访问uv/uv占比 覆盖,最终取行级指标)
+                result.setVisitUv(safeGetLong(record, "访问uv"));
+                result.setUvRatio(safeGetDouble(record, "uv占比"));
+                result.setTotalRov(safeGetDouble(record, "总rov"));
+                result.setMatchRov(safeGetDouble(record, "rov"));
+
+                // 计算综合评分:sim默认1,rov取totalRov
+                double rov = result.getMatchRov() != null ? result.getMatchRov() : 0.0;
+                result.setMatchScore(VectorUtils.calculateScore(1.0, rov));
+                result.setMatchSim(1.0);
+                result.setMatchRov(rov);
+
+                // 视频信息:该表已有videoid,直接作为匹配结果
+                String videoIdStr = record.getString("videoid");
+                if (StringUtils.hasText(videoIdStr)) {
+                    try {
+                        result.setMatchVideoId(Long.parseLong(videoIdStr.trim()));
+                    } catch (NumberFormatException e) {
+                        log.warn("先验需求-场景 videoid解析失败: {}", videoIdStr);
+                        return;
+                    }
+                } else {
+                    return; // 无videoid则跳过
+                }
+
+                // 标题同时映射到matchText和demandContentTitle
+                String title = record.getString("标题");
+                result.setMatchText(title);
+                result.setDemandContentTitle(title);
+
+                result.setMatchConfigCode("DIMENSION_STAT");
+                result.setMatchStatus((short) 1); // 已匹配(无需向量召回)
+
+                // 生成确定性实验ID
+                result.setExperimentId(generateExperimentId(result, result.getMatchVideoId(), result.getMatchConfigCode()));
+
+                synchronized (results) {
+                    results.add(result);
+                }
+            } catch (Exception e) {
+                log.error("解析先验需求-场景ODPS记录失败: {}", e.getMessage());
+            }
+        });
+
+        log.info("先验需求-场景数据源查询到 {} 条记录", results.size());
+        totalDemands.addAndGet(results.size());
+
+        if (results.isEmpty()) {
+            return;
+        }
+
+        // 批量写入
+        for (List<ChannelDemandMatchResult> partition : Lists.partition(results, 1000)) {
+            resultMapperExt.batchInsert(partition);
+        }
+        totalMatched.addAndGet(results.size());
+        log.info("先验需求-场景数据源写入完成, 共 {} 条", results.size());
+    }
+
+    /**
+     * 构造先验需求-场景数据源的ODPS SQL
+     * 查询loghubods.dwd_channel_element_dimension_stat,过滤rov>3%且uv占比>0.2%
+     */
+    private String buildDimensionStatSql(ChannelDemandMatchConfig config, String dt) {
+        StringBuilder sb = new StringBuilder();
+        sb.append("SELECT DISTINCT dt");
+        sb.append(",渠道类");
+        sb.append(",人群细分");
+        sb.append(",三级渠道");
+        sb.append(",维度");
+        sb.append(",标准化元素");
+        sb.append(",分类名称");
+        sb.append(",总访问uv");
+        sb.append(",总uv占比");
+        sb.append(",总str");
+        sb.append(",总rov");
+        sb.append(",videoid");
+        sb.append(",标题");
+        sb.append(",`merge二级品类`");
+        sb.append(",访问uv");
+        sb.append(",访问pv");
+        sb.append(",单层分享pv");
+        sb.append(",拉回uv");
+        sb.append(",uv占比");
+        sb.append(",str");
+        sb.append(",rov");
+        sb.append(",全局分发pv");
+        sb.append(",全局分发回流uv");
+        sb.append(",全局总回流uv");
+        sb.append(",全局rov");
+        sb.append(",rank");
+        sb.append(" FROM loghubods.dwd_channel_element_dimension_stat");
+        sb.append(" WHERE dt = '").append(dt).append("'");
+        // 过滤条件:rov > 3% 且 uv占比 > 0.2%
+        sb.append(" AND rov > ").append(dimensionStatMinRov);
+        sb.append(" AND uv占比 > ").append(dimensionStatMinUvRatio);
+        sb.append(" AND `merge二级品类` not in ('早中晚好','节日祝福') ");
+        // 渠道筛选
+        if (StringUtils.hasText(config.getChannelName())) {
+            sb.append(" AND 渠道类 = '").append(config.getChannelName().replace("'", "''")).append("'");
+        }
+        sb.append(" ORDER BY 总rov DESC,全局rov DESC");
+        sb.append(";");
+        return sb.toString();
+    }
+
     /**
      * 构造ODPS查询需求SQL(返回所有字段)
      */

+ 14 - 26
core/src/main/java/com/tzld/videoVector/service/impl/VideoSearchServiceImpl.java

@@ -1380,25 +1380,19 @@ public class VideoSearchServiceImpl implements VideoSearchService {
 
     // ========================== 召回综合评分 ==========================
 
-    /** 默认评分参数 */
-    private static final double DEFAULT_ALPHA = 0.6;
-    private static final double DEFAULT_SIM_MIN = 0.7;
-    private static final double DEFAULT_SIM_MAX = 1.0;
-    private static final double DEFAULT_ROV_P5 = 0.005;
-    private static final double DEFAULT_ROV_P95 = 0.1;
-
     @Override
     public RecallVideoScoreVO recallWithScore(RecallVideoScoreParam param) {
         if (param == null) {
             log.error("recallWithScore 参数为空");
-            return emptyScoreResult(DEFAULT_ALPHA, DEFAULT_ROV_P95, DEFAULT_ROV_P5, DEFAULT_SIM_MIN);
+            return emptyScoreResult(VectorUtils.DEFAULT_ALPHA, VectorUtils.DEFAULT_ROV_P95,
+                    VectorUtils.DEFAULT_ROV_P5, VectorUtils.DEFAULT_SIM_MIN);
         }
 
         // 解析评分参数,使用默认值
-        double alpha = param.getAlpha() != null ? param.getAlpha() : DEFAULT_ALPHA;
-        double rovP95 = param.getRovP95() != null ? param.getRovP95() : DEFAULT_ROV_P95;
-        double rovP5 = param.getRovP5() != null ? param.getRovP5() : DEFAULT_ROV_P5;
-        double simMin = param.getSimMin() != null ? param.getSimMin() : DEFAULT_SIM_MIN;
+        double alpha = param.getAlpha() != null ? param.getAlpha() : VectorUtils.DEFAULT_ALPHA;
+        double rovP95 = param.getRovP95() != null ? param.getRovP95() : VectorUtils.DEFAULT_ROV_P95;
+        double rovP5 = param.getRovP5() != null ? param.getRovP5() : VectorUtils.DEFAULT_ROV_P5;
+        double simMin = param.getSimMin() != null ? param.getSimMin() : VectorUtils.DEFAULT_SIM_MIN;
 
         // 1. 复用现有 matchTopNVideo 进行向量召回
         MatchTopNVideoParam matchParam = new MatchTopNVideoParam();
@@ -1435,23 +1429,17 @@ public class VideoSearchServiceImpl implements VideoSearchService {
             double rov = extractRov(match.getVideoDetail());
             item.setRov(rov);
 
-            // sim 归一化: sim_norm = (sim - simMin) / (simMax - simMin)
-            double simNorm = (DEFAULT_SIM_MAX - simMin) > 0
-                    ? (sim - simMin) / (DEFAULT_SIM_MAX - simMin)
-                    : 0.0;
-            simNorm = Math.max(0.0, Math.min(1.0, simNorm));
+            // sim 归一化
+            double simNorm = VectorUtils.normalize(sim, simMin, VectorUtils.DEFAULT_SIM_MAX);
             item.setSimNorm(round(simNorm));
 
-            // rov 归一化: rov_norm = clip((rov - rovP5) / (rovP95 - rovP5), 0, 1)
-            double rovNorm = (rovP95 - rovP5) > 0
-                    ? (rov - rovP5) / (rovP95 - rovP5)
-                    : 0.0;
-            rovNorm = Math.max(0.0, Math.min(1.0, rovNorm));
+            // rov 归一化
+            double rovNorm = VectorUtils.normalize(rov, rovP5, rovP95);
             item.setRovNorm(round(rovNorm));
 
             // 综合得分: score = alpha * sim_norm + (1-alpha) * rov_norm
-            double score = alpha * simNorm + (1 - alpha) * rovNorm;
-            item.setScore(round(score));
+            double score = VectorUtils.calculateScore(sim, rov, alpha, simMin, VectorUtils.DEFAULT_SIM_MAX, rovP5, rovP95);
+            item.setScore(score);
 
             scoredItems.add(item);
         }
@@ -1472,7 +1460,7 @@ public class VideoSearchServiceImpl implements VideoSearchService {
         scoreParams.setRovP95(rovP95);
         scoreParams.setRovP5(rovP5);
         scoreParams.setSimMin(simMin);
-        scoreParams.setSimMax(DEFAULT_SIM_MAX);
+        scoreParams.setSimMax(VectorUtils.DEFAULT_SIM_MAX);
         vo.setScoreParams(scoreParams);
 
         return vo;
@@ -1520,7 +1508,7 @@ public class VideoSearchServiceImpl implements VideoSearchService {
         scoreParams.setRovP95(rovP95);
         scoreParams.setRovP5(rovP5);
         scoreParams.setSimMin(simMin);
-        scoreParams.setSimMax(DEFAULT_SIM_MAX);
+        scoreParams.setSimMax(VectorUtils.DEFAULT_SIM_MAX);
         vo.setScoreParams(scoreParams);
         return vo;
     }

+ 64 - 0
core/src/main/java/com/tzld/videoVector/util/VectorUtils.java

@@ -18,6 +18,70 @@ public final class VectorUtils {
     private VectorUtils() {
     }
 
+    // ========================== 综合评分计算 ==========================
+
+    /** 默认权重系数 alpha */
+    public static final double DEFAULT_ALPHA = 0.6;
+    /** 默认相似度下限 */
+    public static final double DEFAULT_SIM_MIN = 0.7;
+    /** 默认相似度上限 */
+    public static final double DEFAULT_SIM_MAX = 1.0;
+    /** 默认 ROV P5 */
+    public static final double DEFAULT_ROV_P5 = 0.005;
+    /** 默认 ROV P95 */
+    public static final double DEFAULT_ROV_P95 = 0.1;
+
+    /**
+     * 计算综合评分 score = alpha * simNorm + (1-alpha) * rovNorm
+     * simNorm = clip((sim - simMin) / (simMax - simMin), 0, 1)
+     * rovNorm = clip((rov - rovP5) / (rovP95 - rovP5), 0, 1)
+     *
+     * @param sim    相似度值
+     * @param rov    ROV值
+     * @param alpha  权重系数
+     * @param simMin 相似度归一化下限
+     * @param simMax 相似度归一化上限
+     * @param rovP5  ROV归一化下限
+     * @param rovP95 ROV归一化上限
+     * @return 综合评分(保留4位小数)
+     */
+    public static double calculateScore(double sim, double rov, double alpha,
+                                         double simMin, double simMax,
+                                         double rovP5, double rovP95) {
+        double simNorm = normalize(sim, simMin, simMax);
+        double rovNorm = normalize(rov, rovP5, rovP95);
+
+        // 综合得分
+        double score = alpha * simNorm + (1 - alpha) * rovNorm;
+        return Math.round(score * 10000.0) / 10000.0;
+    }
+
+    /**
+     * 使用默认参数计算综合评分
+     *
+     * @param sim 相似度值
+     * @param rov ROV值
+     * @return 综合评分(保留4位小数)
+     */
+    public static double calculateScore(double sim, double rov) {
+        return calculateScore(sim, rov, DEFAULT_ALPHA, DEFAULT_SIM_MIN, DEFAULT_SIM_MAX, DEFAULT_ROV_P5, DEFAULT_ROV_P95);
+    }
+
+    /**
+     * 通用归一化方法: clip((value - min) / (max - min), 0, 1)
+     *
+     * @param value 原始值
+     * @param min   归一化下限
+     * @param max   归一化上限
+     * @return 归一化后的值,范围 [0, 1]
+     */
+    public static double normalize(double value, double min, double max) {
+        double norm = (max - min) > 0
+                ? (value - min) / (max - min)
+                : 0.0;
+        return Math.max(0.0, Math.min(1.0, norm));
+    }
+
     // ========================== 向量字符串解析 ==========================
 
     /**

+ 2 - 2
core/src/main/resources/mapper/pgVector/ext/ChannelDemandMatchResultMapperExt.xml

@@ -13,7 +13,7 @@
       partner, account, scene_value,
       demand_strategy, drive_dimension_time, demand_filter_sort_strategy,
       demand_type, demand_content_id, demand_content_title,
-      demand_content_topic, uv_ratio, experiment_id)
+      demand_content_topic, uv_ratio, experiment_id, channel_level3)
     values
     <foreach collection="list" item="item" separator=",">
       (#{item.configId,jdbcType=BIGINT}, #{item.dt,jdbcType=VARCHAR}, #{item.channelName,jdbcType=VARCHAR},
@@ -27,7 +27,7 @@
       #{item.partner,jdbcType=VARCHAR}, #{item.account,jdbcType=VARCHAR}, #{item.sceneValue,jdbcType=VARCHAR},
       #{item.demandStrategy,jdbcType=VARCHAR}, #{item.driveDimensionTime,jdbcType=VARCHAR}, #{item.demandFilterSortStrategy,jdbcType=VARCHAR},
       #{item.demandType,jdbcType=VARCHAR}, #{item.demandContentId,jdbcType=VARCHAR}, #{item.demandContentTitle,jdbcType=VARCHAR},
-      #{item.demandContentTopic,jdbcType=VARCHAR}, #{item.uvRatio,jdbcType=DOUBLE}, #{item.experimentId,jdbcType=VARCHAR})
+      #{item.demandContentTopic,jdbcType=VARCHAR}, #{item.uvRatio,jdbcType=DOUBLE}, #{item.experimentId,jdbcType=VARCHAR}, #{item.channelLevel3,jdbcType=VARCHAR})
     </foreach>
   </insert>
 </mapper>