14 godzin temu · 02640039bf
--- a/core/src/main/java/com/tzld/videoVector/api/RecommendApiService.java
+++ b/core/src/main/java/com/tzld/videoVector/api/RecommendApiService.java
@@ -0,0 +1,122 @@
 
				+package com.tzld.videoVector.api;
			
 
				+
			
 
				+import com.alibaba.fastjson.JSONArray;
			
 
				+import com.alibaba.fastjson.JSONObject;
			
 
				+import lombok.extern.slf4j.Slf4j;
			
 
				+import okhttp3.*;
			
 
				+import org.springframework.beans.factory.annotation.Value;
			
 
				+import org.springframework.stereotype.Service;
			
 
				+
			
 
				+import javax.annotation.PostConstruct;
			
 
				+import java.io.IOException;
			
 
				+import java.io.UnsupportedEncodingException;
			
 
				+import java.net.URLEncoder;
			
 
				+import java.util.*;
			
 
				+import java.util.concurrent.TimeUnit;
			
 
				+
			
 
				+/**
			
 
				+ * 外部投放系统 API 服务。
			
 
				+ * <p>用于查询各账号下已投放素材/文章列表，辅助匹配结果去重。
			
 
				+ */
			
 
				+@Slf4j
			
 
				+@Service
			
 
				+public class RecommendApiService {
			
 
				+
			
 
				+    private OkHttpClient client;
			
 
				+
			
 
				+    /** 已发送素材查询接口 */
			
 
				+    @Value("${external.api.get-source-ids.url:http://101.37.174.139:80/api/getSourceIdsByAccount}")
			
 
				+    private String getSourceIdsUrl;
			
 
				+
			
 
				+    /** HTTP 超时（秒） */
			
 
				+    @Value("${external.api.timeout:30}")
			
 
				+    private int timeout;
			
 
				+
			
 
				+    @PostConstruct
			
 
				+    public void init() {
			
 
				+        this.client = new OkHttpClient.Builder()
			
 
				+                .connectTimeout(timeout, TimeUnit.SECONDS)
			
 
				+                .readTimeout(timeout, TimeUnit.SECONDS)
			
 
				+                .writeTimeout(timeout, TimeUnit.SECONDS)
			
 
				+                .build();
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * 获取指定账号已发送的素材 ID 集合。
			
 
				+     *
			
 
				+     * @param accountName 账号名称（channelLevel3）
			
 
				+     * @return 已发送素材 ID 集合，为空或失败时返回空集合
			
 
				+     */
			
 
				+    public Set<String> getSentSourceIds(String accountName) {
			
 
				+        if (accountName == null || accountName.isEmpty()) {
			
 
				+            return Collections.emptySet();
			
 
				+        }
			
 
				+
			
 
				+        try {
			
 
				+            String encodedName = URLEncoder.encode(accountName, "UTF-8");
			
 
				+            String url = getSourceIdsUrl + "?accountName=" + encodedName + "&type=9&position=1";
			
 
				+
			
 
				+            Request request = new Request.Builder()
			
 
				+                    .url(url)
			
 
				+                    .get()
			
 
				+                    .build();
			
 
				+
			
 
				+            try (Response response = client.newCall(request).execute()) {
			
 
				+                if (!response.isSuccessful()) {
			
 
				+                    log.error("获取已发送素材失败: accountName={}, HTTP {}, body={}",
			
 
				+                            accountName, response.code(),
			
 
				+                            response.body() != null ? response.body().string() : "");
			
 
				+                    return Collections.emptySet();
			
 
				+                }
			
 
				+
			
 
				+                String body = response.body() != null ? response.body().string() : "";
			
 
				+                JSONObject res = JSONObject.parseObject(body);
			
 
				+                if (res == null || res.getInteger("code") == null || res.getInteger("code") != 0) {
			
 
				+                    log.error("获取已发送素材接口返回异常: accountName={}, response={}", accountName, body);
			
 
				+                    return Collections.emptySet();
			
 
				+                }
			
 
				+
			
 
				+                JSONArray data = res.getJSONArray("data");
			
 
				+                if (data == null || data.isEmpty()) {
			
 
				+                    return Collections.emptySet();
			
 
				+                }
			
 
				+
			
 
				+                Set<String> result = new LinkedHashSet<>(data.size());
			
 
				+                for (int i = 0; i < data.size(); i++) {
			
 
				+                    String id = data.getString(i);
			
 
				+                    if (id != null && !id.isEmpty()) {
			
 
				+                        result.add(id);
			
 
				+                    }
			
 
				+                }
			
 
				+                log.info("获取已发送素材: accountName={}, count={}", accountName, result.size());
			
 
				+                return result;
			
 
				+            }
			
 
				+        } catch (UnsupportedEncodingException e) {
			
 
				+            log.error("URL 编码失败: accountName={}, {}", accountName, e.getMessage());
			
 
				+            return Collections.emptySet();
			
 
				+        } catch (IOException e) {
			
 
				+            log.error("获取已发送素材网络异常: accountName={}, {}", accountName, e.getMessage());
			
 
				+            return Collections.emptySet();
			
 
				+        } catch (Exception e) {
			
 
				+            log.error("获取已发送素材异常: accountName={}, {}", accountName, e.getMessage());
			
 
				+            return Collections.emptySet();
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * 批量获取多个账号的已发送素材 ID 集合。
			
 
				+     *
			
 
				+     * @param accountNames 账号名称列表
			
 
				+     * @return accountName → 已发送素材 ID 集合
			
 
				+     */
			
 
				+    public Map<String, Set<String>> getAllSentSourceIds(Collection<String> accountNames) {
			
 
				+        Map<String, Set<String>> result = new LinkedHashMap<>();
			
 
				+        if (accountNames == null || accountNames.isEmpty()) {
			
 
				+            return result;
			
 
				+        }
			
 
				+        for (String name : accountNames) {
			
 
				+            result.put(name, getSentSourceIds(name));
			
 
				+        }
			
 
				+        return result;
			
 
				+    }
			
 
				+}
			
--- a/core/src/main/java/com/tzld/videoVector/job/VideoArticleMatchJob.java
+++ b/core/src/main/java/com/tzld/videoVector/job/VideoArticleMatchJob.java
@@ -2,6 +2,7 @@ package com.tzld.videoVector.job;
 
				 
			
 
				 import com.alibaba.fastjson.JSON;
			
 
				 import com.ctrip.framework.apollo.spring.annotation.ApolloJsonValue;
			
 
				+import com.tzld.videoVector.api.RecommendApiService;
			
 
				 import com.tzld.videoVector.api.VideoApiService;
			
 
				 import com.tzld.videoVector.common.constant.VectorConstants;
			
 
				 import com.tzld.videoVector.dao.mapper.pgVector.ChannelDemandMatchResultMapper;
			
@@ -69,6 +70,9 @@ public class VideoArticleMatchJob {
 
				     @Resource
			
 
				     private VideoApiService videoApiService;
			
 
				 
			
 
				+    @Resource
			
 
				+    private RecommendApiService recommendApiService;
			
 
				+
			
 
				     @Resource
			
 
				     private RedisUtils redisUtils;
			
 
				 
			
@@ -173,11 +177,18 @@ public class VideoArticleMatchJob {
 
				                 return ReturnT.SUCCESS;
			
 
				             }
			
 
				 
			
 
				+            // 2.1 获取各账号已发送素材 ID（用于过滤已投文章）
			
 
				+            Map<String, Set<String>> sentSourceIds = fetchSentSourceIds(records);
			
 
				+
			
 
				             // 3. 批量获取视频标题
			
 
				             Map<Long, String> videoTitleMap = fetchVideoTitles(records);
			
 
				 
			
 
				-            // 4. 视频标题 → 长文向量召回
			
 
				-            Map<Long, List<ArticleMatchItem>> videoArticleMatches = matchArticlesByTitles(videoTitleMap);
			
 
				+            // 3.1 构建 videoId → channelLevel3 映射（用于匹配时过滤已发送文章）
			
 
				+            Map<Long, String> videoChannelMap = buildVideoChannelMap(records);
			
 
				+
			
 
				+            // 4. 视频标题 → 长文向量召回（过滤已发送文章）
			
 
				+            Map<Long, List<ArticleMatchItem>> videoArticleMatches =
			
 
				+                    matchArticlesByTitles(videoTitleMap, videoChannelMap, sentSourceIds);
			
 
				 
			
 
				             // 5. 1v1 去重配对
			
 
				             Map<Long, ArticleMatchItem> finalPairs = dedupOneToOne(videoArticleMatches);
			
@@ -233,6 +244,30 @@ public class VideoArticleMatchJob {
 
				         return records;
			
 
				     }
			
 
				 
			
 
				+    // =====================================================
			
 
				+    // 步骤 2.1: 获取各账号已发送素材 ID
			
 
				+    // =====================================================
			
 
				+
			
 
				+    /**
				+     * Asks the external API which source IDs each account has already sent, so that
				+     * already-delivered content can be filtered out later.
				+     *
				+     * @param records demand match records; only their channelLevel3 values are used
				+     * @return channelLevel3 → set of already-sent source IDs
				+     */
				+    private Map<String, Set<String>> fetchSentSourceIds(List<ChannelDemandMatchResult> records) {
				+        // Distinct channelLevel3 values that contain text.
				+        Set<String> accounts = records.stream()
				+                .map(ChannelDemandMatchResult::getChannelLevel3)
				+                .filter(StringUtils::hasText)
				+                .collect(Collectors.toSet());
				+
				+        Map<String, Set<String>> sentByAccount = recommendApiService.getAllSentSourceIds(accounts);
				+
				+        // Total number of sent IDs across all accounts, for the summary log only.
				+        int sentTotal = 0;
				+        for (Set<String> ids : sentByAccount.values()) {
				+            sentTotal += ids.size();
				+        }
				+        log.info("获取各账号已发送素材: {} 个账号, 共 {} 条", sentByAccount.size(), sentTotal);
				+        return sentByAccount;
				+    }
			
 
				+
			
 
				     // =====================================================
			
 
				     // 步骤 3: 批量获取视频标题
			
 
				     // =====================================================
			
@@ -309,6 +344,22 @@ public class VideoArticleMatchJob {
 
				         return videoTitleMap;
			
 
				     }
			
 
				 
			
 
				+    // =====================================================
			
 
				+    // 步骤 3.1: 构建 videoId → channelLevel3 映射
			
 
				+    // =====================================================
			
 
				+
			
 
				+    /**
				+     * Builds a videoId → channelLevel3 map from the match records.
				+     *
				+     * <p>A video may appear in several records. The first non-null channelLevel3 seen for a
				+     * video is kept; a null value stored by an earlier record is replaced by a later one.
				+     * Map order follows the first appearance of each videoId.
				+     */
				+    private Map<Long, String> buildVideoChannelMap(List<ChannelDemandMatchResult> records) {
				+        Map<Long, String> channelByVideo = new LinkedHashMap<>();
				+        for (ChannelDemandMatchResult record : records) {
				+            Long videoId = record.getMatchVideoId();
				+            // Only write when the video has no (non-null) channel recorded yet.
				+            if (channelByVideo.get(videoId) == null) {
				+                channelByVideo.put(videoId, record.getChannelLevel3());
				+            }
				+        }
				+        return channelByVideo;
				+    }
			
 
				+
			
 
				     // =====================================================
			
 
				     // 步骤 4: 视频标题 → 长文向量召回
			
 
				     // =====================================================
			
@@ -319,23 +370,31 @@ public class VideoArticleMatchJob {
 
				      * <p>使用线程池并发执行，单条失败不影响整体流程。
			
 
				      * 每个标题使用 configCodes=[ARTICLE_TITLE, ARTICLE_SUMMARY] 进行并行 ANN 查询，
			
 
				      * 结果只保留 modality=ARTICLE 的条目，按 score 降序排列。
			
 
				+     * 召回结果从上往下（按 score 降序）排除该账号已发送的文章，取剩余的第一条。
			
 
				      *
			
 
				-     * @param videoTitleMap videoId → title 映射
			
 
				-     * @return videoId → 文章匹配列表 映射（按 score 降序）
			
 
				+     * @param videoTitleMap  videoId → title 映射
			
 
				+     * @param videoChannelMap videoId → channelLevel3 映射
			
 
				+     * @param sentSourceIds   channelLevel3 → 已发送素材 ID 集合
			
 
				+     * @return videoId → 文章匹配列表 映射（已过滤已发送，按 score 降序）
			
 
				      */
			
 
				-    private Map<Long, List<ArticleMatchItem>> matchArticlesByTitles(Map<Long, String> videoTitleMap) {
			
 
				+    private Map<Long, List<ArticleMatchItem>> matchArticlesByTitles(
			
 
				+            Map<Long, String> videoTitleMap,
			
 
				+            Map<Long, String> videoChannelMap,
			
 
				+            Map<String, Set<String>> sentSourceIds) {
			
 
				         ConcurrentHashMap<Long, List<ArticleMatchItem>> resultMap = new ConcurrentHashMap<>();
			
 
				         RankingSpec ranking = buildRankingSpec();
			
 
				 
			
 
				         int totalVideos = videoTitleMap.size();
			
 
				         AtomicInteger processed = new AtomicInteger(0);
			
 
				         AtomicInteger matchedCount = new AtomicInteger(0);
			
 
				+        AtomicInteger skippedSentCount = new AtomicInteger(0);
			
 
				 
			
 
				         // 构建并发任务
			
 
				         List<CompletableFuture<Void>> futures = new ArrayList<>(totalVideos);
			
 
				         for (Map.Entry<Long, String> entry : videoTitleMap.entrySet()) {
			
 
				             Long videoId = entry.getKey();
			
 
				             String title = entry.getValue();
			
 
				+            String channelLevel3 = videoChannelMap.get(videoId);
			
 
				 
			
 
				             CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
			
 
				                 try {
			
@@ -344,17 +403,24 @@ public class VideoArticleMatchJob {
 
				                     List<ArticleMatchItem> articles = extractArticleItems(recallResult);
			
 
				 
			
 
				                     if (!articles.isEmpty()) {
			
 
				-                        resultMap.put(videoId, articles);
			
 
				-                        matchedCount.incrementAndGet();
			
 
				+                        // 从上往下（score 降序）排除已发送的文章，保留剩余列表
			
 
				+                        List<ArticleMatchItem> filtered = filterSentArticles(articles, channelLevel3, sentSourceIds);
			
 
				+                        int skipped = articles.size() - filtered.size();
			
 
				+                        if (skipped > 0) {
			
 
				+                            skippedSentCount.addAndGet(skipped);
			
 
				+                        }
			
 
				+                        if (!filtered.isEmpty()) {
			
 
				+                            resultMap.put(videoId, filtered);
			
 
				+                            matchedCount.incrementAndGet();
			
 
				+                        }
			
 
				                     }
			
 
				                 } catch (Exception e) {
			
 
				                     log.error("视频 {} (标题: {}) 长文匹配失败: {}", videoId, title, e.getMessage());
			
 
				                 } finally {
			
 
				                     int done = processed.incrementAndGet();
			
 
				-                    // 每 10 条或每 50 条倍数的进度输出一次
			
 
				                     if (done % 10 == 0 || done == totalVideos) {
			
 
				-                        log.info("长文匹配进度: {}/{} 视频已处理, {} 个命中",
			
 
				-                                done, totalVideos, matchedCount.get());
			
 
				+                        log.info("长文匹配进度: {}/{} 视频已处理, {} 个命中, 跳过已发送 {} 个",
			
 
				+                                done, totalVideos, matchedCount.get(), skippedSentCount.get());
			
 
				                     }
			
 
				                 }
			
 
				             }, matchExecutor);
			
@@ -362,21 +428,48 @@ public class VideoArticleMatchJob {
 
				             futures.add(future);
			
 
				         }
			
 
				 
			
 
				-        // 等待所有任务完成（每个 future 内部已 catch 异常，不会失败）
			
 
				+        // 等待所有任务完成
			
 
				         CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
			
 
				 
			
 
				-        // 二次校验：确保所有任务都已执行完毕
			
 
				         int finalProcessed = processed.get();
			
 
				         if (finalProcessed != totalVideos) {
			
 
				             log.warn("长文匹配未完全完成: 预期 {} 个, 实际完成 {} 个", totalVideos, finalProcessed);
			
 
				         }
			
 
				 
			
 
				-        // 转换回 LinkedHashMap 保持顺序
			
 
				         Map<Long, List<ArticleMatchItem>> result = new LinkedHashMap<>(resultMap);
			
 
				-        log.info("长文匹配完成: {}/{} 个视频命中长文", matchedCount.get(), totalVideos);
			
 
				+        log.info("长文匹配完成: {}/{} 个视频命中长文, 跳过已发送 {} 篇",
			
 
				+                matchedCount.get(), totalVideos, skippedSentCount.get());
			
 
				         return result;
			
 
				     }
			
 
				 
			
 
				+    /**
				+     * Removes articles that the video's account has already sent, preserving list order.
				+     *
				+     * <p>When the account is null or has no sent IDs, the input list is returned as-is
				+     * (same instance). Otherwise a new list is built, and items whose articleId is null are
				+     * dropped together with the already-sent ones.
				+     *
				+     * <p>NOTE(review): the sent set holds Strings; confirm that {@code articleId} is a String,
				+     * otherwise {@code contains} can never match.
				+     *
				+     * @param articles       matched articles (presumably already in score-descending order —
				+     *                       this method does not sort)
				+     * @param channelLevel3  account the video belongs to; may be null
				+     * @param sentSourceIds  channelLevel3 → set of already-sent source IDs
				+     * @return the articles not yet sent by this account, in their original order
				+     */
				+    private List<ArticleMatchItem> filterSentArticles(
				+            List<ArticleMatchItem> articles,
				+            String channelLevel3,
				+            Map<String, Set<String>> sentSourceIds) {
				+        Set<String> alreadySent = null;
				+        if (channelLevel3 != null) {
				+            alreadySent = sentSourceIds.get(channelLevel3);
				+        }
				+        // Nothing to filter against: hand back the caller's list untouched.
				+        if (alreadySent == null || alreadySent.isEmpty()) {
				+            return articles;
				+        }
				+
				+        List<ArticleMatchItem> kept = new ArrayList<>(articles.size());
				+        for (ArticleMatchItem candidate : articles) {
				+            boolean excluded = candidate.articleId == null
				+                    || alreadySent.contains(candidate.articleId);
				+            if (!excluded) {
				+                kept.add(candidate);
				+            }
				+        }
				+        return kept;
				+    }
			
 
				+
			
 
				     /**
			
 
				      * 从 batchByText 返回结果中提取 Article 模态的匹配条目。
			
 
				      *
			
@@ -467,7 +560,7 @@ public class VideoArticleMatchJob {
 
				      *
			
 
				      * <p>每对 (video, article) 只产生一条记录（不关联需求维度），
			
 
				      * channelLevel3 / account 取自原始匹配记录。
			
 
				-     * 先清理同日 dt 旧数据（幂等重跑），再批量插入新结果。
			
 
				+     * 先清理同日 dt 旧数据（幂等重跑），再分批插入新结果。
			
 
				      */
			
 
				     private void saveResults(String dt,
			
 
				                              List<ChannelDemandMatchResult> records,