Przeglądaj źródła

账号过滤已发布文章

wangyunpeng 14 godzin temu
rodzic
commit
02640039bf

+ 122 - 0
core/src/main/java/com/tzld/videoVector/api/RecommendApiService.java

@@ -0,0 +1,122 @@
+package com.tzld.videoVector.api;
+
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.*;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+
+import javax.annotation.PostConstruct;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Client for the external delivery system API.
+ * <p>Queries the list of materials/articles already delivered under each account so that
+ * match results can be de-duplicated against them.
+ */
+@Slf4j
+@Service
+public class RecommendApiService {
+
+    private OkHttpClient client;
+
+    /** Endpoint that returns the already-sent source IDs of an account. */
+    @Value("${external.api.get-source-ids.url:http://101.37.174.139:80/api/getSourceIdsByAccount}")
+    private String getSourceIdsUrl;
+
+    /** HTTP timeout in seconds; applied to connect, read and write alike. */
+    @Value("${external.api.timeout:30}")
+    private int timeout;
+
+    /**
+     * Builds the shared OkHttp client using the configured timeout.
+     */
+    @PostConstruct
+    public void init() {
+        this.client = new OkHttpClient.Builder()
+                .connectTimeout(timeout, TimeUnit.SECONDS)
+                .readTimeout(timeout, TimeUnit.SECONDS)
+                .writeTimeout(timeout, TimeUnit.SECONDS)
+                .build();
+    }
+
+    /**
+     * Fetches the set of source IDs already sent by the given account.
+     * <p>NOTE(review): every failure path (non-2xx status, non-zero {@code code}, I/O or
+     * parsing error) returns an empty set, so callers cannot distinguish "nothing sent"
+     * from "lookup failed". A caller that filters on this result therefore fails open.
+     *
+     * @param accountName account name (channelLevel3)
+     * @return the sent source IDs in response order; an empty set when the name is
+     *         null/empty, the account has no data, or the call fails
+     */
+    public Set<String> getSentSourceIds(String accountName) {
+        if (accountName == null || accountName.isEmpty()) {
+            return Collections.emptySet();
+        }
+
+        try {
+            String encodedName = URLEncoder.encode(accountName, "UTF-8");
+            // NOTE(review): type=9 / position=1 are fixed query values; their meaning is
+            // defined by the remote API and is not visible here.
+            String url = getSourceIdsUrl + "?accountName=" + encodedName + "&type=9&position=1";
+
+            Request request = new Request.Builder()
+                    .url(url)
+                    .get()
+                    .build();
+
+            try (Response response = client.newCall(request).execute()) {
+                // The body stream can be consumed only once, so read it up front and
+                // reuse the string for both the error log and JSON parsing.
+                ResponseBody responseBody = response.body();
+                String body = responseBody != null ? responseBody.string() : "";
+
+                if (!response.isSuccessful()) {
+                    log.error("获取已发送素材失败: accountName={}, HTTP {}, body={}",
+                            accountName, response.code(), body);
+                    return Collections.emptySet();
+                }
+
+                JSONObject res = JSONObject.parseObject(body);
+                Integer code = (res != null) ? res.getInteger("code") : null;
+                if (code == null || code != 0) {
+                    log.error("获取已发送素材接口返回异常: accountName={}, response={}", accountName, body);
+                    return Collections.emptySet();
+                }
+
+                JSONArray data = res.getJSONArray("data");
+                if (data == null || data.isEmpty()) {
+                    return Collections.emptySet();
+                }
+
+                // LinkedHashSet keeps the order in which the API returned the IDs.
+                Set<String> result = new LinkedHashSet<>(data.size());
+                for (int i = 0; i < data.size(); i++) {
+                    String id = data.getString(i);
+                    if (id != null && !id.isEmpty()) {
+                        result.add(id);
+                    }
+                }
+                log.info("获取已发送素材: accountName={}, count={}", accountName, result.size());
+                return result;
+            }
+        } catch (UnsupportedEncodingException e) {
+            // Pass the throwable as the last argument so the stack trace is logged.
+            log.error("URL 编码失败: accountName={}", accountName, e);
+            return Collections.emptySet();
+        } catch (IOException e) {
+            log.error("获取已发送素材网络异常: accountName={}", accountName, e);
+            return Collections.emptySet();
+        } catch (Exception e) {
+            // Boundary catch: malformed JSON or an unexpected payload must not abort the caller.
+            log.error("获取已发送素材异常: accountName={}", accountName, e);
+            return Collections.emptySet();
+        }
+    }
+
+    /**
+     * Fetches the sent source IDs for several accounts, one request per distinct name.
+     *
+     * @param accountNames account names to query; may be null or empty
+     * @return accountName → sent source IDs, in first-seen order; empty map when the
+     *         input is null or empty
+     */
+    public Map<String, Set<String>> getAllSentSourceIds(Collection<String> accountNames) {
+        Map<String, Set<String>> result = new LinkedHashMap<>();
+        if (accountNames == null || accountNames.isEmpty()) {
+            return result;
+        }
+        for (String name : accountNames) {
+            // computeIfAbsent avoids a second HTTP round trip for a repeated name.
+            result.computeIfAbsent(name, this::getSentSourceIds);
+        }
+        return result;
+    }
+}

+ 108 - 15
core/src/main/java/com/tzld/videoVector/job/VideoArticleMatchJob.java

@@ -2,6 +2,7 @@ package com.tzld.videoVector.job;
 
 import com.alibaba.fastjson.JSON;
 import com.ctrip.framework.apollo.spring.annotation.ApolloJsonValue;
+import com.tzld.videoVector.api.RecommendApiService;
 import com.tzld.videoVector.api.VideoApiService;
 import com.tzld.videoVector.common.constant.VectorConstants;
 import com.tzld.videoVector.dao.mapper.pgVector.ChannelDemandMatchResultMapper;
@@ -69,6 +70,9 @@ public class VideoArticleMatchJob {
     @Resource
     private VideoApiService videoApiService;
 
+    @Resource
+    private RecommendApiService recommendApiService;
+
     @Resource
     private RedisUtils redisUtils;
 
@@ -173,11 +177,18 @@ public class VideoArticleMatchJob {
                 return ReturnT.SUCCESS;
             }
 
+            // 2.1 获取各账号已发送素材 ID(用于过滤已投文章)
+            Map<String, Set<String>> sentSourceIds = fetchSentSourceIds(records);
+
             // 3. 批量获取视频标题
             Map<Long, String> videoTitleMap = fetchVideoTitles(records);
 
-            // 4. 视频标题 → 长文向量召回
-            Map<Long, List<ArticleMatchItem>> videoArticleMatches = matchArticlesByTitles(videoTitleMap);
+            // 3.1 构建 videoId → channelLevel3 映射(用于匹配时过滤已发送文章)
+            Map<Long, String> videoChannelMap = buildVideoChannelMap(records);
+
+            // 4. 视频标题 → 长文向量召回(过滤已发送文章)
+            Map<Long, List<ArticleMatchItem>> videoArticleMatches =
+                    matchArticlesByTitles(videoTitleMap, videoChannelMap, sentSourceIds);
 
             // 5. 1v1 去重配对
             Map<Long, ArticleMatchItem> finalPairs = dedupOneToOne(videoArticleMatches);
@@ -233,6 +244,30 @@ public class VideoArticleMatchJob {
         return records;
     }
 
+    // =====================================================
+    // 步骤 2.1: 获取各账号已发送素材 ID
+    // =====================================================
+
+    /**
+     * Looks up, through the external API, the source IDs each account has already sent,
+     * so that already-delivered content can be filtered out later.
+     *
+     * @param records demand-match records whose channelLevel3 values name the accounts
+     * @return channelLevel3 → set of already-sent source IDs
+     */
+    private Map<String, Set<String>> fetchSentSourceIds(List<ChannelDemandMatchResult> records) {
+        // Distinct, non-blank channelLevel3 values are the account names to query.
+        Set<String> accounts = records.stream()
+                .map(r -> r.getChannelLevel3())
+                .filter(name -> StringUtils.hasText(name))
+                .collect(Collectors.toSet());
+
+        Map<String, Set<String>> sentByAccount = recommendApiService.getAllSentSourceIds(accounts);
+
+        // Total number of sent IDs across all accounts, for the summary log only.
+        int sentTotal = 0;
+        for (Set<String> ids : sentByAccount.values()) {
+            sentTotal += ids.size();
+        }
+        log.info("获取各账号已发送素材: {} 个账号, 共 {} 条", sentByAccount.size(), sentTotal);
+        return sentByAccount;
+    }
+
     // =====================================================
     // 步骤 3: 批量获取视频标题
     // =====================================================
@@ -309,6 +344,22 @@ public class VideoArticleMatchJob {
         return videoTitleMap;
     }
 
+    // =====================================================
+    // 步骤 3.1: 构建 videoId → channelLevel3 映射
+    // =====================================================
+
+    /**
+     * Derives a videoId → channelLevel3 mapping from the match records.
+     * <p>A video may appear in several records; the first record wins, except that
+     * {@code putIfAbsent} treats a null mapping as absent, so a null channelLevel3 can
+     * still be replaced by a later record for the same video.
+     * <p>NOTE(review): only one account is kept per video, so a video matched under
+     * several accounts is filtered against just that one account's sent set — confirm
+     * this is intended.
+     *
+     * @param records demand-match records
+     * @return videoId → channelLevel3, in first-seen order of the video IDs
+     */
+    private Map<Long, String> buildVideoChannelMap(List<ChannelDemandMatchResult> records) {
+        Map<Long, String> channelByVideo = new LinkedHashMap<>();
+        records.forEach(rec ->
+                channelByVideo.putIfAbsent(rec.getMatchVideoId(), rec.getChannelLevel3()));
+        return channelByVideo;
+    }
+
     // =====================================================
     // 步骤 4: 视频标题 → 长文向量召回
     // =====================================================
@@ -319,23 +370,31 @@ public class VideoArticleMatchJob {
      * <p>使用线程池并发执行,单条失败不影响整体流程。
      * 每个标题使用 configCodes=[ARTICLE_TITLE, ARTICLE_SUMMARY] 进行并行 ANN 查询,
      * 结果只保留 modality=ARTICLE 的条目,按 score 降序排列。
+     * 召回结果从上往下(按 score 降序)排除该账号已发送的文章,取剩余的第一条。
      *
-     * @param videoTitleMap videoId → title 映射
-     * @return videoId → 文章匹配列表 映射(按 score 降序)
+     * @param videoTitleMap  videoId → title 映射
+     * @param videoChannelMap videoId → channelLevel3 映射
+     * @param sentSourceIds   channelLevel3 → 已发送素材 ID 集合
+     * @return videoId → 文章匹配列表 映射(已过滤已发送,按 score 降序)
      */
-    private Map<Long, List<ArticleMatchItem>> matchArticlesByTitles(Map<Long, String> videoTitleMap) {
+    private Map<Long, List<ArticleMatchItem>> matchArticlesByTitles(
+            Map<Long, String> videoTitleMap,
+            Map<Long, String> videoChannelMap,
+            Map<String, Set<String>> sentSourceIds) {
         ConcurrentHashMap<Long, List<ArticleMatchItem>> resultMap = new ConcurrentHashMap<>();
         RankingSpec ranking = buildRankingSpec();
 
         int totalVideos = videoTitleMap.size();
         AtomicInteger processed = new AtomicInteger(0);
         AtomicInteger matchedCount = new AtomicInteger(0);
+        AtomicInteger skippedSentCount = new AtomicInteger(0);
 
         // 构建并发任务
         List<CompletableFuture<Void>> futures = new ArrayList<>(totalVideos);
         for (Map.Entry<Long, String> entry : videoTitleMap.entrySet()) {
             Long videoId = entry.getKey();
             String title = entry.getValue();
+            String channelLevel3 = videoChannelMap.get(videoId);
 
             CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
                 try {
@@ -344,17 +403,24 @@ public class VideoArticleMatchJob {
                     List<ArticleMatchItem> articles = extractArticleItems(recallResult);
 
                     if (!articles.isEmpty()) {
-                        resultMap.put(videoId, articles);
-                        matchedCount.incrementAndGet();
+                        // 从上往下(score 降序)排除已发送的文章,保留剩余列表
+                        List<ArticleMatchItem> filtered = filterSentArticles(articles, channelLevel3, sentSourceIds);
+                        int skipped = articles.size() - filtered.size();
+                        if (skipped > 0) {
+                            skippedSentCount.addAndGet(skipped);
+                        }
+                        if (!filtered.isEmpty()) {
+                            resultMap.put(videoId, filtered);
+                            matchedCount.incrementAndGet();
+                        }
                     }
                 } catch (Exception e) {
                     log.error("视频 {} (标题: {}) 长文匹配失败: {}", videoId, title, e.getMessage());
                 } finally {
                     int done = processed.incrementAndGet();
-                    // 每 10 条或每 50 条倍数的进度输出一次
                     if (done % 10 == 0 || done == totalVideos) {
-                        log.info("长文匹配进度: {}/{} 视频已处理, {} 个命中",
-                                done, totalVideos, matchedCount.get());
+                        log.info("长文匹配进度: {}/{} 视频已处理, {} 个命中, 跳过已发送 {} 个",
+                                done, totalVideos, matchedCount.get(), skippedSentCount.get());
                     }
                 }
             }, matchExecutor);
@@ -362,21 +428,48 @@ public class VideoArticleMatchJob {
             futures.add(future);
         }
 
-        // 等待所有任务完成(每个 future 内部已 catch 异常,不会失败)
+        // 等待所有任务完成
         CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
 
-        // 二次校验:确保所有任务都已执行完毕
         int finalProcessed = processed.get();
         if (finalProcessed != totalVideos) {
             log.warn("长文匹配未完全完成: 预期 {} 个, 实际完成 {} 个", totalVideos, finalProcessed);
         }
 
-        // 转换回 LinkedHashMap 保持顺序
         Map<Long, List<ArticleMatchItem>> result = new LinkedHashMap<>(resultMap);
-        log.info("长文匹配完成: {}/{} 个视频命中长文", matchedCount.get(), totalVideos);
+        log.info("长文匹配完成: {}/{} 个视频命中长文, 跳过已发送 {} 篇",
+                matchedCount.get(), totalVideos, skippedSentCount.get());
         return result;
     }
 
+    /**
+     * Removes articles the account has already sent, preserving the incoming order.
+     * <p>The caller is expected to pass the list sorted by score descending, so the
+     * first remaining element is the best not-yet-sent match.
+     * <p>NOTE(review): when the account has a non-empty sent set, items whose
+     * {@code articleId} is null are dropped as well; when the sent set is missing or
+     * empty the input list is returned untouched, null IDs included. Confirm this
+     * asymmetry is intended.
+     *
+     * @param articles       candidate articles (expected score-descending)
+     * @param channelLevel3  account the video belongs to; may be null
+     * @param sentSourceIds  channelLevel3 → already-sent source IDs
+     * @return the same list instance when nothing can be filtered, otherwise a new
+     *         list without the sent (or ID-less) articles
+     */
+    private List<ArticleMatchItem> filterSentArticles(
+            List<ArticleMatchItem> articles,
+            String channelLevel3,
+            Map<String, Set<String>> sentSourceIds) {
+        Set<String> alreadySent = (channelLevel3 == null) ? null : sentSourceIds.get(channelLevel3);
+        boolean nothingToFilter = (alreadySent == null) || alreadySent.isEmpty();
+        if (nothingToFilter) {
+            return articles;
+        }
+
+        return articles.stream()
+                .filter(candidate -> candidate.articleId != null
+                        && !alreadySent.contains(candidate.articleId))
+                .collect(Collectors.toList());
+    }
+
     /**
      * 从 batchByText 返回结果中提取 Article 模态的匹配条目。
      *
@@ -467,7 +560,7 @@ public class VideoArticleMatchJob {
      *
      * <p>每对 (video, article) 只产生一条记录(不关联需求维度),
      * channelLevel3 / account 取自原始匹配记录。
-     * 先清理同日 dt 旧数据(幂等重跑),再批插入新结果。
+     * 先清理同日 dt 旧数据(幂等重跑),再批插入新结果。
      */
     private void saveResults(String dt,
                              List<ChannelDemandMatchResult> records,