Sfoglia il codice sorgente

召回内容历史表现增加缓存

wangyunpeng 7 mesi fa
parent
commit
b2af41e850

+ 10 - 4
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/model/entity/longArticle/ArticleTitleHisCache.java

@@ -16,15 +16,21 @@ import java.io.Serializable;
 public class ArticleTitleHisCache implements Serializable {
 
     @Id
-    @Column(name = "title_md5")
-    private String titleMd5;
+    @Column(name = "source_id")
+    private String sourceId;
     @Id
     @Column(name = "type")
     private String type;
     @Column(name = "title")
     private String title;
+    @Column(name = "title_md5")
+    private String titleMd5;
     @Column(name = "crawler_title")
     private String crawlerTitle;
+    @Column(name = "channel_content_id")
+    private String channelContentId;
+    @Column(name = "root_publish_timestamp")
+    private Long rootPublishTimestamp;
     @Column(name = "category")
     private String category;
     @Column(name = "his_publish_article_list")
@@ -38,8 +44,8 @@ public class ArticleTitleHisCache implements Serializable {
     @Data
     public static class PK implements Serializable {
 
-        @Column(name = "title_md5")
-        private String titleMd5;
+        @Column(name = "source_id")
+        private String sourceId;
         @Column(name = "type")
         private String type;
 

+ 3 - 0
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/model/param/TitleHisCacheParam.java

@@ -6,8 +6,11 @@ import java.util.List;
 
 @Data
 public class TitleHisCacheParam { // per-article input passed (as a list) to getArticleTitleHisCacheMap
+
+    private String sourceId; // key under which RecallService stores this article's computed result
     private String title; // key RecallService uses to look up the article's historical publications
     private String crawlerTitle; // copied from the content / cache row's crawler title
+    private String crawlerChannelContentId; // used to find the promotion source and the cold-start category
     private String titleMd5; // last-resort key for the category lookup
     private List<String> category; // first element is checked against each history account's category weights
 }

+ 1 - 1
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/repository/longArticle/ArticleTitleHisCacheRepository.java

@@ -9,5 +9,5 @@ import java.util.List;
 @Repository
 public interface ArticleTitleHisCacheRepository extends JpaRepository<ArticleTitleHisCache, ArticleTitleHisCache.PK> {
     // Derived finder: rows whose sourceId is in the given list and whose type matches; RecallService calls it in batches of at most 1000 ids.
-    List<ArticleTitleHisCache> getByTitleMd5InAndType(List<String> titleMd5List, String type);
+    List<ArticleTitleHisCache> getBySourceIdInAndType(List<String> sourceIdList, String type);
 }

+ 44 - 27
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/service/XxlJobService.java

@@ -46,6 +46,8 @@ import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.collections4.CollectionUtils;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.dao.DuplicateKeyException;
+import org.springframework.data.domain.Page;
+import org.springframework.data.domain.PageRequest;
 import org.springframework.stereotype.Service;
 import org.springframework.util.StringUtils;
 
@@ -565,33 +567,48 @@ public class XxlJobService {
     @XxlJob("refreshArticleHisCache")
     public ReturnT<String> refreshArticleHisCache(String param) {
         // 刷新历史表现缓存
-        List<ArticleTitleHisCache> cacheList = articleTitleHisCacheRepository.findAll();
-        Map<String, Map<String, ArticleTitleHisCache>> cacheMap = cacheList.stream().collect(
-                Collectors.groupingBy(ArticleTitleHisCache::getType,
-                        Collectors.toMap(ArticleTitleHisCache::getTitleMd5, Function.identity())));
-        for (Map.Entry<String, Map<String, ArticleTitleHisCache>> typeEntry : cacheMap.entrySet()) {
-            String type = typeEntry.getKey();
-            Map<String, ArticleTitleHisCache> titleMap = typeEntry.getValue();
-            Set<String> titleMd5List = titleMap.keySet();
-            List<TitleHisCacheParam> paramList = titleMd5List.stream().map(titleMd5 -> {
-                ArticleTitleHisCache cache = cacheMap.get(type).get(titleMd5);
-                TitleHisCacheParam cacheParam = new TitleHisCacheParam();
-                cacheParam.setTitleMd5(titleMd5);
-                cacheParam.setTitle(cache.getTitle());
-                cacheParam.setCrawlerTitle(cache.getCrawlerTitle());
-                if (StringUtils.hasText(cache.getCategory())) {
-                    cacheParam.setCategory(JSONArray.parseArray(cache.getCategory(), String.class));
-                }
-                return cacheParam;
-            }).collect(Collectors.toList());
-            Map<String, Content> hisCacheMap = recallService.getArticleTitleHisCacheMap(paramList, type);
-            for (String titleMd5 : titleMd5List) {
-                Content content = hisCacheMap.get(titleMd5);
-                if (Objects.nonNull(content) && CollectionUtils.isNotEmpty(content.getHisPublishArticleList())) {
-                    ArticleTitleHisCache cache = titleMap.get(titleMd5);
-                    cache.setHisPublishArticleList(JSONObject.toJSONString(content.getHisPublishArticleList()));
-                    cache.setUpdateTimestamp(System.currentTimeMillis());
-                    articleTitleHisCacheRepository.save(cache);
+        long count = articleTitleHisCacheRepository.count();
+        int pageSize = 100;
+        long page = (count / pageSize) + 1;
+        for (int i = 0; i < page; i++) {
+            Page<ArticleTitleHisCache> articleTitleHisCachePage = articleTitleHisCacheRepository.findAll(
+                    PageRequest.of(i, pageSize));
+            List<ArticleTitleHisCache> cacheList = articleTitleHisCachePage.getContent();
+            if (CollectionUtils.isEmpty(cacheList)) {
+                continue;
+            }
+            Map<String, Map<String, ArticleTitleHisCache>> cacheMap = cacheList.stream().collect(
+                    Collectors.groupingBy(ArticleTitleHisCache::getType,
+                            Collectors.toMap(ArticleTitleHisCache::getSourceId, Function.identity())));
+            for (Map.Entry<String, Map<String, ArticleTitleHisCache>> typeEntry : cacheMap.entrySet()) {
+                String type = typeEntry.getKey();
+                Map<String, ArticleTitleHisCache> sourceMap = typeEntry.getValue();
+                Set<String> sourceIdList = sourceMap.keySet();
+                List<TitleHisCacheParam> paramList = sourceIdList.stream().map(sourceId -> {
+                    ArticleTitleHisCache cache = cacheMap.get(type).get(sourceId);
+                    TitleHisCacheParam cacheParam = new TitleHisCacheParam();
+                    cacheParam.setSourceId(sourceId);
+                    cacheParam.setTitleMd5(cache.getTitleMd5());
+                    cacheParam.setTitle(cache.getTitle());
+                    cacheParam.setCrawlerTitle(cache.getCrawlerTitle());
+                    cacheParam.setCrawlerChannelContentId(cache.getChannelContentId());
+                    if (StringUtils.hasText(cache.getCategory())) {
+                        cacheParam.setCategory(JSONArray.parseArray(cache.getCategory(), String.class));
+                    }
+                    return cacheParam;
+                }).collect(Collectors.toList());
+                Map<String, Content> hisCacheMap = recallService.getArticleTitleHisCacheMap(paramList, type);
+                for (String sourceId : sourceIdList) {
+                    Content content = hisCacheMap.get(sourceId);
+                    if (Objects.nonNull(content) && CollectionUtils.isNotEmpty(content.getHisPublishArticleList())) {
+                        ArticleTitleHisCache cache = sourceMap.get(sourceId);
+                        cache.setHisPublishArticleList(JSONObject.toJSONString(content.getHisPublishArticleList()));
+                        if (CollectionUtil.isNotEmpty(content.getCategory())) {
+                            cache.setCategory(JSONObject.toJSONString(content.getCategory()));
+                        }
+                        cache.setUpdateTimestamp(System.currentTimeMillis());
+                        articleTitleHisCacheRepository.save(cache);
+                    }
                 }
             }
         }

+ 80 - 73
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/service/recommend/recall/RecallService.java

@@ -179,10 +179,7 @@ public class RecallService implements ApplicationContextAware {
                             + "账号名称: " + param.getAccountName());
             return content;
         }
-        // category 查询
-        setContentCategory(content);
         long t3 = System.currentTimeMillis();
-        CostMonitor.logCost("Recall", "GetCategory", t3 - t2);
         // 标题历史均值
         setTitleAvgViewCount(content, param.getGhId(), param.getType());
         long t4 = System.currentTimeMillis();
@@ -216,53 +213,6 @@ public class RecallService implements ApplicationContextAware {
         }
     }
 
-    public void setContentCategory(List<Content> contentList) {
-        contentList.forEach(content -> content.setTitleMd5(Md5Util.encoderByMd5(content.getTitle())));
-        List<String> channelContentIds = contentList.stream().map(Content::getCrawlerChannelContentId)
-                .collect(Collectors.toList());
-        // 查询晋升rootProduceContentId
-        List<ArticlePoolPromotionSource> sourceList = articlePoolPromotionSourceRepository
-                .getByChannelContentIdInAndStatusAndDeleted(channelContentIds,
-                        ArticlePoolPromotionSourceStatusEnum.FINISH.getCode(), 0);
-        Map<String, ArticlePoolPromotionSource> sourceMap = sourceList.stream()
-                .collect(Collectors.toMap(ArticlePoolPromotionSource::getChannelContentId, Function.identity()));
-        List<String> publishContentIds = sourceList.stream().
-                map(ArticlePoolPromotionSource::getRootPublishContentId).collect(Collectors.toList());
-        List<PublishContent> publishContentList = publishContentRepository.getByIdIn(publishContentIds);
-        Map<String, PublishContent> publishContentMap = publishContentList.stream()
-                .collect(Collectors.toMap(PublishContent::getId, Function.identity()));
-        // 根据produceContentId查询category
-        List<ArticleCategory> articleCategoryList = articleCategoryRepository.getByStatus(ArticleCategoryStatusEnum.SUCCESS.getCode());
-        Map<String, ArticleCategory> categoryMap = articleCategoryList.stream()
-                .collect(Collectors.toMap(ArticleCategory::getProduceContentId, Function.identity()));
-        Map<String, ArticleCategory> coldStartCategoryMap = articleCategoryList.stream()
-                .collect(Collectors.toMap(ArticleCategory::getChannelContentId, Function.identity(), (a, b) -> a));
-        Map<String, ArticleCategory> titleCategoryMap = articleCategoryList.stream()
-                .collect(Collectors.toMap(ArticleCategory::getTitleMd5, Function.identity(), (a, b) -> a));
-        for (Content content : contentList) {
-            ArticlePoolPromotionSource source = sourceMap.get(content.getCrawlerChannelContentId());
-            ArticleCategory category = null;
-            if (Objects.nonNull(source) && Objects.nonNull(source.getRootProduceContentId())) {
-                category = categoryMap.get(source.getRootProduceContentId());
-                PublishContent publishContent = publishContentMap.get(source.getRootPublishContentId());
-                if (Objects.nonNull(publishContent)) {
-                    content.setRootPublishTimestamp(publishContent.getPublishTimestamp());
-                }
-            }
-            if (Objects.isNull(category)) {
-                category = coldStartCategoryMap.get(content.getCrawlerChannelContentId());
-            }
-            if (Objects.isNull(category)) {
-                category = titleCategoryMap.get(content.getTitleMd5());
-            }
-            if (Objects.nonNull(category)) {
-                content.setCategory(Collections.singletonList(category.getCategory()));
-                continue;
-            }
-//            log.error("setContentCategory NullError channelContentId:{}", content.getCrawlerChannelContentId());
-        }
-    }
-
     private List<CrawlerMetaArticle> getByUniqueIndexIn(List<String> md5List) {
         if (CollectionUtils.isEmpty(md5List)) {
             return new ArrayList<>();
@@ -297,57 +247,68 @@ public class RecallService implements ApplicationContextAware {
 
     public void setTitleAvgViewCount(List<Content> contentList, String ghId, String type) {
         long start = System.currentTimeMillis();
-        List<String> titleMd5List = contentList.stream().map(Content::getTitleMd5).distinct().collect(Collectors.toList());
-        Map<String, Content> md5ContentMap = contentList.stream().collect(
-                Collectors.toMap(Content::getTitleMd5, Function.identity(), (o1, o2) -> o2));
-        // 根据titleMd5查询数据库获取数据
+        contentList.forEach(content -> content.setTitleMd5(Md5Util.encoderByMd5(content.getTitle())));
+        List<String> sourceIdList = contentList.stream().map(Content::getSourceId).distinct().collect(Collectors.toList());
+        Map<String, Content> sourceContentMap = contentList.stream().collect(
+                Collectors.toMap(Content::getSourceId, Function.identity(), (o1, o2) -> o2));
+        // 根据sourceId查询数据库获取数据
         List<ArticleTitleHisCache> articleTitleHisCacheList = new ArrayList<>();
-        for (List<String> partition : Lists.partition(titleMd5List, 1000)) {
-            articleTitleHisCacheList.addAll(articleTitleHisCacheRepository.getByTitleMd5InAndType(partition, type));
+        for (List<String> partition : Lists.partition(sourceIdList, 1000)) {
+            articleTitleHisCacheList.addAll(articleTitleHisCacheRepository.getBySourceIdInAndType(partition, type));
         }
         Map<String, ArticleTitleHisCache> articleTitleHisCacheMap = articleTitleHisCacheList.stream()
-                .collect(Collectors.toMap(ArticleTitleHisCache::getTitleMd5, Function.identity()));
-        // titleMd5 进行过滤 排除缓存中数据 重新走下方查询
-        titleMd5List.removeIf(articleTitleHisCacheMap::containsKey);
+                .collect(Collectors.toMap(ArticleTitleHisCache::getSourceId, Function.identity()));
+        // sourceId 进行过滤 排除缓存中数据 重新走下方查询
+        sourceIdList.removeIf(articleTitleHisCacheMap::containsKey);
         // 获取账号相关性
         List<AccountCorrelation> accountCorrelationList = accountCorrelationRepository.findByGhIdAndStatus(ghId, 1);
         Map<String, Double> accountCorrelationMap = accountCorrelationList.stream().collect(
                 Collectors.toMap(AccountCorrelation::getRelGhId, AccountCorrelation::getCorrelation));
-        List<TitleHisCacheParam> paramList = titleMd5List.stream().map(titleMd5 -> {
-            Content content = md5ContentMap.get(titleMd5);
+        List<TitleHisCacheParam> paramList = sourceIdList.stream().map(sourceId -> {
+            Content content = sourceContentMap.get(sourceId);
             TitleHisCacheParam cacheParam = new TitleHisCacheParam();
-            cacheParam.setTitleMd5(titleMd5);
+            cacheParam.setSourceId(sourceId);
+            cacheParam.setTitleMd5(content.getTitleMd5());
             cacheParam.setTitle(content.getTitle());
             cacheParam.setCrawlerTitle(content.getCrawlerTitle());
+            cacheParam.setCrawlerChannelContentId(content.getCrawlerChannelContentId());
             cacheParam.setCategory(content.getCategory());
             return cacheParam;
         }).collect(Collectors.toList());
         Map<String, Content> hisArticleCacheMap = getArticleTitleHisCacheMap(paramList, type);
         List<Content> saveList = new ArrayList<>();
-        Set<String> titleMd5Set = new HashSet<>();
+        Set<String> sourceSet = new HashSet<>();
         for (Content content : contentList) {
-            if (articleTitleHisCacheMap.containsKey(content.getTitleMd5())) {
-                ArticleTitleHisCache cache = articleTitleHisCacheMap.get(content.getTitleMd5());
+            if (articleTitleHisCacheMap.containsKey(content.getSourceId())) {
+                ArticleTitleHisCache cache = articleTitleHisCacheMap.get(content.getSourceId());
                 List<ContentHisPublishArticle> hisPublishArticleList =
                         JSONArray.parseArray(cache.getHisPublishArticleList(), ContentHisPublishArticle.class);
                 for (ContentHisPublishArticle article : hisPublishArticleList) {
                     article.setCorrelation(Optional.ofNullable(accountCorrelationMap.get(article.getGhId())).orElse(0.0));
                 }
+                if (StringUtils.hasText(cache.getCategory())) {
+                    content.setCategory(JSONArray.parseArray(cache.getCategory(), String.class));
+                }
+                content.setRootPublishTimestamp(cache.getRootPublishTimestamp());
                 content.setHisPublishArticleList(hisPublishArticleList);
                 setT0Data(content);
                 continue;
             }
-            if (hisArticleCacheMap.containsKey(content.getTitleMd5())) {
-                Content cache = hisArticleCacheMap.get(content.getTitleMd5());
+            if (hisArticleCacheMap.containsKey(content.getSourceId())) {
+                Content cache = hisArticleCacheMap.get(content.getSourceId());
                 content.setHisPublishArticleList(cache.getHisPublishArticleList());
+                if (CollectionUtils.isNotEmpty(cache.getCategory())) {
+                    content.setCategory(cache.getCategory());
+                }
+                content.setRootPublishTimestamp(cache.getRootPublishTimestamp());
                 for (ContentHisPublishArticle article : content.getHisPublishArticleList()) {
                     article.setCorrelation(Optional.ofNullable(accountCorrelationMap.get(article.getGhId())).orElse(0.0));
                 }
                 setT0Data(content);
             }
-            if (!titleMd5Set.contains(content.getTitleMd5())) {
+            if (!sourceSet.contains(content.getSourceId())) {
                 saveList.add(content);
-                titleMd5Set.add(content.getTitleMd5());
+                sourceSet.add(content.getSourceId());
             }
         }
         // 写入缓存
@@ -368,9 +329,11 @@ public class RecallService implements ApplicationContextAware {
                 ArticleTitleHisCache cache = new ArticleTitleHisCache();
                 BeanUtils.copyProperties(content, cache);
                 cache.setType(type);
+                cache.setChannelContentId(content.getCrawlerChannelContentId());
                 if (CollectionUtils.isNotEmpty(content.getCategory())) {
                     cache.setCategory(JSONObject.toJSONString(content.getCategory()));
                 }
+                cache.setRootPublishTimestamp(content.getRootPublishTimestamp());
                 cache.setHisPublishArticleList(JSONObject.toJSONString(content.getHisPublishArticleList()));
                 cache.setCreateTimestamp(System.currentTimeMillis());
                 cacheList.add(cache);
@@ -423,8 +386,52 @@ public class RecallService implements ApplicationContextAware {
         List<AccountCategory> accountCategoryList = accountCategoryRepository.getByStatus(StatusEnum.ONE.getCode());
         Map<String, JSONObject> accountCategoryMap = accountCategoryList.stream().filter(o -> StringUtils.hasText(o.getCategoryMap()))
                 .collect(Collectors.toMap(AccountCategory::getGhId, o -> JSONObject.parseObject(o.getCategoryMap())));
+
+        // 获取品类
+        List<String> channelContentIds = paramList.stream().map(TitleHisCacheParam::getCrawlerChannelContentId)
+                .collect(Collectors.toList());
+        // 查询晋升rootProduceContentId
+        List<ArticlePoolPromotionSource> sourceList = articlePoolPromotionSourceRepository
+                .getByChannelContentIdInAndStatusAndDeleted(channelContentIds,
+                        ArticlePoolPromotionSourceStatusEnum.FINISH.getCode(), 0);
+        Map<String, ArticlePoolPromotionSource> sourceMap = sourceList.stream()
+                .collect(Collectors.toMap(ArticlePoolPromotionSource::getChannelContentId, Function.identity()));
+        List<String> publishContentIds = sourceList.stream().
+                map(ArticlePoolPromotionSource::getRootPublishContentId).collect(Collectors.toList());
+        List<PublishContent> publishContentList = publishContentRepository.getByIdIn(publishContentIds);
+        Map<String, PublishContent> publishContentMap = publishContentList.stream()
+                .collect(Collectors.toMap(PublishContent::getId, Function.identity()));
+        // 根据produceContentId查询category
+        List<ArticleCategory> articleCategoryList = articleCategoryRepository.getByStatus(ArticleCategoryStatusEnum.SUCCESS.getCode());
+        Map<String, ArticleCategory> categoryMap = articleCategoryList.stream()
+                .collect(Collectors.toMap(ArticleCategory::getProduceContentId, Function.identity()));
+        Map<String, ArticleCategory> coldStartCategoryMap = articleCategoryList.stream()
+                .collect(Collectors.toMap(ArticleCategory::getChannelContentId, Function.identity(), (a, b) -> a));
+        Map<String, ArticleCategory> titleCategoryMap = articleCategoryList.stream()
+                .collect(Collectors.toMap(ArticleCategory::getTitleMd5, Function.identity(), (a, b) -> a));
+
         for (TitleHisCacheParam cacheParam : paramList) {
             Content res = new Content();
+            // 设置品类
+            ArticlePoolPromotionSource source = sourceMap.get(cacheParam.getCrawlerChannelContentId());
+            ArticleCategory category = null;
+            if (Objects.nonNull(source) && Objects.nonNull(source.getRootProduceContentId())) {
+                category = categoryMap.get(source.getRootProduceContentId());
+                PublishContent publishContent = publishContentMap.get(source.getRootPublishContentId());
+                if (Objects.nonNull(publishContent)) {
+                    res.setRootPublishTimestamp(publishContent.getPublishTimestamp());
+                }
+            }
+            if (Objects.isNull(category)) {
+                category = coldStartCategoryMap.get(cacheParam.getCrawlerChannelContentId());
+            }
+            if (Objects.isNull(category)) {
+                category = titleCategoryMap.get(cacheParam.getTitleMd5());
+            }
+            if (Objects.nonNull(category)) {
+                res.setCategory(Collections.singletonList(category.getCategory()));
+            }
+            // 设置历史表现
             List<Article> hisArticles = new ArrayList<>();
             Map<Integer, List<Article>> indexArticleMap = map.get(cacheParam.getTitle());
             if (Objects.isNull(indexArticleMap)) {
@@ -457,9 +464,9 @@ public class RecallService implements ApplicationContextAware {
                 // 历史表现 文章品类如果与历史发布账号负相关 则过滤,不计算该历史发布表现
                 JSONObject categoryWeightMap = accountCategoryMap.get(hisArticle.getGhId());
                 if (Objects.nonNull(categoryWeightMap) && CollectionUtils.isNotEmpty(cacheParam.getCategory())) {
-                    String category = cacheParam.getCategory().get(0);
-                    if (categoryWeightMap.containsKey(category)) {
-                        double weight = categoryWeightMap.getDoubleValue(category);
+                    String hisCategory = cacheParam.getCategory().get(0);
+                    if (categoryWeightMap.containsKey(hisCategory)) {
+                        double weight = categoryWeightMap.getDoubleValue(hisCategory);
                         if (weight < 0) {
                             continue;
                         }
@@ -522,7 +529,7 @@ public class RecallService implements ApplicationContextAware {
                 res.getHisPublishArticleList().add(article);
             }
             // 设置头条阅读均值
-            result.put(cacheParam.getTitleMd5(), res);
+            result.put(cacheParam.getSourceId(), res);
         }
         return result;
     }

+ 5 - 3
long-article-recommend-service/src/main/resources/mapper/longArticle/LongArticleBaseMapper.xml

@@ -245,11 +245,13 @@
 
     <insert id="batchInsertArticleTitleHisCache"> <!-- Multi-row insert: one value tuple per element of `list`, in the same order as the column list; update_timestamp is not written here. NOTE(review): an empty `list` would render `values` with no tuples - confirm callers guard against it. -->
         insert into article_title_his_cache
-        (title_md5, type, title, crawler_title, category, his_publish_article_list, create_timestamp)
+        (source_id, type, title, title_md5, channel_content_id, root_publish_timestamp, crawler_title,
+         category, his_publish_article_list, create_timestamp)
         values
         <foreach collection="list" item="item" separator=",">
-            (#{item.titleMd5}, #{item.type}, #{item.title}, #{item.crawlerTitle}, #{item.category},
-             #{item.hisPublishArticleList}, #{item.createTimestamp})
+            (#{item.sourceId}, #{item.type}, #{item.title}, #{item.titleMd5}, #{item.channelContentId},
+             #{item.rootPublishTimestamp}, #{item.crawlerTitle}, #{item.category}, #{item.hisPublishArticleList},
+             #{item.createTimestamp})
         </foreach>
     </insert>