|
@@ -179,10 +179,7 @@ public class RecallService implements ApplicationContextAware {
|
|
|
+ "账号名称: " + param.getAccountName());
|
|
|
return content;
|
|
|
}
|
|
|
- // category 查询
|
|
|
- setContentCategory(content);
|
|
|
long t3 = System.currentTimeMillis();
|
|
|
- CostMonitor.logCost("Recall", "GetCategory", t3 - t2);
|
|
|
// 标题历史均值
|
|
|
setTitleAvgViewCount(content, param.getGhId(), param.getType());
|
|
|
long t4 = System.currentTimeMillis();
|
|
@@ -216,53 +213,6 @@ public class RecallService implements ApplicationContextAware {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- public void setContentCategory(List<Content> contentList) {
|
|
|
- contentList.forEach(content -> content.setTitleMd5(Md5Util.encoderByMd5(content.getTitle())));
|
|
|
- List<String> channelContentIds = contentList.stream().map(Content::getCrawlerChannelContentId)
|
|
|
- .collect(Collectors.toList());
|
|
|
- // 查询晋升rootProduceContentId
|
|
|
- List<ArticlePoolPromotionSource> sourceList = articlePoolPromotionSourceRepository
|
|
|
- .getByChannelContentIdInAndStatusAndDeleted(channelContentIds,
|
|
|
- ArticlePoolPromotionSourceStatusEnum.FINISH.getCode(), 0);
|
|
|
- Map<String, ArticlePoolPromotionSource> sourceMap = sourceList.stream()
|
|
|
- .collect(Collectors.toMap(ArticlePoolPromotionSource::getChannelContentId, Function.identity()));
|
|
|
- List<String> publishContentIds = sourceList.stream().
|
|
|
- map(ArticlePoolPromotionSource::getRootPublishContentId).collect(Collectors.toList());
|
|
|
- List<PublishContent> publishContentList = publishContentRepository.getByIdIn(publishContentIds);
|
|
|
- Map<String, PublishContent> publishContentMap = publishContentList.stream()
|
|
|
- .collect(Collectors.toMap(PublishContent::getId, Function.identity()));
|
|
|
- // 根据produceContentId查询category
|
|
|
- List<ArticleCategory> articleCategoryList = articleCategoryRepository.getByStatus(ArticleCategoryStatusEnum.SUCCESS.getCode());
|
|
|
- Map<String, ArticleCategory> categoryMap = articleCategoryList.stream()
|
|
|
- .collect(Collectors.toMap(ArticleCategory::getProduceContentId, Function.identity()));
|
|
|
- Map<String, ArticleCategory> coldStartCategoryMap = articleCategoryList.stream()
|
|
|
- .collect(Collectors.toMap(ArticleCategory::getChannelContentId, Function.identity(), (a, b) -> a));
|
|
|
- Map<String, ArticleCategory> titleCategoryMap = articleCategoryList.stream()
|
|
|
- .collect(Collectors.toMap(ArticleCategory::getTitleMd5, Function.identity(), (a, b) -> a));
|
|
|
- for (Content content : contentList) {
|
|
|
- ArticlePoolPromotionSource source = sourceMap.get(content.getCrawlerChannelContentId());
|
|
|
- ArticleCategory category = null;
|
|
|
- if (Objects.nonNull(source) && Objects.nonNull(source.getRootProduceContentId())) {
|
|
|
- category = categoryMap.get(source.getRootProduceContentId());
|
|
|
- PublishContent publishContent = publishContentMap.get(source.getRootPublishContentId());
|
|
|
- if (Objects.nonNull(publishContent)) {
|
|
|
- content.setRootPublishTimestamp(publishContent.getPublishTimestamp());
|
|
|
- }
|
|
|
- }
|
|
|
- if (Objects.isNull(category)) {
|
|
|
- category = coldStartCategoryMap.get(content.getCrawlerChannelContentId());
|
|
|
- }
|
|
|
- if (Objects.isNull(category)) {
|
|
|
- category = titleCategoryMap.get(content.getTitleMd5());
|
|
|
- }
|
|
|
- if (Objects.nonNull(category)) {
|
|
|
- content.setCategory(Collections.singletonList(category.getCategory()));
|
|
|
- continue;
|
|
|
- }
|
|
|
-// log.error("setContentCategory NullError channelContentId:{}", content.getCrawlerChannelContentId());
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
private List<CrawlerMetaArticle> getByUniqueIndexIn(List<String> md5List) {
|
|
|
if (CollectionUtils.isEmpty(md5List)) {
|
|
|
return new ArrayList<>();
|
|
@@ -297,57 +247,68 @@ public class RecallService implements ApplicationContextAware {
|
|
|
|
|
|
public void setTitleAvgViewCount(List<Content> contentList, String ghId, String type) {
|
|
|
long start = System.currentTimeMillis();
|
|
|
- List<String> titleMd5List = contentList.stream().map(Content::getTitleMd5).distinct().collect(Collectors.toList());
|
|
|
- Map<String, Content> md5ContentMap = contentList.stream().collect(
|
|
|
- Collectors.toMap(Content::getTitleMd5, Function.identity(), (o1, o2) -> o2));
|
|
|
- // 根据titleMd5查询数据库获取数据
|
|
|
+ contentList.forEach(content -> content.setTitleMd5(Md5Util.encoderByMd5(content.getTitle())));
|
|
|
+ List<String> sourceIdList = contentList.stream().map(Content::getSourceId).distinct().collect(Collectors.toList());
|
|
|
+ Map<String, Content> sourceContentMap = contentList.stream().collect(
|
|
|
+ Collectors.toMap(Content::getSourceId, Function.identity(), (o1, o2) -> o2));
|
|
|
+ // 根据sourceId查询数据库获取数据
|
|
|
List<ArticleTitleHisCache> articleTitleHisCacheList = new ArrayList<>();
|
|
|
- for (List<String> partition : Lists.partition(titleMd5List, 1000)) {
|
|
|
- articleTitleHisCacheList.addAll(articleTitleHisCacheRepository.getByTitleMd5InAndType(partition, type));
|
|
|
+ for (List<String> partition : Lists.partition(sourceIdList, 1000)) {
|
|
|
+ articleTitleHisCacheList.addAll(articleTitleHisCacheRepository.getBySourceIdInAndType(partition, type));
|
|
|
}
|
|
|
Map<String, ArticleTitleHisCache> articleTitleHisCacheMap = articleTitleHisCacheList.stream()
|
|
|
- .collect(Collectors.toMap(ArticleTitleHisCache::getTitleMd5, Function.identity()));
|
|
|
- // titleMd5 进行过滤 排除缓存中数据 重新走下方查询
|
|
|
- titleMd5List.removeIf(articleTitleHisCacheMap::containsKey);
|
|
|
+ .collect(Collectors.toMap(ArticleTitleHisCache::getSourceId, Function.identity()));
|
|
|
+ // sourceId 进行过滤 排除缓存中数据 重新走下方查询
|
|
|
+ sourceIdList.removeIf(articleTitleHisCacheMap::containsKey);
|
|
|
// 获取账号相关性
|
|
|
List<AccountCorrelation> accountCorrelationList = accountCorrelationRepository.findByGhIdAndStatus(ghId, 1);
|
|
|
Map<String, Double> accountCorrelationMap = accountCorrelationList.stream().collect(
|
|
|
Collectors.toMap(AccountCorrelation::getRelGhId, AccountCorrelation::getCorrelation));
|
|
|
- List<TitleHisCacheParam> paramList = titleMd5List.stream().map(titleMd5 -> {
|
|
|
- Content content = md5ContentMap.get(titleMd5);
|
|
|
+ List<TitleHisCacheParam> paramList = sourceIdList.stream().map(sourceId -> {
|
|
|
+ Content content = sourceContentMap.get(sourceId);
|
|
|
TitleHisCacheParam cacheParam = new TitleHisCacheParam();
|
|
|
- cacheParam.setTitleMd5(titleMd5);
|
|
|
+ cacheParam.setSourceId(sourceId);
|
|
|
+ cacheParam.setTitleMd5(content.getTitleMd5());
|
|
|
cacheParam.setTitle(content.getTitle());
|
|
|
cacheParam.setCrawlerTitle(content.getCrawlerTitle());
|
|
|
+ cacheParam.setCrawlerChannelContentId(content.getCrawlerChannelContentId());
|
|
|
cacheParam.setCategory(content.getCategory());
|
|
|
return cacheParam;
|
|
|
}).collect(Collectors.toList());
|
|
|
Map<String, Content> hisArticleCacheMap = getArticleTitleHisCacheMap(paramList, type);
|
|
|
List<Content> saveList = new ArrayList<>();
|
|
|
- Set<String> titleMd5Set = new HashSet<>();
|
|
|
+ Set<String> sourceSet = new HashSet<>();
|
|
|
for (Content content : contentList) {
|
|
|
- if (articleTitleHisCacheMap.containsKey(content.getTitleMd5())) {
|
|
|
- ArticleTitleHisCache cache = articleTitleHisCacheMap.get(content.getTitleMd5());
|
|
|
+ if (articleTitleHisCacheMap.containsKey(content.getSourceId())) {
|
|
|
+ ArticleTitleHisCache cache = articleTitleHisCacheMap.get(content.getSourceId());
|
|
|
List<ContentHisPublishArticle> hisPublishArticleList =
|
|
|
JSONArray.parseArray(cache.getHisPublishArticleList(), ContentHisPublishArticle.class);
|
|
|
for (ContentHisPublishArticle article : hisPublishArticleList) {
|
|
|
article.setCorrelation(Optional.ofNullable(accountCorrelationMap.get(article.getGhId())).orElse(0.0));
|
|
|
}
|
|
|
+ if (StringUtils.hasText(cache.getCategory())) {
|
|
|
+ content.setCategory(JSONArray.parseArray(cache.getCategory(), String.class));
|
|
|
+ }
|
|
|
+ content.setRootPublishTimestamp(cache.getRootPublishTimestamp());
|
|
|
content.setHisPublishArticleList(hisPublishArticleList);
|
|
|
setT0Data(content);
|
|
|
continue;
|
|
|
}
|
|
|
- if (hisArticleCacheMap.containsKey(content.getTitleMd5())) {
|
|
|
- Content cache = hisArticleCacheMap.get(content.getTitleMd5());
|
|
|
+ if (hisArticleCacheMap.containsKey(content.getSourceId())) {
|
|
|
+ Content cache = hisArticleCacheMap.get(content.getSourceId());
|
|
|
content.setHisPublishArticleList(cache.getHisPublishArticleList());
|
|
|
+ if (CollectionUtils.isNotEmpty(cache.getCategory())) {
|
|
|
+ content.setCategory(cache.getCategory());
|
|
|
+ }
|
|
|
+ content.setRootPublishTimestamp(cache.getRootPublishTimestamp());
|
|
|
for (ContentHisPublishArticle article : content.getHisPublishArticleList()) {
|
|
|
article.setCorrelation(Optional.ofNullable(accountCorrelationMap.get(article.getGhId())).orElse(0.0));
|
|
|
}
|
|
|
setT0Data(content);
|
|
|
}
|
|
|
- if (!titleMd5Set.contains(content.getTitleMd5())) {
|
|
|
+ if (!sourceSet.contains(content.getSourceId())) {
|
|
|
saveList.add(content);
|
|
|
- titleMd5Set.add(content.getTitleMd5());
|
|
|
+ sourceSet.add(content.getSourceId());
|
|
|
}
|
|
|
}
|
|
|
// 写入缓存
|
|
@@ -368,9 +329,11 @@ public class RecallService implements ApplicationContextAware {
|
|
|
ArticleTitleHisCache cache = new ArticleTitleHisCache();
|
|
|
BeanUtils.copyProperties(content, cache);
|
|
|
cache.setType(type);
|
|
|
+ cache.setChannelContentId(content.getCrawlerChannelContentId());
|
|
|
if (CollectionUtils.isNotEmpty(content.getCategory())) {
|
|
|
cache.setCategory(JSONObject.toJSONString(content.getCategory()));
|
|
|
}
|
|
|
+ cache.setRootPublishTimestamp(content.getRootPublishTimestamp());
|
|
|
cache.setHisPublishArticleList(JSONObject.toJSONString(content.getHisPublishArticleList()));
|
|
|
cache.setCreateTimestamp(System.currentTimeMillis());
|
|
|
cacheList.add(cache);
|
|
@@ -423,8 +386,52 @@ public class RecallService implements ApplicationContextAware {
|
|
|
List<AccountCategory> accountCategoryList = accountCategoryRepository.getByStatus(StatusEnum.ONE.getCode());
|
|
|
Map<String, JSONObject> accountCategoryMap = accountCategoryList.stream().filter(o -> StringUtils.hasText(o.getCategoryMap()))
|
|
|
.collect(Collectors.toMap(AccountCategory::getGhId, o -> JSONObject.parseObject(o.getCategoryMap())));
|
|
|
+
|
|
|
+ // 获取品类
|
|
|
+ List<String> channelContentIds = paramList.stream().map(TitleHisCacheParam::getCrawlerChannelContentId)
|
|
|
+ .collect(Collectors.toList());
|
|
|
+ // 查询晋升rootProduceContentId
|
|
|
+ List<ArticlePoolPromotionSource> sourceList = articlePoolPromotionSourceRepository
|
|
|
+ .getByChannelContentIdInAndStatusAndDeleted(channelContentIds,
|
|
|
+ ArticlePoolPromotionSourceStatusEnum.FINISH.getCode(), 0);
|
|
|
+ Map<String, ArticlePoolPromotionSource> sourceMap = sourceList.stream()
|
|
|
+ .collect(Collectors.toMap(ArticlePoolPromotionSource::getChannelContentId, Function.identity()));
|
|
|
+ List<String> publishContentIds = sourceList.stream().
|
|
|
+ map(ArticlePoolPromotionSource::getRootPublishContentId).collect(Collectors.toList());
|
|
|
+ List<PublishContent> publishContentList = publishContentRepository.getByIdIn(publishContentIds);
|
|
|
+ Map<String, PublishContent> publishContentMap = publishContentList.stream()
|
|
|
+ .collect(Collectors.toMap(PublishContent::getId, Function.identity()));
|
|
|
+ // 根据produceContentId查询category
|
|
|
+ List<ArticleCategory> articleCategoryList = articleCategoryRepository.getByStatus(ArticleCategoryStatusEnum.SUCCESS.getCode());
|
|
|
+ Map<String, ArticleCategory> categoryMap = articleCategoryList.stream()
|
|
|
+ .collect(Collectors.toMap(ArticleCategory::getProduceContentId, Function.identity()));
|
|
|
+ Map<String, ArticleCategory> coldStartCategoryMap = articleCategoryList.stream()
|
|
|
+ .collect(Collectors.toMap(ArticleCategory::getChannelContentId, Function.identity(), (a, b) -> a));
|
|
|
+ Map<String, ArticleCategory> titleCategoryMap = articleCategoryList.stream()
|
|
|
+ .collect(Collectors.toMap(ArticleCategory::getTitleMd5, Function.identity(), (a, b) -> a));
|
|
|
+
|
|
|
for (TitleHisCacheParam cacheParam : paramList) {
|
|
|
Content res = new Content();
|
|
|
+ // 设置品类
|
|
|
+ ArticlePoolPromotionSource source = sourceMap.get(cacheParam.getCrawlerChannelContentId());
|
|
|
+ ArticleCategory category = null;
|
|
|
+ if (Objects.nonNull(source) && Objects.nonNull(source.getRootProduceContentId())) {
|
|
|
+ category = categoryMap.get(source.getRootProduceContentId());
|
|
|
+ PublishContent publishContent = publishContentMap.get(source.getRootPublishContentId());
|
|
|
+ if (Objects.nonNull(publishContent)) {
|
|
|
+ res.setRootPublishTimestamp(publishContent.getPublishTimestamp());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (Objects.isNull(category)) {
|
|
|
+ category = coldStartCategoryMap.get(cacheParam.getCrawlerChannelContentId());
|
|
|
+ }
|
|
|
+ if (Objects.isNull(category)) {
|
|
|
+ category = titleCategoryMap.get(cacheParam.getTitleMd5());
|
|
|
+ }
|
|
|
+ if (Objects.nonNull(category)) {
|
|
|
+ res.setCategory(Collections.singletonList(category.getCategory()));
|
|
|
+ }
|
|
|
+ // 设置历史表现
|
|
|
List<Article> hisArticles = new ArrayList<>();
|
|
|
Map<Integer, List<Article>> indexArticleMap = map.get(cacheParam.getTitle());
|
|
|
if (Objects.isNull(indexArticleMap)) {
|
|
@@ -457,9 +464,9 @@ public class RecallService implements ApplicationContextAware {
|
|
|
// 历史表现 文章品类如果与历史发布账号负相关 则过滤,不计算该历史发布表现
|
|
|
JSONObject categoryWeightMap = accountCategoryMap.get(hisArticle.getGhId());
|
|
|
if (Objects.nonNull(categoryWeightMap) && CollectionUtils.isNotEmpty(cacheParam.getCategory())) {
|
|
|
- String category = cacheParam.getCategory().get(0);
|
|
|
- if (categoryWeightMap.containsKey(category)) {
|
|
|
- double weight = categoryWeightMap.getDoubleValue(category);
|
|
|
+ String hisCategory = cacheParam.getCategory().get(0);
|
|
|
+ if (categoryWeightMap.containsKey(hisCategory)) {
|
|
|
+ double weight = categoryWeightMap.getDoubleValue(hisCategory);
|
|
|
if (weight < 0) {
|
|
|
continue;
|
|
|
}
|
|
@@ -522,7 +529,7 @@ public class RecallService implements ApplicationContextAware {
|
|
|
res.getHisPublishArticleList().add(article);
|
|
|
}
|
|
|
// 设置头条阅读均值
|
|
|
- result.put(cacheParam.getTitleMd5(), res);
|
|
|
+ result.put(cacheParam.getSourceId(), res);
|
|
|
}
|
|
|
return result;
|
|
|
}
|