|
|
@@ -1,6 +1,8 @@
|
|
|
package com.tzld.longarticle.recommend.server.service.recommend.score.strategy;
|
|
|
|
|
|
import com.ctrip.framework.apollo.spring.annotation.ApolloJsonValue;
|
|
|
+import com.google.common.collect.Lists;
|
|
|
+import com.tzld.longarticle.recommend.server.common.ThreadPoolFactory;
|
|
|
import com.tzld.longarticle.recommend.server.model.dto.Content;
|
|
|
import com.tzld.longarticle.recommend.server.model.dto.ContentHisPublishArticle;
|
|
|
import com.tzld.longarticle.recommend.server.model.entity.crawler.AccountAvgInfo;
|
|
|
@@ -14,13 +16,14 @@ import com.tzld.longarticle.recommend.server.util.MathUtils;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.apache.commons.collections4.CollectionUtils;
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
-import org.springframework.beans.factory.annotation.Value;
|
|
|
import org.springframework.stereotype.Component;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
import java.util.List;
|
|
|
import java.util.Map;
|
|
|
import java.util.Objects;
|
|
|
+import java.util.concurrent.ExecutorService;
|
|
|
+import java.util.concurrent.Future;
|
|
|
|
|
|
@Component
|
|
|
@Slf4j
|
|
|
@@ -36,9 +39,10 @@ public class ViewCountRateStrategy implements ScoreStrategy {
|
|
|
@ApolloJsonValue("${view.count.rate.account.his.filter.days:{}}")
|
|
|
private Map<String, Integer> accountHisFilterDaysConfig;
|
|
|
|
|
|
+ private final ExecutorService pool = ThreadPoolFactory.scorePool();
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Scores the contents of {@code param} with the view-count-rate strategy.
|
|
|
+ * <p>
|
|
|
+ * Loads the content pools for the account name, the account average info via
|
|
|
+ * {@code getAllByGhIdEqualsAndStatusEquals(ghId, 1)} and the position-1 average read
|
|
|
+ * count (replaced by 20000 when it is below 10). Inputs of up to 1000 contents are
|
|
|
+ * scored on the calling thread; larger inputs go to the parallel helper.
|
|
|
+ *
|
|
|
+ * @param param scoring request; its contents, account name and ghId are read here
|
|
|
+ * @return the scores produced by the sequential or the parallel helper
|
|
|
+ */
|
|
|
@Override
|
|
|
public List<Score> score(ScoreParam param) {
|
|
|
- List<Score> scores = new ArrayList<>();
|
|
|
String[] contentPools = accountContentPoolConfigService.getContentPools(param.getAccountName());
|
|
|
List<AccountAvgInfo> avgInfoList = accountAvgInfoRepository.getAllByGhIdEqualsAndStatusEquals(param.getGhId(), 1);
|
|
|
double avgViewCountFirst = accountIndexAvgViewCountService.getAvgReadCountByDB(avgInfoList, param.getGhId(), 1);
|
|
|
@@ -46,123 +50,187 @@ public class ViewCountRateStrategy implements ScoreStrategy {
|
|
|
if (avgViewCountFirst < 10) {
|
|
|
avgViewCountFirst = 20000D;
|
|
|
}
|
|
|
- for (Content content : param.getContents()) {
|
|
|
- for (int i = 0; i < contentPools.length; i++) {
|
|
|
- if (!contentPools[i].equals(content.getContentPoolType())) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- double avgViewCountPos = accountIndexAvgViewCountService.getAvgReadCountByDB(avgInfoList, param.getGhId(), i + 1);
|
|
|
- // 缺省头条均值设置为2w,次条为1w
|
|
|
- if (avgViewCountPos < 10) {
|
|
|
- if (i == 0) {
|
|
|
- avgViewCountPos = 20000D;
|
|
|
- } else if (i == 1) {
|
|
|
- avgViewCountPos = 10000D;
|
|
|
- } else {
|
|
|
- avgViewCountPos = 400D;
|
|
|
+
|
|
|
+ // Small inputs are processed sequentially to avoid thread-switching overhead
|
|
|
+ if (param.getContents().size() <= 1000) {
|
|
|
+ return calculateScoresSequential(param.getContents(), contentPools, avgInfoList,
|
|
|
+ param.getGhId(), avgViewCountFirst, param);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Larger inputs are processed in parallel
|
|
|
+ return calculateScoresParallel(param.getContents(), contentPools, avgInfoList,
|
|
|
+ param.getGhId(), avgViewCountFirst, param);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Computes the scores one content at a time on the calling thread.
|
|
|
+ *
|
|
|
+ * @param contents contents to score
|
|
|
+ * @param contentPools content pool names, forwarded to the per-content scorer
|
|
|
+ * @param avgInfoList account average info, forwarded to the per-content scorer
|
|
|
+ * @param ghId account ghId, forwarded to the per-content scorer
|
|
|
+ * @param avgViewCountFirst position-1 average read count, forwarded to the per-content scorer
|
|
|
+ * @param param original scoring request, forwarded to the per-content scorer
|
|
|
+ * @return scores in input order; contents for which the scorer returns {@code null} are omitted
|
|
|
+ */
|
|
|
+ private List<Score> calculateScoresSequential(List<Content> contents, String[] contentPools,
|
|
|
+ List<AccountAvgInfo> avgInfoList, String ghId, double avgViewCountFirst, ScoreParam param) {
|
|
|
+ final List<Score> collected = new ArrayList<>();
|
|
|
+ contents.forEach(item -> {
|
|
|
+ final Score itemScore = calculateSingleContentScore(item, contentPools, avgInfoList,
|
|
|
+ ghId, avgViewCountFirst, param);
|
|
|
+ if (itemScore == null) {
|
|
|
+ // Nothing was produced for this content; leave it out of the result.
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ collected.add(itemScore);
|
|
|
+ });
|
|
|
+ return collected;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Computes the scores in parallel: the contents are split into batches of 500 and
|
|
|
+ * one task per batch is submitted to the executor held in {@code pool}.
|
|
|
+ * <p>
|
|
|
+ * Results are collected in batch submission order. A batch whose task fails is
|
|
|
+ * logged and its scores are omitted. If the calling thread is interrupted while
|
|
|
+ * waiting, the interrupt flag is restored, the batches not yet collected are
|
|
|
+ * cancelled and the scores gathered so far are returned.
|
|
|
+ *
|
|
|
+ * @param contents contents to score
|
|
|
+ * @param contentPools content pool names, forwarded to the per-content scorer
|
|
|
+ * @param avgInfoList account average info, forwarded to the per-content scorer
|
|
|
+ * @param ghId account ghId, forwarded to the per-content scorer
|
|
|
+ * @param avgViewCountFirst position-1 average read count, forwarded to the per-content scorer
|
|
|
+ * @param param original scoring request, forwarded to the per-content scorer
|
|
|
+ * @return scores of all successfully processed batches; contents without a score are omitted
|
|
|
+ */
|
|
|
+ private List<Score> calculateScoresParallel(List<Content> contents, String[] contentPools,
|
|
|
+ List<AccountAvgInfo> avgInfoList, String ghId, double avgViewCountFirst, ScoreParam param) {
|
|
|
+ List<List<Content>> batches = Lists.partition(contents, 500);
|
|
|
+ List<Future<List<Score>>> futures = new ArrayList<>();
|
|
|
+
|
|
|
+ for (List<Content> batch : batches) {
|
|
|
+ Future<List<Score>> future = pool.submit(() -> {
|
|
|
+ List<Score> batchScores = new ArrayList<>();
|
|
|
+ for (Content content : batch) {
|
|
|
+ Score score = calculateSingleContentScore(content, contentPools, avgInfoList,
|
|
|
+ ghId, avgViewCountFirst, param);
|
|
|
+ if (score != null) {
|
|
|
+ batchScores.add(score);
|
|
|
}
|
|
|
}
|
|
|
- // 阅读量之和
|
|
|
- double showViewCountSum = 0D;
|
|
|
- // 阅读均值置信区间上限之和
|
|
|
- double readAvgCiUpperSum = 0D;
|
|
|
- // 头条阅读量之和
|
|
|
- double showViewCountSumFirst = 0D;
|
|
|
- // 头条阅读均值置信区间上限之和
|
|
|
- double readAvgCiUpperSumFirst = 0D;
|
|
|
- // 次条阅读量之和
|
|
|
- double showViewCountSumSecond = 0D;
|
|
|
- // 次条阅读均值置信区间上限之和
|
|
|
- double readAvgCiUpperSumSecond = 0D;
|
|
|
- // 最大阅读均值置信区间上限
|
|
|
- double maxReadAvgCiUpper = 0D;
|
|
|
- if (CollectionUtils.isEmpty(content.getHisPublishArticleList())) {
|
|
|
+ return batchScores;
|
|
|
+ });
|
|
|
+ futures.add(future);
|
|
|
+ }
|
|
|
+
|
|
|
+ List<Score> scores = new ArrayList<>();
|
|
|
+ for (int idx = 0; idx < futures.size(); idx++) {
|
|
|
+ try {
|
|
|
+ scores.addAll(futures.get(idx).get());
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ // Keep the interrupt visible to the caller instead of swallowing it,
|
|
|
+ // and stop blocking on batches nobody will wait for any more.
|
|
|
+ Thread.currentThread().interrupt();
|
|
|
+ log.error("ViewCountRateStrategy batch process interrupted", e);
|
|
|
+ for (int rest = idx; rest < futures.size(); rest++) {
|
|
|
+ futures.get(rest).cancel(true);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ } catch (Exception e) {
|
|
|
+ // Best effort: a failed batch is logged and skipped, the others still count.
|
|
|
+ log.error("ViewCountRateStrategy batch process error", e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return scores;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Computes the score of a single content item.
|
|
|
+ * <p>
|
|
|
+ * The content is matched against {@code contentPools} by its content pool type; the
|
|
|
+ * matching index {@code i} selects the position ({@code i + 1}) whose average read
|
|
|
+ * count feeds the confidence weight. View counts and read-average confidence-interval
|
|
|
+ * upper bounds are summed over the content's history articles, separately for item
|
|
|
+ * index 1 (headline), item index 2 (second slot) and all other indexes. The score is
|
|
|
+ * confidence weight * (view sum / upper-bound sum); the gain above 1 is damped when
|
|
|
+ * second-slot data is used for pool index 0 of an account whose position-1 average is
|
|
|
+ * at least 3000.
|
|
|
+ *
|
|
|
+ * @param content content to score
|
|
|
+ * @param contentPools content pool names; the index of the match determines the position
|
|
|
+ * @param avgInfoList account average info passed to the average-read-count lookup
|
|
|
+ * @param ghId not read in this method; {@code param.getGhId()} is used instead
|
|
|
+ * @param avgViewCountFirst position-1 average read count, used for the down-weighting check
|
|
|
+ * @param param scoring request; only its ghId is read here
|
|
|
+ * @return the score, or {@code null} when no pool matches or the content has no history articles
|
|
|
+ */
|
|
|
+ private Score calculateSingleContentScore(Content content, String[] contentPools,
|
|
|
+ List<AccountAvgInfo> avgInfoList, String ghId, double avgViewCountFirst, ScoreParam param) {
|
|
|
+ for (int i = 0; i < contentPools.length; i++) {
|
|
|
+ if (!contentPools[i].equals(content.getContentPoolType())) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ double avgViewCountPos = accountIndexAvgViewCountService.getAvgReadCountByDB(avgInfoList, param.getGhId(), i + 1);
|
|
|
+ // Fallback averages when the stored value is below 10: 20000 for index 0 (headline), 10000 for index 1 (second slot), 400 otherwise
|
|
|
+ if (avgViewCountPos < 10) {
|
|
|
+ if (i == 0) {
|
|
|
+ avgViewCountPos = 20000D;
|
|
|
+ } else if (i == 1) {
|
|
|
+ avgViewCountPos = 10000D;
|
|
|
+ } else {
|
|
|
+ avgViewCountPos = 400D;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Sum of view counts
|
|
|
+ double showViewCountSum = 0D;
|
|
|
+ // Sum of the upper bounds of the read-average confidence interval
|
|
|
+ double readAvgCiUpperSum = 0D;
|
|
|
+ // Sum of headline view counts
|
|
|
+ double showViewCountSumFirst = 0D;
|
|
|
+ // Sum of the headline read-average confidence-interval upper bounds
|
|
|
+ double readAvgCiUpperSumFirst = 0D;
|
|
|
+ // Sum of second-slot view counts
|
|
|
+ double showViewCountSumSecond = 0D;
|
|
|
+ // Sum of the second-slot read-average confidence-interval upper bounds
|
|
|
+ double readAvgCiUpperSumSecond = 0D;
|
|
|
+ // Largest read-average confidence-interval upper bound seen
|
|
|
+ double maxReadAvgCiUpper = 0D;
|
|
|
+ if (CollectionUtils.isEmpty(content.getHisPublishArticleList())) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ for (ContentHisPublishArticle hisItem : content.getHisPublishArticleList()) {
|
|
|
+ // Skip items rejected by ScoreStrategy.hisContentLateFilter (per the original note: published later than 19:00)
|
|
|
+ if (ScoreStrategy.hisContentLateFilter(hisItem.getPublishTimestamp())) {
|
|
|
continue;
|
|
|
}
|
|
|
- for (ContentHisPublishArticle hisItem : content.getHisPublishArticleList()) {
|
|
|
- // 过滤掉发布时间晚于19点数据
|
|
|
- if (ScoreStrategy.hisContentLateFilter(hisItem.getPublishTimestamp())) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- // 过滤掉历史数据中,阅读量为0的文章
|
|
|
- Integer hisFilterDays = accountHisFilterDaysConfig.get(param.getGhId());
|
|
|
- if (Objects.nonNull(hisFilterDays)
|
|
|
- && hisItem.getPublishTimestamp() < System.currentTimeMillis() / 1000 - hisFilterDays * 24 * 60 * 60) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- if (hisItem.isInnerAccount() && Objects.nonNull(hisItem.getViewCount())
|
|
|
- && hisItem.getViewCount() > 0 && Objects.nonNull(hisItem.getReadAvgCiUpper())
|
|
|
- && hisItem.getReadAvgCiUpper() > 0) {
|
|
|
- maxReadAvgCiUpper = Math.max(maxReadAvgCiUpper, hisItem.getReadAvgCiUpper());
|
|
|
- if (hisItem.getItemIndex() == 1) {
|
|
|
- showViewCountSumFirst += hisItem.getViewCount();
|
|
|
- readAvgCiUpperSumFirst += hisItem.getReadAvgCiUpper();
|
|
|
- } else if (hisItem.getItemIndex() == 2) {
|
|
|
- if (Objects.nonNull(hisItem.getFirstViewCount()) && hisItem.getFirstViewCount() > 0 &&
|
|
|
- Objects.nonNull(hisItem.getFirstViewCountRate()) && hisItem.getFirstViewCountRate() > 0) {
|
|
|
- showViewCountSumSecond += hisItem.getViewCount();
|
|
|
- if (hisItem.getFirstViewCountRate() > 1) {
|
|
|
- // 对于头条均值倍数大于1的情况,次条均值线性增加,用于debias;
|
|
|
- // TODO: 对于小于1的情况,是否要减去?
|
|
|
- readAvgCiUpperSumSecond += hisItem.getReadAvgCiUpper() * hisItem.getFirstViewCountRate();
|
|
|
- } else {
|
|
|
- readAvgCiUpperSumSecond += hisItem.getReadAvgCiUpper();
|
|
|
- }
|
|
|
+ // Skip items published before the account's configured window of hisFilterDays days; zero-view articles are excluded by the condition that follows
|
|
|
+ Integer hisFilterDays = accountHisFilterDaysConfig.get(param.getGhId());
|
|
|
+ if (Objects.nonNull(hisFilterDays)
|
|
|
+ && hisItem.getPublishTimestamp() < System.currentTimeMillis() / 1000 - hisFilterDays * 24 * 60 * 60) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (hisItem.isInnerAccount() && Objects.nonNull(hisItem.getViewCount())
|
|
|
+ && hisItem.getViewCount() > 0 && Objects.nonNull(hisItem.getReadAvgCiUpper())
|
|
|
+ && hisItem.getReadAvgCiUpper() > 0) {
|
|
|
+ maxReadAvgCiUpper = Math.max(maxReadAvgCiUpper, hisItem.getReadAvgCiUpper());
|
|
|
+ if (hisItem.getItemIndex() == 1) {
|
|
|
+ showViewCountSumFirst += hisItem.getViewCount();
|
|
|
+ readAvgCiUpperSumFirst += hisItem.getReadAvgCiUpper();
|
|
|
+ } else if (hisItem.getItemIndex() == 2) {
|
|
|
+ if (Objects.nonNull(hisItem.getFirstViewCount()) && hisItem.getFirstViewCount() > 0 &&
|
|
|
+ Objects.nonNull(hisItem.getFirstViewCountRate()) && hisItem.getFirstViewCountRate() > 0) {
|
|
|
+ showViewCountSumSecond += hisItem.getViewCount();
|
|
|
+ if (hisItem.getFirstViewCountRate() > 1) {
|
|
|
+ // When the headline ratio to its mean exceeds 1, scale the second-slot mean linearly to debias
|
|
|
+ // TODO: for ratios below 1, should the mean be reduced instead?
|
|
|
+ readAvgCiUpperSumSecond += hisItem.getReadAvgCiUpper() * hisItem.getFirstViewCountRate();
|
|
|
+ } else {
|
|
|
+ readAvgCiUpperSumSecond += hisItem.getReadAvgCiUpper();
|
|
|
}
|
|
|
- } else {
|
|
|
- if (Objects.nonNull(hisItem.getFirstViewCount()) && hisItem.getFirstViewCount() > 0
|
|
|
- && Objects.nonNull(hisItem.getFirstViewCountRate()) && hisItem.getFirstViewCountRate() > 0) {
|
|
|
- showViewCountSum += hisItem.getViewCount();
|
|
|
- if (hisItem.getFirstViewCountRate() > 1) {
|
|
|
- // 对于头条均值倍数大于1的情况,次条均值线性增加,用于debias;
|
|
|
- // TODO: 对于小于1的情况,是否要减去?
|
|
|
- readAvgCiUpperSum += hisItem.getReadAvgCiUpper() * hisItem.getFirstViewCountRate();
|
|
|
- } else {
|
|
|
- readAvgCiUpperSum += hisItem.getReadAvgCiUpper();
|
|
|
- }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if (Objects.nonNull(hisItem.getFirstViewCount()) && hisItem.getFirstViewCount() > 0
|
|
|
+ && Objects.nonNull(hisItem.getFirstViewCountRate()) && hisItem.getFirstViewCountRate() > 0) {
|
|
|
+ showViewCountSum += hisItem.getViewCount();
|
|
|
+ if (hisItem.getFirstViewCountRate() > 1) {
|
|
|
+ // When the headline ratio to its mean exceeds 1, scale the mean linearly to debias
|
|
|
+ // TODO: for ratios below 1, should the mean be reduced instead?
|
|
|
+ readAvgCiUpperSum += hisItem.getReadAvgCiUpper() * hisItem.getFirstViewCountRate();
|
|
|
+ } else {
|
|
|
+ readAvgCiUpperSum += hisItem.getReadAvgCiUpper();
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- double viewCountRate = 0D; // 设置默认值
|
|
|
- double bigRateW = 1D;
|
|
|
- // 如果有头条反馈数据,优先选取头条反馈数据;
|
|
|
- if (showViewCountSumFirst > 0) {
|
|
|
- showViewCountSum = showViewCountSumFirst;
|
|
|
- readAvgCiUpperSum = readAvgCiUpperSumFirst;
|
|
|
- } else if (showViewCountSumSecond > 0) {
|
|
|
- showViewCountSum = showViewCountSumSecond;
|
|
|
- readAvgCiUpperSum = readAvgCiUpperSumSecond;
|
|
|
- // 如果是大号头条,则降权
|
|
|
- if (avgViewCountFirst >= 3000 && i == 0) {
|
|
|
- bigRateW = 0.001D;
|
|
|
- }
|
|
|
- }
|
|
|
- // 均值倍数
|
|
|
- if (readAvgCiUpperSum > 0) {
|
|
|
- viewCountRate = showViewCountSum / readAvgCiUpperSum;
|
|
|
+ }
|
|
|
+ double viewCountRate = 0D; // default value
|
|
|
+ double bigRateW = 1D;
|
|
|
+ // Prefer headline feedback data when it exists
|
|
|
+ if (showViewCountSumFirst > 0) {
|
|
|
+ showViewCountSum = showViewCountSumFirst;
|
|
|
+ readAvgCiUpperSum = readAvgCiUpperSumFirst;
|
|
|
+ } else if (showViewCountSumSecond > 0) {
|
|
|
+ showViewCountSum = showViewCountSumSecond;
|
|
|
+ readAvgCiUpperSum = readAvgCiUpperSumSecond;
|
|
|
+ // Down-weight when this is the headline pool of a large account
|
|
|
+ if (avgViewCountFirst >= 3000 && i == 0) {
|
|
|
+ bigRateW = 0.001D;
|
|
|
}
|
|
|
- // 置信度
|
|
|
- double viewCountRateW = MathUtils.sigmoid(readAvgCiUpperSum, 0.002, avgViewCountPos);
|
|
|
- double viewCountRateScore = 0;
|
|
|
+ }
|
|
|
+ // Ratio of views to the mean
|
|
|
+ if (readAvgCiUpperSum > 0) {
|
|
|
+ viewCountRate = showViewCountSum / readAvgCiUpperSum;
|
|
|
+ }
|
|
|
+ // Confidence weight
|
|
|
+ double viewCountRateW = MathUtils.sigmoid(readAvgCiUpperSum, 0.002, avgViewCountPos);
|
|
|
+ double viewCountRateScore = 0;
|
|
|
|
|
|
- if (viewCountRate > 0) {
|
|
|
- // 最终分数 = 置信度 * 均值倍数
|
|
|
- if (viewCountRate > 1 && bigRateW < 1) {
|
|
|
- // 如果是大号头条,则降权
|
|
|
- viewCountRateScore = viewCountRateW * ((viewCountRate - 1) * bigRateW + 1);
|
|
|
- } else {
|
|
|
- viewCountRateScore = viewCountRateW * viewCountRate;
|
|
|
- }
|
|
|
+ if (viewCountRate > 0) {
|
|
|
+ // Final score = confidence weight * ratio to the mean
|
|
|
+ if (viewCountRate > 1 && bigRateW < 1) {
|
|
|
+ // Large-account headline: damp the part of the ratio above 1
|
|
|
+ viewCountRateScore = viewCountRateW * ((viewCountRate - 1) * bigRateW + 1);
|
|
|
+ } else {
|
|
|
+ viewCountRateScore = viewCountRateW * viewCountRate;
|
|
|
}
|
|
|
- Score score = new Score();
|
|
|
- score.setStrategy(this);
|
|
|
- score.setContentId(content.getId());
|
|
|
- score.setScore(viewCountRateScore);
|
|
|
- scores.add(score);
|
|
|
- break;
|
|
|
}
|
|
|
+ Score score = new Score();
|
|
|
+ score.setStrategy(this);
|
|
|
+ score.setContentId(content.getId());
|
|
|
+ score.setScore(viewCountRateScore);
|
|
|
+ return score;
|
|
|
}
|
|
|
- return scores;
|
|
|
+ return null;
|
|
|
}
|
|
|
}
|