|
@@ -0,0 +1,244 @@
|
|
|
+package com.tzld.longarticle.recommend.server.service.recommend;
|
|
|
+
|
|
|
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
|
|
+import com.tzld.longarticle.recommend.server.common.CommonThreadPoolExecutor;
|
|
|
+import com.tzld.longarticle.recommend.server.common.enums.aigc.PublishContentTypeEnum;
|
|
|
+import com.tzld.longarticle.recommend.server.mapper.aigc.AigcBaseMapper;
|
|
|
+import com.tzld.longarticle.recommend.server.mapper.crawler.CrawlerBaseMapper;
|
|
|
+import com.tzld.longarticle.recommend.server.model.dto.CrawlerContent;
|
|
|
+import com.tzld.longarticle.recommend.server.model.entity.aigc.PublishAccount;
|
|
|
+import com.tzld.longarticle.recommend.server.model.entity.aigc.PublishContent;
|
|
|
+import com.tzld.longarticle.recommend.server.model.entity.aigc.PublishContentOutput;
|
|
|
+import com.tzld.longarticle.recommend.server.model.entity.crawler.Article;
|
|
|
+import com.tzld.longarticle.recommend.server.model.param.ArticleFindSourceParam;
|
|
|
+import com.tzld.longarticle.recommend.server.repository.aigc.PublishAccountRepository;
|
|
|
+import com.tzld.longarticle.recommend.server.repository.aigc.PublishContentOutputRepository;
|
|
|
+import com.tzld.longarticle.recommend.server.repository.crawler.AccountAvgInfoRepository;
|
|
|
+import com.tzld.longarticle.recommend.server.repository.crawler.ArticleRepository;
|
|
|
+import com.tzld.longarticle.recommend.server.service.recommend.config.AccountIndexAvgViewCountService;
|
|
|
+import com.tzld.longarticle.recommend.server.util.DateUtils;
|
|
|
+import com.tzld.longarticle.recommend.server.util.TitleSimilarCheckUtil;
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.apache.commons.collections4.CollectionUtils;
|
|
|
+import org.springframework.beans.factory.annotation.Autowired;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+import org.springframework.util.StringUtils;
|
|
|
+
|
|
|
+import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.Objects;
|
|
|
+import java.util.concurrent.*;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+/**
|
|
|
+ * @author dyp
|
|
|
+ */
|
|
|
+@Service
|
|
|
+@Slf4j
|
|
|
+public class ArticleService {
|
|
|
+
|
|
|
+ @Autowired
|
|
|
+ AccountIndexAvgViewCountService accountIndexAvgViewCountService;
|
|
|
+ @Autowired
|
|
|
+ AccountAvgInfoRepository accountAvgInfoRepository;
|
|
|
+ @Autowired
|
|
|
+ ArticleRepository articleRepository;
|
|
|
+ @Autowired
|
|
|
+ PublishContentOutputRepository publishContentOutputRepository;
|
|
|
+ @Autowired
|
|
|
+ PublishAccountRepository publishAccountRepository;
|
|
|
+ @Autowired
|
|
|
+ AigcBaseMapper aigcBaseMapper;
|
|
|
+ @Autowired
|
|
|
+ CrawlerBaseMapper crawlerBaseMapper;
|
|
|
+
|
|
|
+ private final static ExecutorService pool = new CommonThreadPoolExecutor(
|
|
|
+ 32,
|
|
|
+ 128,
|
|
|
+ 0L, TimeUnit.SECONDS,
|
|
|
+ new LinkedBlockingQueue<>(1000),
|
|
|
+ new ThreadFactoryBuilder().setNameFormat("DEFAULT-%d").build(),
|
|
|
+ new ThreadPoolExecutor.AbortPolicy());
|
|
|
+
|
|
|
+ public void findSource(ArticleFindSourceParam param) {
|
|
|
+ if (StringUtils.hasText(param.getDateStr())) {
|
|
|
+ long minUpdateTimestamp = DateUtils.dateStrToTimestamp(param.getDateStr(), "yyyyMMdd") - 86400 * 7;
|
|
|
+ } else {
|
|
|
+ long minUpdateTimestamp = DateUtils.getTodayStart() - 86400 * 7;
|
|
|
+ }
|
|
|
+ long minUpdateTimestamp = 1704081913L;
|
|
|
+ while (true) {
|
|
|
+ List<Article> articleList = crawlerBaseMapper.getWaitingFindArticle(minUpdateTimestamp);
|
|
|
+ if (CollectionUtils.isEmpty(articleList)) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ CountDownLatch cdl = new CountDownLatch(articleList.size());
|
|
|
+ for (Article article : articleList) {
|
|
|
+ pool.execute(() -> {
|
|
|
+ try {
|
|
|
+ syncAigcIdByWxSn(article.getWxSn());
|
|
|
+ } finally {
|
|
|
+ cdl.countDown();
|
|
|
+ }
|
|
|
+ });
|
|
|
+ minUpdateTimestamp = minUpdateTimestamp > article.getUpdateTime() ? minUpdateTimestamp : article.getUpdateTime();
|
|
|
+ }
|
|
|
+ try {
|
|
|
+ cdl.await();
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("cdl error", e);
|
|
|
+ }
|
|
|
+ log.info("findSource timestamp:{}", minUpdateTimestamp);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void syncAigcIdByWxSn(String wxSn) {
|
|
|
+ Article article = articleRepository.getByWxSn(wxSn);
|
|
|
+ String ghId = article.getGhId();
|
|
|
+ String title = article.getTitle();
|
|
|
+ PublishAccount publishAccount = publishAccountRepository.getByGhId(ghId);
|
|
|
+ List<PublishContent> publishContentList = aigcBaseMapper.getNearestPublishContent(publishAccount.getId(), null);
|
|
|
+ if (CollectionUtils.isEmpty(publishContentList)) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ log.info("syncAigcIdByWxSn publishContentList finish");
|
|
|
+ Map<String, PublishContent> publishContentMap = publishContentList.stream().collect(
|
|
|
+ Collectors.toMap(PublishContent::getId, publishContent -> publishContent));
|
|
|
+ List<String> publishContentIds = publishContentList.stream().map(PublishContent::getId).collect(Collectors.toList());
|
|
|
+ List<PublishContentOutput> publishContentOutputList = publishContentOutputRepository.
|
|
|
+ getByPublishContentIdInAndContentTypeAndSelectStatus(publishContentIds, PublishContentTypeEnum.title.getVal(), 1);
|
|
|
+ Map<String, List<PublishContentOutput>> publishContentOutputMap = publishContentOutputList.stream().collect(
|
|
|
+ Collectors.groupingBy(PublishContentOutput::getOutput));
|
|
|
+ log.info("syncAigcIdByWxSn publishContentOutputList finish");
|
|
|
+ List<String> titles = publishContentOutputList.stream().map(PublishContentOutput::getOutput).collect(Collectors.toList());
|
|
|
+ String publishContentId = null;
|
|
|
+ String channelContentId = null;
|
|
|
+ if (titles.contains(title)) {
|
|
|
+ PublishContent publishContent = getPublishContentByTitle(publishContentOutputMap,
|
|
|
+ publishContentMap, title, article.getUpdateTime() * 1000);
|
|
|
+ publishContentId = publishContent.getId();
|
|
|
+ channelContentId = publishContent.getCrawlerChannelContentId();
|
|
|
+ } else {
|
|
|
+ for (String aTitle : titles) {
|
|
|
+ if (TitleSimilarCheckUtil.isSimilar(title, aTitle, TitleSimilarCheckUtil.SIMILARITY_THRESHOLD)) {
|
|
|
+ PublishContent publishContent = getPublishContentByTitle(publishContentOutputMap,
|
|
|
+ publishContentMap, aTitle, article.getUpdateTime() * 1000);
|
|
|
+ publishContentId = publishContent.getId();
|
|
|
+ channelContentId = publishContent.getCrawlerChannelContentId();
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.info("syncAigcIdByWxSn titleMatch finish");
|
|
|
+ if (Objects.isNull(channelContentId)) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ // 更新 official_article_v2
|
|
|
+ crawlerBaseMapper.updateArticleAigcId(wxSn, publishContentId, channelContentId);
|
|
|
+ // 查找记录根记录
|
|
|
+ long start = System.currentTimeMillis();
|
|
|
+ Article result = getRootPublishContent(channelContentId, null, publishContentId, 0);
|
|
|
+ log.info("syncAigcIdByWxSn getRootPublishContent finish cost:{}", System.currentTimeMillis() - start);
|
|
|
+ // 更新source root publish_content_id
|
|
|
+ crawlerBaseMapper.updateArticleSourceRootId(wxSn, result.getSourcePublishContentId(), result.getRootPublishContentId());
|
|
|
+ }
|
|
|
+
|
|
|
+ private PublishContent getPublishContentByTitle(Map<String, List<PublishContentOutput>> publishContentOutputMap,
|
|
|
+ Map<String, PublishContent> publishContentMap,
|
|
|
+ String title,
|
|
|
+ Long publishTimestamp) {
|
|
|
+ List<PublishContentOutput> outputList = publishContentOutputMap.get(title);
|
|
|
+ List<PublishContent> publishContents = outputList.stream().map(o -> publishContentMap.get(o.getPublishContentId()))
|
|
|
+ .collect(Collectors.toList());
|
|
|
+ return getNearestContent(publishContents, publishTimestamp);
|
|
|
+ }
|
|
|
+
|
|
|
+ private PublishContent getNearestContent(List<PublishContent> publishContents, Long publishTimestamp) {
|
|
|
+ if (publishContents.size() == 1) {
|
|
|
+ return publishContents.get(0);
|
|
|
+ }
|
|
|
+ PublishContent result = null;
|
|
|
+ Long nearest = 0L;
|
|
|
+ for (PublishContent publishContent : publishContents) {
|
|
|
+ Long timestamp = publishContent.getPublishTimestamp();
|
|
|
+ if (Objects.isNull(result)) {
|
|
|
+ result = publishContent;
|
|
|
+ nearest = timestamp;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (Math.abs(timestamp - publishTimestamp) < Math.abs(nearest - publishTimestamp)) {
|
|
|
+ result = publishContent;
|
|
|
+ nearest = timestamp;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ public Article getRootPublishContent(String channelContentId, String sourcePublishContentId, String rootPublishContentId,
|
|
|
+ int times) {
|
|
|
+ Article result = new Article();
|
|
|
+ result.setSourcePublishContentId(sourcePublishContentId);
|
|
|
+ result.setRootPublishContentId(rootPublishContentId);
|
|
|
+ if (times > 20) {
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+ List<Article> articleList = articleRepository.getByChannelContentIdAndRootPublishContentIdIsNotNull(channelContentId);
|
|
|
+ if (CollectionUtils.isNotEmpty(articleList)) {
|
|
|
+ if (!StringUtils.hasText(sourcePublishContentId)) {
|
|
|
+ result.setSourcePublishContentId(articleList.get(0).getSourcePublishContentId());
|
|
|
+ }
|
|
|
+ result.setRootPublishContentId(articleList.get(0).getRootPublishContentId());
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+ result.setRootPublishContentId(rootPublishContentId);
|
|
|
+ CrawlerContent crawlerContent = aigcBaseMapper.getCrawlerContentByChannelContentId(channelContentId);
|
|
|
+ if (Objects.isNull(crawlerContent) || !StringUtils.hasText(crawlerContent.getGhId())) {
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+ PublishAccount publishAccount = publishAccountRepository.getByGhId(crawlerContent.getGhId());
|
|
|
+ if (Objects.isNull(publishAccount)) {
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+ String title = crawlerContent.getTitle();
|
|
|
+ List<PublishContent> publishContentList = aigcBaseMapper.getNearestPublishContent(publishAccount.getId(), null);
|
|
|
+ if (CollectionUtils.isEmpty(publishContentList)) {
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+ Map<String, PublishContent> publishContentMap = publishContentList.stream().collect(
|
|
|
+ Collectors.toMap(PublishContent::getId, publishContent -> publishContent));
|
|
|
+ List<String> publishContentIds = publishContentList.stream().map(PublishContent::getId).collect(Collectors.toList());
|
|
|
+ List<PublishContentOutput> publishContentOutputList = publishContentOutputRepository.
|
|
|
+ getByPublishContentIdInAndContentTypeAndSelectStatus(publishContentIds, PublishContentTypeEnum.title.getVal(), 1);
|
|
|
+ Map<String, List<PublishContentOutput>> publishContentOutputMap = publishContentOutputList.stream().collect(
|
|
|
+ Collectors.groupingBy(PublishContentOutput::getOutput));
|
|
|
+ List<String> titles = publishContentOutputList.stream().map(PublishContentOutput::getOutput).collect(Collectors.toList());
|
|
|
+ if (titles.contains(title)) {
|
|
|
+ PublishContent publishContent = getPublishContentByTitle(publishContentOutputMap,
|
|
|
+ publishContentMap, title, crawlerContent.getPublishTimestamp());
|
|
|
+ if (!StringUtils.hasText(sourcePublishContentId)) {
|
|
|
+ result.setSourcePublishContentId(publishContent.getId());
|
|
|
+ }
|
|
|
+ result.setRootPublishContentId(publishContent.getId());
|
|
|
+ channelContentId = publishContent.getCrawlerChannelContentId();
|
|
|
+ } else {
|
|
|
+ for (String aTitle : titles) {
|
|
|
+ if (TitleSimilarCheckUtil.isSimilar(title, aTitle, TitleSimilarCheckUtil.SIMILARITY_THRESHOLD)) {
|
|
|
+ PublishContent publishContent = getPublishContentByTitle(publishContentOutputMap,
|
|
|
+ publishContentMap, aTitle, crawlerContent.getPublishTimestamp());
|
|
|
+ if (!StringUtils.hasText(sourcePublishContentId)) {
|
|
|
+ result.setSourcePublishContentId(publishContent.getId());
|
|
|
+ }
|
|
|
+ result.setRootPublishContentId(publishContent.getId());
|
|
|
+ channelContentId = publishContent.getCrawlerChannelContentId();
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (channelContentId.equals(crawlerContent.getChannelContentId())) {
|
|
|
+ return result;
|
|
|
+ } else {
|
|
|
+ return getRootPublishContent(channelContentId, result.getSourcePublishContentId(), result.getRootPublishContentId(), ++times);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+}
|