Browse Source

数据迁移

wangyunpeng 9 months ago
parent
commit
dd9b0cf79c

+ 3 - 4
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/mapper/crawler/CrawlerBaseMapper.java

@@ -1,9 +1,6 @@
 package com.tzld.longarticle.recommend.server.mapper.crawler;
 
-import com.tzld.longarticle.recommend.server.model.dto.ArticleMatchVideos;
-import com.tzld.longarticle.recommend.server.model.dto.GetOffVideos;
-import com.tzld.longarticle.recommend.server.model.dto.LongArticlesRootSourceId;
-import com.tzld.longarticle.recommend.server.model.dto.LongArticlesVideo;
+import com.tzld.longarticle.recommend.server.model.dto.*;
 import com.tzld.longarticle.recommend.server.model.entity.crawler.AccountCorrelation;
 
 import java.util.List;
@@ -27,4 +24,6 @@ public interface CrawlerBaseMapper {
     Integer countLongArticlesVideos();
 
     List<LongArticlesVideo> pageLongArticlesVideos(int offset, int pageSize);
+
+    List<LongArticlesText> getLongArticlesText();
 }

+ 3 - 0
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/mapper/longArticle/LongArticleBaseMapper.java

@@ -25,4 +25,7 @@ public interface LongArticleBaseMapper {
 
     int batchInsertLongArticlesMatchVideos(List<LongArticlesMatchVideos> list);
 
+    List<LongArticlesText> getNeedUpdateRecords();
+
+    int updateLongArticlesText(LongArticlesText item);
 }

+ 27 - 2
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/service/DataFlushService.java

@@ -105,6 +105,9 @@ public class DataFlushService {
         if (pageNum == null) {
             pageNum = 1;
         }
+        List<LongArticlesText> kimiTitleList = crawlerBaseMapper.getLongArticlesText();
+        Map<String, LongArticlesText> kimiTitleMap =  kimiTitleList.stream().collect(
+                Collectors.toMap(LongArticlesText::getContentId, o -> o, (existing, replacement) -> replacement));
         int count = crawlerBaseMapper.countLongArticlesVideos();
         int totalPage = count / pageSize + 1;
         while (pageNum <= totalPage) {
@@ -125,8 +128,14 @@ public class DataFlushService {
                     if (StringUtils.hasText(video.getArticleText())) {
                         longArticlesText.setKimiTitle(video.getKimiTitle().replace("\"", ""));
                     }
-                    longArticlesText.setKimiSummary(video.getKimiSummary());
-                    longArticlesText.setKimiKeys(video.getKimiKeys());
+                    if (StringUtils.hasText(video.getKimiSummary())) {
+                        longArticlesText.setKimiSummary(video.getKimiSummary());
+                        longArticlesText.setKimiKeys(video.getKimiKeys());
+                    } else {
+                        LongArticlesText text = kimiTitleMap.get(video.getContentId());
+                        longArticlesText.setKimiSummary(text.getKimiSummary());
+                        longArticlesText.setKimiKeys(text.getKimiKeys());
+                    }
                     longArticlesText.setKimiStatus(1);
                     batchSaveLongArticlesTextList.add(longArticlesText);
                     existsIdSet.add(video.getContentId());
@@ -221,4 +230,20 @@ public class DataFlushService {
         }
         return null;
     }
+
+    public void flushLongArticlesText() {
+        List<LongArticlesText> kimiTitleList = crawlerBaseMapper.getLongArticlesText();
+        Map<String, LongArticlesText> kimiTitleMap =  kimiTitleList.stream().collect(
+                Collectors.toMap(LongArticlesText::getContentId, o -> o, (existing, replacement) -> replacement));
+        List<LongArticlesText> updateList = longArticleBaseMapper.getNeedUpdateRecords();
+        for (LongArticlesText update : updateList) {
+            LongArticlesText kimi = kimiTitleMap.get(update.getContentId());
+            if (Objects.nonNull(kimi)) {
+                update.setKimiTitle(kimi.getKimiTitle());
+                update.setKimiSummary(kimi.getKimiSummary());
+                update.setKimiKeys(kimi.getKimiKeys());
+                longArticleBaseMapper.updateLongArticlesText(update);
+            }
+        }
+    }
 }

+ 7 - 0
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/web/DataFlushController.java

@@ -39,5 +39,12 @@ public class DataFlushController {
             service.flushLongArticlesVideos(pageNum);
         }).start();
     }
+    @GetMapping("/flush/long_articles_text")
+    public void flushLongArticlesText() {
+        new Thread(() -> {
+            service.flushLongArticlesText();
+        }).start();
+    }
+
 
 }

+ 9 - 6
long-article-recommend-service/src/main/resources/mapper/crawler/CrawlerBaseMapper.xml

@@ -10,23 +10,23 @@
         </foreach>
     </insert>
     <select id="countGetOffVideos" resultType="java.lang.Integer">
-        select count(1) from get_off_videos
+        select count(1) from get_off_videos where publish_time > 1727239202
     </select>
     <select id="pageGetOffVideos" resultType="com.tzld.longarticle.recommend.server.model.dto.GetOffVideos">
-        select * from get_off_videos order by video_id limit #{offset}, #{pageSize}
+        select * from get_off_videos where publish_time > 1727239202 order by video_id limit #{offset}, #{pageSize}
     </select>
     <select id="countLongArticlesRootSourceId" resultType="java.lang.Integer">
-        select count(1) from long_articles_root_source_id
+        select count(1) from long_articles_root_source_id where rootSourceId > 'WeCom_ffddf52cc30880856d2408fa2b06ab64'
     </select>
     <select id="pageLongArticlesRootSourceId" resultType="com.tzld.longarticle.recommend.server.model.dto.LongArticlesRootSourceId">
-        select * from long_articles_root_source_id order by rootSourceId limit #{offset}, #{pageSize}
+        select * from long_articles_root_source_id where rootSourceId > 'WeCom_ffddf52cc30880856d2408fa2b06ab64' order by rootSourceId limit #{offset}, #{pageSize}
     </select>
     <select id="countArticleMatchVideos" resultType="java.lang.Integer">
-        select count(1) from article_match_videos where DATE(update_time) &lt; '2024-09-20'
+        select count(1) from article_match_videos where DATE(update_time) &gt;= '2024-09-20'
     </select>
     <select id="pageArticleMatchVideos"
             resultType="com.tzld.longarticle.recommend.server.model.dto.ArticleMatchVideos">
-        select * from article_match_videos where DATE(update_time) &lt; '2024-09-20' order by video_id
+        select * from article_match_videos where DATE(update_time) &gt;= '2024-09-20' order by video_id
     </select>
     <select id="countLongArticlesVideos" resultType="java.lang.Integer">
         select count(1) from long_articles_video where DATE(update_time) &gt;= '2024-09-20' and success = 1 and content_status = 2
@@ -35,5 +35,8 @@
             resultType="com.tzld.longarticle.recommend.server.model.dto.LongArticlesVideo">
         select * from long_articles_video where DATE(update_time) &gt;= '2024-09-20' and success = 1 and content_status = 2 order by id limit #{offset}, #{pageSize}
     </select>
+    <select id="getLongArticlesText" resultType="com.tzld.longarticle.recommend.server.model.dto.LongArticlesText">
+        select content_id, kimi_summary, kimi_keys from long_articles_video where kimi_summary is not null group by content_id
+    </select>
 
 </mapper>

+ 11 - 0
long-article-recommend-service/src/main/resources/mapper/longArticle/LongArticleBaseMapper.xml

@@ -83,6 +83,17 @@
             #{item}
         </foreach>
     </select>
+    <select id="getNeedUpdateRecords"
+            resultType="com.tzld.longarticle.recommend.server.model.dto.LongArticlesText">
+        select * from long_articles_text_copy1 where kimi_summary is null
+    </select>
+    <update id="updateLongArticlesText">
+        update long_articles_text_copy1
+        set kimi_title = #{kimiTitle},
+            kimi_summary = #{kimiSummary},
+            kimi_keys = #{kimiKeys}
+        where content_id = #{contentId}
+    </update>
     <insert id="batchInsertLongArticlesMatchVideos" parameterType="list">
         INSERT INTO long_articles_match_videos_copy1 (trace_id, content_id, flow_pool_level, gh_id, account_name,
                                                       content_status, success_status, request_timestamp, response,

+ 136 - 0
long-article-recommend-service/src/test/java/com/tzld/longarticle/recommend/server/RecommendTest.java

@@ -406,4 +406,140 @@ public class RecommendTest {
         }
     }
 
+
+    @Test
+    public void correlation() {
+        List<String> ghIds = Lists.newArrayList("gh_e24da99dc899",
+                "gh_183d80deffb8",
+                "gh_be8c29139989",
+                "gh_c69776baf2cd",
+                "gh_b15de7c99912",
+                "gh_1d887d61088c",
+                "gh_3ed305b5817f",
+                "gh_3e91f0624545",
+                "gh_30816d8adb52",
+                "gh_970460d9ccec",
+                "gh_749271f1ccd5",
+                "gh_ac43e43b253b"
+        );
+        List<PublishSortLog> sortLogList = publishSortLogRepository.findByGhIdInAndDateStrGreaterThanEqual(ghIds, "20240907");
+        sortLogList = sortLogList.stream().filter(o -> o.getIndex() == 1).collect(Collectors.toList());
+        sortLogList.sort(Comparator.comparing(PublishSortLog::getGhId).thenComparing(PublishSortLog::getDateStr));
+        List<Article> articleList = articleRepository.getByGhIdInAndUpdateTimeGreaterThanAndTypeEquals(ghIds, 1725638400L, "9");
+        articleList = articleList.stream().filter(o -> o.getItemIndex() == 1).collect(Collectors.toList());
+        Map<String, Map<String, Article>> articleMap = articleList.stream().collect(Collectors.groupingBy(Article::getGhId, Collectors.toMap(
+                o -> DateUtils.timestampToYMDStr(o.getUpdateTime(),"yyyyMMdd"), o -> o,
+                (existing, replacement) -> replacement)));
+        List<AccountAvgInfo> accountAvgInfoList = accountAvgInfoRepository.getAllByGhIdIn(new HashSet<>(ghIds));
+        Map<String, Map<String, AccountAvgInfo>> accountAvgInfoMap = accountAvgInfoList.stream()
+                .filter(o -> Objects.equals(o.getPosition(), "1")).collect(Collectors.groupingBy(AccountAvgInfo::getGhId,
+                        Collectors.toMap(AccountAvgInfo::getUpdateTime, o -> o)));
+        int rowNum = 0;
+        Map<String, List<PublishSortLog>> sortLogMap = sortLogList.stream().collect(Collectors.groupingBy(PublishSortLog::getGhId));
+        PearsonsCorrelation correlation = new PearsonsCorrelation();
+        Workbook workbook = new XSSFWorkbook();
+        Sheet sheet = workbook.createSheet("ExampleSheet");
+        // 创建标题行
+        Row titleRow = sheet.createRow(rowNum);
+        for (Map.Entry<String, List<PublishSortLog>> entry : sortLogMap.entrySet()) {
+            String ghId = entry.getKey();
+            String name = entry.getValue().get(0).getAccountName();
+            List<PublishSortLog> itemList = entry.getValue();
+            String title = "";
+            double[] scoreArr = new double[itemList.size()];
+            double[] HisFissionFansRateRateStrategyArr = new double[itemList.size()];
+            double[] HisFissionAvgReadRateRateStrategyArr = new double[itemList.size()];
+            double[] PublishTimesStrategyArr = new double[itemList.size()];
+            double[] ViewCountRateCorrelationStrategyArr = new double[itemList.size()];
+            double[] HisFissionAvgReadSumRateStrategyArr = new double[itemList.size()];
+            double[] HisFissionAvgReadRateCorrelationRateStrategyArr = new double[itemList.size()];
+            double[] HisFissionFansSumRateStrategyArr = new double[itemList.size()];
+            double[] SimilarityStrategyArr = new double[itemList.size()];
+            double[] ViewCountStrategyArr = new double[itemList.size()];
+            double[] ViewCountRateStrategyArr = new double[itemList.size()];
+            double[] HisFissionDeWeightAvgReadSumRateStrategyArr = new double[itemList.size()];
+            double[] scoreRateArr = new double[itemList.size()];
+            for (int i = 0; i < itemList.size(); i++) {
+                PublishSortLog publishSortLog = itemList.get(i);
+                Map<String, Article> dateArticleMap = articleMap.get(publishSortLog.getGhId());
+                Article article = dateArticleMap.get(publishSortLog.getDateStr());
+                if (Objects.isNull(article) || !publishSortLog.getTitle().equals(article.getTitle())) {
+                    continue;
+                }
+                if (publishSortLog.getTitle().equals(title)) {
+                    continue;
+                }
+                title = publishSortLog.getTitle();
+                scoreArr[i] = Double.parseDouble(publishSortLog.getScore());
+                JSONObject scoreMap = JSONObject.parseObject(publishSortLog.getScoreMap());
+                HisFissionFansRateRateStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("HisFissionFansRateRateStrategy")).orElse(0.0)));
+                HisFissionAvgReadRateRateStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("HisFissionAvgReadRateRateStrategy")).orElse(0.0)));
+                PublishTimesStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("PublishTimesStrategy")).orElse(0.0)));
+                ViewCountRateCorrelationStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("ViewCountRateCorrelationStrategy")).orElse(0.0)));
+                HisFissionAvgReadSumRateStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("HisFissionAvgReadSumRateStrategy")).orElse(0.0)));
+                HisFissionAvgReadRateCorrelationRateStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("HisFissionAvgReadRateCorrelationRateStrategy")).orElse(0.0)));
+                HisFissionFansSumRateStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("HisFissionFansSumRateStrategy")).orElse(0.0)));
+                SimilarityStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("SimilarityStrategy")).orElse(0.0)));
+                ViewCountStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("ViewCountStrategy")).orElse(0.0)));
+                ViewCountRateStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("ViewCountRateStrategy")).orElse(0.0)));
+                HisFissionDeWeightAvgReadSumRateStrategyArr[i] =  Double.parseDouble(String.format("%.3f", Optional.of(scoreMap.getDoubleValue("HisFissionDeWeightAvgReadSumRateStrategy")).orElse(0.0)));
+                Map<String, AccountAvgInfo> map = accountAvgInfoMap.get(article.getGhId());
+                if (Objects.nonNull(map)) {
+                    List<String> avgMapDateList = new ArrayList<>(map.keySet());
+                    String publishDate = DateUtils.findNearestDate(avgMapDateList,
+                            DateUtils.timestampToYMDStr(article.getUpdateTime(), "yyyy-MM-dd"), "yyyy-MM-dd");
+                    AccountAvgInfo accountAvgInfo = map.get(publishDate);
+                    if (Objects.nonNull(accountAvgInfo)) {
+                        scoreRateArr[i] = Double.parseDouble(String.format("%.3f", article.getShowViewCount() / (double) accountAvgInfo.getReadAvg()));
+                    }
+                }
+            }
+
+            rowNum++;
+            Row row = sheet.createRow(rowNum);
+            Cell cell = row.createCell(0);
+            cell = row.createCell(1);
+            cell.setCellValue(ghId);
+            cell = row.createCell(2);
+            cell.setCellValue(name);
+            cell = row.createCell(3);
+            cell = row.createCell(4);
+            cell = row.createCell(5);
+            cell.setCellValue(correlation.correlation(scoreArr, scoreRateArr));
+            cell = row.createCell(6);
+            cell.setCellValue(correlation.correlation(HisFissionFansRateRateStrategyArr, scoreRateArr));
+            cell = row.createCell(7);
+            cell.setCellValue(correlation.correlation(HisFissionAvgReadRateRateStrategyArr, scoreRateArr));
+            cell = row.createCell(8);
+            cell.setCellValue(correlation.correlation(PublishTimesStrategyArr, scoreRateArr));
+            cell = row.createCell(9);
+            cell.setCellValue(correlation.correlation(ViewCountRateCorrelationStrategyArr, scoreRateArr));
+            cell = row.createCell(10);
+            cell.setCellValue(correlation.correlation(HisFissionAvgReadSumRateStrategyArr, scoreRateArr));
+            cell = row.createCell(11);
+            cell.setCellValue(correlation.correlation(HisFissionAvgReadRateCorrelationRateStrategyArr, scoreRateArr));
+            cell = row.createCell(12);
+            cell.setCellValue(correlation.correlation(HisFissionFansSumRateStrategyArr, scoreRateArr));
+            cell = row.createCell(13);
+            cell.setCellValue(correlation.correlation(SimilarityStrategyArr, scoreRateArr));
+            cell = row.createCell(14);
+            cell.setCellValue(correlation.correlation(ViewCountStrategyArr, scoreRateArr));
+            cell = row.createCell(15);
+            cell.setCellValue(correlation.correlation(ViewCountRateStrategyArr, scoreRateArr));
+            cell = row.createCell(16);
+            cell.setCellValue(correlation.correlation(HisFissionDeWeightAvgReadSumRateStrategyArr, scoreRateArr));
+        }
+
+        try (FileOutputStream outputStream = new FileOutputStream("/Users/wangyunpeng/Downloads/example.xlsx")) {
+            workbook.write(outputStream);
+        } catch (IOException e) {
+            e.printStackTrace();
+        } finally {
+            try {
+                workbook.close();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+    }
 }