wangyunpeng пре 7 месеци
родитељ
комит
d5f8a95206

+ 2 - 0
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/mapper/longArticle/LongArticleBaseMapper.java

@@ -34,4 +34,6 @@ public interface LongArticleBaseMapper {
     List<GetOffVideos> getGetOffVideos(List<Long> videoIds);
 
     List<LongArticlesMatchVideos> getLongArticlesMatchVideos(List<String> traceIds);
+
+    List<LongArticlesCrawlerVideos> getLongArticlesCrawlerVideos(List<String> contentIds); // existing long_article_crawler_videos rows for the given content ids; see the same-named <select> in LongArticleBaseMapper.xml, which also requires download_status = 2
 }

+ 8 - 0
long-article-recommend-service/src/main/java/com/tzld/longarticle/recommend/server/service/DataFlushService.java

@@ -113,6 +113,14 @@ public class DataFlushService {
         }
         if (!CollectionUtils.isEmpty(batchSaveList)) {
             for (List<LongArticlesCrawlerVideos> partition : Lists.partition(batchSaveList, 1000)) {
+                // Keep only rows whose content_id has fewer than 3 rows returned by getLongArticlesCrawlerVideos.
+                List<String> contentIds = partition.stream().map(LongArticlesCrawlerVideos::getContentId).distinct().collect(Collectors.toList());
+                Map<String, Long> existsCount = longArticleBaseMapper.getLongArticlesCrawlerVideos(contentIds).stream()
+                        .collect(Collectors.groupingBy(LongArticlesCrawlerVideos::getContentId, Collectors.counting()));
+                partition = partition.stream().filter(o -> existsCount.getOrDefault(o.getContentId(), 0L) < 3).collect(Collectors.toList());
+                if (partition.isEmpty()) { // everything was filtered out: do not hand an empty list to the batch insert
+                    continue;
+                }
                 longArticleBaseMapper.batchInsertLongArticlesCrawlerVideos(partition);
             }
         }

+ 3 - 3
long-article-recommend-service/src/main/resources/mapper/crawler/CrawlerBaseMapper.xml

@@ -13,7 +13,7 @@
         select count(1) from get_off_videos where publish_time > 1727239202
     </select>
     <select id="pageGetOffVideos" resultType="com.tzld.longarticle.recommend.server.model.dto.GetOffVideos">
-        select * from get_off_videos where publish_time > 1727239202 order by video_id limit #{offset}, #{pageSize}
+        select * from get_off_videos order by video_id limit #{offset}, #{pageSize}
     </select>
     <select id="countLongArticlesRootSourceId" resultType="java.lang.Integer">
         select count(1) from long_articles_root_source_id where requestTime > 1727192229
@@ -26,14 +26,14 @@
     </select>
     <select id="pageArticleMatchVideos"
             resultType="com.tzld.longarticle.recommend.server.model.dto.ArticleMatchVideos">
-        select * from article_match_videos where DATE(update_time) &gt;= '2024-09-20' order by video_id
+        select * from article_match_videos order by video_id
     </select>
     <select id="countLongArticlesVideos" resultType="java.lang.Integer">
         select count(1) from long_articles_video where DATE(update_time) &gt;= '2024-09-20' and success = 1 and content_status = 2
     </select>
     <select id="pageLongArticlesVideos"
             resultType="com.tzld.longarticle.recommend.server.model.dto.LongArticlesVideo">
-        select * from long_articles_video where DATE(update_time) &gt;= '2024-09-20' and success = 1 and content_status = 2 order by id limit #{offset}, #{pageSize}
+        select * from long_articles_video where success = 1 and content_status = 2 order by id limit #{offset}, #{pageSize}
     </select>
     <select id="getLongArticlesText" resultType="com.tzld.longarticle.recommend.server.model.dto.LongArticlesText">
         select content_id, kimi_title, kimi_summary, kimi_keys from long_articles_video where kimi_summary is not null group by content_id

+ 8 - 0
long-article-recommend-service/src/main/resources/mapper/longArticle/LongArticleBaseMapper.xml

@@ -123,6 +123,14 @@
             #{item}
         </foreach>
     </select>
+    <select id="getLongArticlesCrawlerVideos"
+            resultType="com.tzld.longarticle.recommend.server.model.dto.LongArticlesCrawlerVideos"> <!-- selects every column of the long_article_crawler_videos rows whose content_id is in contentIds and whose download_status is 2 -->
+        select * from long_article_crawler_videos where content_id in
+        <foreach collection="contentIds" item="item" open="(" close=")" separator=","> <!-- NOTE(review): an empty contentIds presumably renders an invalid IN clause; callers should pass at least one id (confirm) -->
+            #{item}
+        </foreach>
+        and download_status = 2
+    </select>
 
     <insert id="batchInsertLongArticlesMatchVideos" parameterType="list">
         INSERT INTO long_articles_match_videos (trace_id, content_id, flow_pool_level, gh_id, account_name,