Преглед на файлове

文章匹配小程序优化

xueyiming преди 5 месеца
родител
ревизия
0ba9c020d4

+ 30 - 0
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/component/RedisLock.java

@@ -0,0 +1,30 @@
+package com.tzld.piaoquan.longarticle.component;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.data.redis.core.RedisTemplate;
+import org.springframework.data.redis.core.ValueOperations;
+import org.springframework.data.redis.core.script.DefaultRedisScript;
+import org.springframework.stereotype.Component;
+
+import java.util.Collections;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Small Redis-backed lock helper.
+ *
+ * <p>A lock is a single Redis key whose value identifies the holder. It is acquired with a
+ * set-if-absent that carries an expiry, and released only when the stored value still matches
+ * the caller's value.
+ */
+@Component
+public class RedisLock {
+
+    /**
+     * Compare-and-delete executed inside Redis as one script, so the ownership check and the
+     * delete cannot be separated by the key expiring and another client taking the lock.
+     * Returns the number of keys deleted (0 when the value did not match).
+     */
+    private static final DefaultRedisScript<Long> UNLOCK_SCRIPT = new DefaultRedisScript<>(
+            "if redis.call('get', KEYS[1]) == ARGV[1] then "
+                    + "return redis.call('del', KEYS[1]) else return 0 end",
+            Long.class);
+
+    @Autowired
+    private RedisTemplate<String, Object> redisTemplate;
+
+    /**
+     * Tries to take the lock without waiting.
+     *
+     * @param lockKey    Redis key that represents the lock
+     * @param lockValue  value stored under the key; pass the same value to {@link #unlock}
+     * @param expireTime time after which Redis drops the key on its own
+     * @param timeUnit   unit of {@code expireTime}
+     * @return {@code true} only if the key was absent and has now been set by this call
+     */
+    public boolean tryLock(String lockKey, String lockValue, long expireTime, TimeUnit timeUnit) {
+        ValueOperations<String, Object> valueOps = redisTemplate.opsForValue();
+        // setIfAbsent stores the pair only when the key does not exist and reports whether it did;
+        // the result may be null, which is treated as "not acquired".
+        Boolean result = valueOps.setIfAbsent(lockKey, lockValue, expireTime, timeUnit);
+        return Boolean.TRUE.equals(result);
+    }
+
+    /**
+     * Releases the lock if, and only if, it is still held with {@code lockValue}.
+     *
+     * <p>The script arguments go through the template's own key/value serializers, i.e. the same
+     * ones used when {@link #tryLock} stored the value, so the comparison is done on identical
+     * serialized forms.
+     *
+     * @param lockKey   Redis key that represents the lock
+     * @param lockValue value that was passed to {@link #tryLock}
+     */
+    public void unlock(String lockKey, String lockValue) {
+        // Only the holder may release the lock; check and delete happen atomically in the script.
+        redisTemplate.execute(UNLOCK_SCRIPT, Collections.singletonList(lockKey), lockValue);
+    }
+}

+ 18 - 5
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/service/local/impl/CrawlerVideoServiceImpl.java

@@ -33,6 +33,8 @@ public class CrawlerVideoServiceImpl {
 
     private static final Double NLP_SIMILARITY_THRESHOLD = 0.45;
 
+    private static final int MAX_NUM = 3;
+
     private static final List<String> sensitiveWords = new ArrayList<String>() {{
         add("人民");
         add("必胜");
@@ -47,21 +49,26 @@ public class CrawlerVideoServiceImpl {
     }};
 
     public void addCrawlerVideo(String contentId, LongArticlesText kimiText) {
+        // Searches the web for videos matching kimiText, stores them for contentId and then
+        // triggers pushOss. Does nothing when existCrawlerVideo already reports enough videos.
+        boolean b = existCrawlerVideo(contentId);
+        if (b) {
+            return;
+        }
         List<CrawlerVideo> crawlerVideoList = searchVideosFromWeb(kimiText);
         if (!CollectionUtils.isEmpty(crawlerVideoList)) {
             for (CrawlerVideo crawlerVideo : crawlerVideoList) {
                 crawlerVideo.setContentId(contentId);
-                crawlerVideo.setStatus(0);
+                // Download status 0 is the value pushOss filters on when it picks rows to handle.
+                crawlerVideo.setDownloadStatus(0);
                 crawlerVideoMapper.insertSelective(crawlerVideo);
             }
             pushOss(contentId);
         }
     }
 
-    public long countCrawlerVideo(String contentId) {
+    /**
+     * Tells whether the content already has at least {@code MAX_NUM} crawler videos with
+     * download status 2 (the value pushOss writes right before counting a video).
+     *
+     * @param contentId content whose crawler videos are counted
+     * @return {@code true} when the count reaches {@code MAX_NUM}
+     */
+    public boolean existCrawlerVideo(String contentId) {
         CrawlerVideoExample example = new CrawlerVideoExample();
-        example.createCriteria().andContentIdEqualTo(contentId).andStatusEqualTo(2);
-        return crawlerVideoMapper.countByExample(example);
+        example.createCriteria()
+                .andContentIdEqualTo(contentId)
+                .andDownloadStatusEqualTo(2);
+        return crawlerVideoMapper.countByExample(example) >= MAX_NUM;
     }
 
 
@@ -194,10 +201,12 @@ public class CrawlerVideoServiceImpl {
     public void pushOss(String contentId) {
         CrawlerVideoExample example = new CrawlerVideoExample();
         example.createCriteria().andContentIdEqualTo(contentId).andDownloadStatusEqualTo(0);
+        example.setOrderByClause("score desc");
         List<CrawlerVideo> crawlerVideoList = crawlerVideoMapper.selectByExampleWithBLOBs(example);
         if (CollectionUtils.isEmpty(crawlerVideoList)) {
             return;
         }
+        int count = 0;
         for (CrawlerVideo crawlerVideo : crawlerVideoList) {
             String platform = crawlerVideo.getPlatform();
             String outVideoId = crawlerVideo.getOutVideoId();
@@ -213,6 +222,7 @@ public class CrawlerVideoServiceImpl {
                     udpateCrawlerVideo.setId(crawlerVideo.getId());
                     udpateCrawlerVideo.setDownloadStatus(2);
                     crawlerVideoMapper.updateByPrimaryKeySelective(udpateCrawlerVideo);
+                    count++;
                     try {
                         Files.delete(Paths.get(videoPath));
                         Files.delete(Paths.get(coverPath));
@@ -223,10 +233,13 @@ public class CrawlerVideoServiceImpl {
                     //下载失败
                     CrawlerVideo udpateCrawlerVideo = new CrawlerVideo();
                     udpateCrawlerVideo.setId(crawlerVideo.getId());
-                    udpateCrawlerVideo.setDownloadStatus(1);
+                    udpateCrawlerVideo.setDownloadStatus(3);
                     crawlerVideoMapper.updateByPrimaryKeySelective(udpateCrawlerVideo);
                 }
             }
+            if (count >= MAX_NUM) {
+                break;
+            }
         }
     }
 

+ 115 - 35
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/service/local/impl/MatchVideoServiceImpl.java

@@ -2,6 +2,7 @@ package com.tzld.piaoquan.longarticle.service.local.impl;
 
 import cn.hutool.json.JSONObject;
 import com.alibaba.fastjson.JSONArray;
+import com.tzld.piaoquan.longarticle.component.RedisLock;
 import com.tzld.piaoquan.longarticle.dao.mapper.CrawlerVideoMapper;
 import com.tzld.piaoquan.longarticle.dao.mapper.LongArticlesTextMapper;
 import com.tzld.piaoquan.longarticle.dao.mapper.MatchVideoMapper;
@@ -11,18 +12,25 @@ import com.tzld.piaoquan.longarticle.model.vo.MatchVideoVo;
 import com.tzld.piaoquan.longarticle.service.local.KimiService;
 import com.tzld.piaoquan.longarticle.utils.*;
 import com.tzld.piaoquan.longarticle.utils.other.*;
+import com.tzld.piaoquan.longarticle.utils.page.Page;
 import org.apache.commons.lang3.StringUtils;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.data.redis.core.RedisTemplate;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
 import org.springframework.util.CollectionUtils;
 
 import java.util.*;
+import java.util.concurrent.TimeUnit;
 
 @Service
 public class MatchVideoServiceImpl {
 
-    private static final int MAX_NUM = 3;
+
+    private static final String KIMI_LOCK_KEY = "kimi_lock_key_%s";
+
+    private static final String CRAWLER_LOCK_KEY = "crawler_lock_key_%s";
+
 
     @Autowired
     KimiService kimiService;
@@ -30,15 +38,18 @@ public class MatchVideoServiceImpl {
     @Autowired
     private MatchVideoMapper matchVideoMapper;
 
-    @Autowired
-    private CrawlerVideoMapper crawlerVideoMapper;
-
     @Autowired
     private LongArticlesTextMapper longArticlesTextMapper;
 
     @Autowired
     private CrawlerVideoServiceImpl crawlerVideoService;
 
+    @Autowired
+    private RedisLock redisLock;
+
+    @Autowired
+    private RedisTemplate<String, Object> redisTemplate;
+
     @Transactional
     public void addMatchVideo(MatchContent matchContent) {
         MatchVideoExample example = new MatchVideoExample();
@@ -53,9 +64,9 @@ public class MatchVideoServiceImpl {
         matchVideo.setContentId(matchContent.getSourceId());
         matchVideo.setAccountName(matchContent.getAccountName());
         matchVideo.setGhId(matchContent.getGhId());
-        matchVideo.setFlowPoolLevel(matchVideo.getFlowPoolLevel());
-        matchVideo.setProcessTimes(1);
+        matchVideo.setFlowPoolLevel(matchContent.getFlowPoolLevelTag());
         matchVideo.setContentStatus(0);
+        matchVideo.setPublishFlag(2);
         long timestamp = System.currentTimeMillis() / 1000;
         matchVideo.setContentStatusUpdateTime(Long.valueOf(timestamp).intValue());
         matchVideo.setRequestTimestamp(Long.valueOf(timestamp).intValue());
@@ -73,47 +84,116 @@ public class MatchVideoServiceImpl {
         }
     }
 
-    public void matchContent() {
-        CrawlerVideoExample example = new CrawlerVideoExample();
-        example.createCriteria().andDownloadStatusEqualTo(0);
-        crawlerVideoMapper.selectByExample(example);
+    public void getMatchVideo() {
+        List<Integer> targetStatus = new ArrayList<Integer>() {{
+            add(0);
+            add(1);
+            add(2);
+        }};
+        List<MatchVideo> matchVideos = null;
+        //循环增加游标位置,通过主键索引过滤已经处理的数据
+        do {
+            Integer id = (Integer) redisTemplate.opsForValue().get("last_match_video_id");
+            if (id == null) {
+                id = 0;
+            }
+            MatchVideoExample example = new MatchVideoExample();
+            example.createCriteria().andIdGreaterThan(id);
+            example.setOrderByClause("id asc");
+            Page page = new Page<>();
+            page.setCurrentPage(1);
+            page.setPageSize(5000);
+            example.setPage(page);
+            matchVideos = matchVideoMapper.selectByExample(example);
+            boolean flag = true;
+            for (MatchVideo matchVideo : matchVideos) {
+                if (targetStatus.contains(matchVideo.getContentStatus())) {
+                    flag = false;
+                    break;
+                }
+            }
+            if (flag) {
+                Integer lastId = matchVideos.get(matchVideos.size() - 1).getId();
+                redisTemplate.opsForValue().set("last_match_video_id", lastId);
+            }
+        } while (CollectionUtils.isEmpty(matchVideos));
+
+        int pageNum = 1;
+        do {
+            Integer id = (Integer) redisTemplate.opsForValue().get("last_match_video_id");
+            if (id == null) {
+                break;
+            }
+
+            int pageSize = 5000;
+            MatchVideoExample example = new MatchVideoExample();
+            example.createCriteria().andIdGreaterThan(id).andContentStatusIn(targetStatus);
+            example.setOrderByClause("id asc");
+            Page page = new Page<>();
+            page.setCurrentPage(pageNum);
+            page.setPageSize(pageSize);
+            example.setPage(page);
+            matchVideos = matchVideoMapper.selectByExample(example);
+            for (MatchVideo matchVideo : matchVideos) {
+                //TODO 加入等待队列 多线程处理
+            }
+            pageNum++;
+        } while (CollectionUtils.isEmpty(matchVideos));
+
     }
 
 
     public void processMatchContent(MatchVideo matchVideo) {
-        //1.执行kimi任务
-        LongArticlesText kimiText = kimiService.getKimiText(matchVideo.getContentId());
-        if (kimiText == null) {
-            //TODO 查询信息重新生成kimi信息
-            return;
-        }
-        if (kimiText.getKimiStatus() == 0) {
-            //TODO 加锁
-            kimiText = kimiService.getAndUpdateContent(matchVideo.getContentId());
+        // Moves one MatchVideo forward: runs the kimi task while the content status is 0, then
+        // either starts the crawler for the content or, once enough crawler videos exist,
+        // records status 3. Both external steps run under a per-content Redis lock, and each
+        // lock is released in a finally block so a failing call cannot leave it held until its
+        // expiry.
+        if (matchVideo.getContentStatus() == 0) {
+            // Step 1: run the kimi task
+            LongArticlesText kimiText = kimiService.getKimiText(matchVideo.getContentId());
             if (kimiText == null) {
-                //TODO kimi结果获取失败
+                // TODO: query the source information and regenerate the kimi record
                 return;
             }
+            // Run the kimi task when its status is still 0
+            if (kimiText.getKimiStatus() == 0) {
+                String lockKey = String.format(KIMI_LOCK_KEY, matchVideo.getContentId());
+                String lockValue = UUID.randomUUID().toString();
+                if (!redisLock.tryLock(lockKey, lockValue, 300, TimeUnit.SECONDS)) {
+                    // Lock not acquired: stop processing this row for now
+                    return;
+                }
+                try {
+                    kimiText = kimiService.getAndUpdateContent(matchVideo.getContentId());
+                } finally {
+                    redisLock.unlock(lockKey, lockValue);
+                }
+                if (kimiText == null) {
+                    return;
+                }
+            } else {
+                // Update the status to "kimi finished"
+                updateStatus(matchVideo.getId(), 1);
+            }
         }
-        boolean existCrawlerVideo = existCrawlerVideo(matchVideo.getContentId());
-        if(!existCrawlerVideo){
-
-        }
-
-        //2.执行爬虫任务
-        int retry = 0;
-        long count = crawlerVideoService.countCrawlerVideo(matchVideo.getContentId());
-        if (count < 3) {
-            crawlerVideoService.addCrawlerVideo(matchVideo.getContentId(), kimiText);
+        boolean existCrawlerVideo = crawlerVideoService.existCrawlerVideo(matchVideo.getContentId());
+        if (!existCrawlerVideo) {
+            String lockKey = String.format(CRAWLER_LOCK_KEY, matchVideo.getContentId());
+            String lockValue = UUID.randomUUID().toString();
+            if (redisLock.tryLock(lockKey, lockValue, 600, TimeUnit.SECONDS)) {
+                try {
+                    LongArticlesText kimiText = kimiService.getKimiText(matchVideo.getContentId());
+                    crawlerVideoService.addCrawlerVideo(matchVideo.getContentId(), kimiText);
+                } finally {
+                    redisLock.unlock(lockKey, lockValue);
+                }
+            }
+        } else {
+            // Update the status to "etl finished"
+            updateStatus(matchVideo.getId(), 3);
         }
     }
 
-    public boolean existCrawlerVideo(String contentId) {
-        CrawlerVideoExample example = new CrawlerVideoExample();
-        example.createCriteria().andContentIdEqualTo(contentId).andDownloadStatusEqualTo(2);
-        long l = crawlerVideoMapper.countByExample(example);
-        return l >= MAX_NUM;
+    /**
+     * Writes a new content status for one match-video row via
+     * {@code updateByPrimaryKeySelective}.
+     *
+     * @param id     primary key of the row to update
+     * @param status content status to store
+     */
+    private void updateStatus(Integer id, Integer status) {
+        MatchVideo statusPatch = new MatchVideo();
+        statusPatch.setContentStatus(status);
+        statusPatch.setId(id);
+        matchVideoMapper.updateByPrimaryKeySelective(statusPatch);
     }
+
+
 }
 
 

+ 9 - 5
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/DouyinSearch.java

@@ -33,7 +33,7 @@ public class DouyinSearch {
     public static void main(String[] args) {
         List<String> sensitiveWords = new ArrayList<>();
         List<JSONObject> list = douyinSearch("北京女孩与父亲的法律纠纷引发对", sensitiveWords, "");
-        for (JSONObject jsonObject : list){
+        for (JSONObject jsonObject : list) {
             System.out.println(jsonObject);
             CrawlerVideo crawlerVideo = dyVideoProduce(jsonObject);
             System.out.println(crawlerVideo);
@@ -59,7 +59,6 @@ public class DouyinSearch {
         try {
             JSONObject jsonResponse = JSONObject.parseObject(response.body());
             JSONArray dtList = jsonResponse.getJSONObject("data").getJSONArray("data");
-//            List<JSONObject> dtList = jsonResponse.getByPath("data.data", List.class);
             for (int i = 0; i < dtList.size(); i++) {
                 JSONObject obj = dtList.getJSONObject(i);
                 try {
@@ -67,7 +66,7 @@ public class DouyinSearch {
                     String videoId = obj.getString("video_id");
                     int duration = obj.getInteger("duration");
 
-                    if (sensitiveFlag(sensitiveWords.toString(), title) && duration < 30000) {
+                    if (sensitiveFlag(sensitiveWords, title) && duration < 30000) {
                         JSONObject res = douyinDetail(videoId);
                         if (res != null) {
                             resultList.add(res);
@@ -103,9 +102,14 @@ public class DouyinSearch {
         }
     }
 
-    private static boolean sensitiveFlag(String sensitiveWords, String title) {
+    private static boolean sensitiveFlag(List<String> sensitiveWords, String title) {
         // 实现敏感词检查逻辑
-        return true; // 示例
+        // Sensitive-word check: true means the title is clean, i.e. it contains none of the
+        // given words; false as soon as one word is found in the title.
+        return sensitiveWords.stream().noneMatch(word -> title.contains(word));
     }
 
 }

+ 8 - 2
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/HkspSearch.java

@@ -76,7 +76,7 @@ public class HkspSearch {
                     .header("accept", "*/*")
                     .header("accept-language", "zh,en;q=0.9,zh-CN;q=0.8")
                     .header("cookie", "BIDUPSID=" + base64String)
-                    .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0") // 假用户代理
+                    .header("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
                     .header("x-requested-with", "xmlhttprequest")
                     .timeout(120000) // 设置超时时间
                     .setProxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("l901.kdltps.com", 15818)))
@@ -132,7 +132,12 @@ public class HkspSearch {
 
     private static boolean sensitiveFlag(List<String> sensitiveWords, String title) {
         // 实现敏感词检查逻辑
-        return true; // 示例
+        // Sensitive-word check: true means the title is clean, i.e. it contains none of the
+        // given words; false as soon as one word is found in the title.
+        return sensitiveWords.stream().noneMatch(word -> title.contains(word));
     }
 
     public static JSONObject getVideoDetail(String videoId) {
@@ -161,6 +166,7 @@ public class HkspSearch {
                 .header("Connection", "keep-alive")
                 .header("Content-Type", "application/x-www-form-urlencoded")
                 .header("Referer", "https://haokan.baidu.com")
+                .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
                 .form(params)
                 .setProxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("l901.kdltps.com", 15818)))
                 .execute();