瀏覽代碼

长文匹配优化

xueyiming 5 月之前
父節點
當前提交
23a748ac4e

+ 4 - 0
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/service/local/KimiService.java

@@ -5,4 +5,8 @@ import com.tzld.piaoquan.longarticle.model.po.LongArticlesText;
 public interface KimiService {
 
     LongArticlesText getKimiText(String contentId);
+
+    LongArticlesText addAndGetLongArticlesText(String contentId, String articleTitle, String articleText);
+
+    LongArticlesText getAndUpdateContent(String contentId);
 }

+ 57 - 15
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/service/local/impl/CrawlerVideoServiceImpl.java

@@ -5,30 +5,30 @@ import com.alibaba.fastjson.JSONArray;
 import com.alibaba.fastjson.JSONObject;
 import com.tzld.piaoquan.longarticle.dao.mapper.CrawlerVideoMapper;
 import com.tzld.piaoquan.longarticle.model.po.CrawlerVideo;
+import com.tzld.piaoquan.longarticle.model.po.CrawlerVideoExample;
 import com.tzld.piaoquan.longarticle.model.po.LongArticlesText;
 import com.tzld.piaoquan.longarticle.service.local.KimiService;
-import com.tzld.piaoquan.longarticle.utils.other.DouyinSearch;
-import com.tzld.piaoquan.longarticle.utils.other.HkspSearch;
-import com.tzld.piaoquan.longarticle.utils.other.NlpUtils;
+import com.tzld.piaoquan.longarticle.utils.other.*;
+import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.lang3.StringUtils;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Service;
 import org.springframework.util.CollectionUtils;
 
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
 import java.util.stream.Collectors;
 
+@Slf4j
 @Service
 public class CrawlerVideoServiceImpl {
 
     @Autowired
     private CrawlerVideoMapper crawlerVideoMapper;
 
-    @Autowired
-    private KimiService kimiService;
-
     private static final String default_user_id = "69637498";
 
     private static final Double NLP_SIMILARITY_THRESHOLD = 0.45;
@@ -46,16 +46,24 @@ public class CrawlerVideoServiceImpl {
         add("习");
     }};
 
-    public void addCrawlerVideo(String contentId) {
-        LongArticlesText kimiText = kimiService.getKimiText(contentId);
+    public void addCrawlerVideo(String contentId, LongArticlesText kimiText) {
         List<CrawlerVideo> crawlerVideoList = searchVideosFromWeb(kimiText);
         if (!CollectionUtils.isEmpty(crawlerVideoList)) {
             for (CrawlerVideo crawlerVideo : crawlerVideoList) {
+                crawlerVideo.setContentId(contentId);
+                crawlerVideo.setStatus(0);
                 crawlerVideoMapper.insertSelective(crawlerVideo);
             }
+            pushOss(contentId);
         }
     }
 
+    public long countCrawlerVideo(String contentId) {
+        CrawlerVideoExample example = new CrawlerVideoExample();
+        example.createCriteria().andContentIdEqualTo(contentId).andStatusEqualTo(2);
+        return crawlerVideoMapper.countByExample(example);
+    }
+
 
     public List<CrawlerVideo> searchVideosFromWeb(LongArticlesText longArticlesText) {
         String articleSummary = longArticlesText.getKimiSummary().substring(0, 15);
@@ -79,14 +87,12 @@ public class CrawlerVideoServiceImpl {
             res.addAll(list2);
         }
         if (CollectionUtils.isEmpty(res)) {
-            //TODO 搜索失败
             return null;
         }
         res = res.stream().filter(f -> StringUtils.isNotEmpty(f.getString("title"))).collect(Collectors.toList());
         List<String> titleList = res.stream().map(e -> e.getString("title")).collect(Collectors.toList());
         List<Float> titleSimilarityWithNlp = getTitleSimilarityWithNlp(oriTitle, titleList);
         if (CollectionUtils.isEmpty(titleSimilarityWithNlp) || titleSimilarityWithNlp.size() != res.size()) {
-            //TODO 评分失败
             return null;
         }
         for (int i = 0; i < res.size(); i++) {
@@ -116,10 +122,6 @@ public class CrawlerVideoServiceImpl {
     }
 
 
-    public void downloadVideos(CrawlerVideo crawlerVideo) {
-
-    }
-
     public List<Float> getTitleSimilarityWithNlp(String oriTitle, List<String> titleList) {
         List<Float> baseScores = NlpUtils.baseNlpTitleSimilarity(oriTitle, titleList);
         if (!CollectionUtils.isEmpty(baseScores)) {
@@ -154,7 +156,7 @@ public class CrawlerVideoServiceImpl {
 
     public CrawlerVideo dyVideoProduce(JSONObject jsonObject) {
         CrawlerVideo crawlerVideo = new CrawlerVideo();
-        crawlerVideo.setPlatform(jsonObject.getString("c"));
+        crawlerVideo.setPlatform(jsonObject.getString("platform"));
         crawlerVideo.setVideoTitle(jsonObject.getString("title"));
         crawlerVideo.setOutVideoId(jsonObject.getString("channel_content_id"));
         if (jsonObject.getLong("publish_timestamp") != null) {
@@ -188,4 +190,44 @@ public class CrawlerVideoServiceImpl {
         crawlerVideo.setUserId(default_user_id);
         return crawlerVideo;
     }
+
+    /**
+     * Downloads every pending crawler video (download status 0) of the given content,
+     * uploads the video and its cover to OSS and records the outcome on the row.
+     *
+     * <p>Outcome per row: download status 2 plus both OSS paths when the video and the cover
+     * were downloaded and uploaded; download status 1 when any download or upload step
+     * produced an empty path. Previously a failed download left the row at status 0, so it
+     * was selected again on every later call and never marked as failed.
+     *
+     * <p>Local temporary files are removed in all cases, including partial downloads and
+     * failed uploads.
+     *
+     * @param contentId identifier of the article content whose pending videos are processed
+     */
+    public void pushOss(String contentId) {
+        CrawlerVideoExample example = new CrawlerVideoExample();
+        example.createCriteria().andContentIdEqualTo(contentId).andDownloadStatusEqualTo(0);
+        List<CrawlerVideo> crawlerVideoList = crawlerVideoMapper.selectByExampleWithBLOBs(example);
+        if (CollectionUtils.isEmpty(crawlerVideoList)) {
+            return;
+        }
+        for (CrawlerVideo crawlerVideo : crawlerVideoList) {
+            String platform = crawlerVideo.getPlatform();
+            String outVideoId = crawlerVideo.getOutVideoId();
+            String videoPath = VideoDownloader.downloadVideo(outVideoId, platform, crawlerVideo.getVideoUrl());
+            String coverPath = VideoDownloader.downloadCover(outVideoId, platform, crawlerVideo.getCoverUrl());
+
+            String videoOssPath = null;
+            String coverOssPath = null;
+            // Upload only when both artifacts were downloaded; a video without a cover is unusable.
+            if (StringUtils.isNotEmpty(videoPath) && StringUtils.isNotEmpty(coverPath)) {
+                videoOssPath = OSSUploader.uploadToOSS(videoPath, "video");
+                coverOssPath = OSSUploader.uploadToOSS(coverPath, "image");
+            }
+
+            CrawlerVideo updateCrawlerVideo = new CrawlerVideo();
+            updateCrawlerVideo.setId(crawlerVideo.getId());
+            if (StringUtils.isNotEmpty(videoOssPath) && StringUtils.isNotEmpty(coverOssPath)) {
+                updateCrawlerVideo.setVideoOssPath(videoOssPath);
+                updateCrawlerVideo.setCoverOssPath(coverOssPath);
+                updateCrawlerVideo.setDownloadStatus(2);
+            } else {
+                // Download or upload failed: mark the row so it is not picked up as pending again.
+                updateCrawlerVideo.setDownloadStatus(1);
+            }
+            crawlerVideoMapper.updateByPrimaryKeySelective(updateCrawlerVideo);
+
+            // Clean up each temp file independently so one failure does not skip the other.
+            deleteLocalFile(videoPath);
+            deleteLocalFile(coverPath);
+        }
+    }
+
+    /**
+     * Best-effort removal of a local temporary file; an empty path is ignored and any
+     * failure is logged without being propagated.
+     */
+    private void deleteLocalFile(String path) {
+        if (StringUtils.isEmpty(path)) {
+            return;
+        }
+        try {
+            Files.deleteIfExists(Paths.get(path));
+        } catch (Exception e) {
+            log.error("pushOss del file error", e);
+        }
+    }
+
+
 }

+ 6 - 2
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/service/local/impl/KimiServiceImpl.java

@@ -32,7 +32,7 @@ public class KimiServiceImpl implements KimiService {
         return longArticlesTexts.get(0);
     }
 
-    public void addContent(String contentId, String articleTitle, String articleText) {
+    public LongArticlesText addAndGetLongArticlesText(String contentId, String articleTitle, String articleText) {
         LongArticlesTextExample example = new LongArticlesTextExample();
         example.createCriteria().andContentIdEqualTo(contentId);
         long l = longArticlesTextMapper.countByExample(example);
@@ -44,7 +44,11 @@ public class KimiServiceImpl implements KimiService {
             longArticlesText.setKimiStatus(0);
             longArticlesTextMapper.insertSelective(longArticlesText);
         }
-        longArticlesTextMapper.selectByExample(example);
+        List<LongArticlesText> longArticlesTexts = longArticlesTextMapper.selectByExample(example);
+        if (!CollectionUtils.isEmpty(longArticlesTexts)) {
+            return longArticlesTexts.get(0);
+        }
+        return null;
     }
 
     public LongArticlesText getAndUpdateContent(String contentId) {

+ 51 - 187
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/service/local/impl/MatchVideoServiceImpl.java

@@ -3,7 +3,9 @@ package com.tzld.piaoquan.longarticle.service.local.impl;
 import cn.hutool.json.JSONObject;
 import com.alibaba.fastjson.JSONArray;
 import com.tzld.piaoquan.longarticle.dao.mapper.CrawlerVideoMapper;
+import com.tzld.piaoquan.longarticle.dao.mapper.LongArticlesTextMapper;
 import com.tzld.piaoquan.longarticle.dao.mapper.MatchVideoMapper;
+import com.tzld.piaoquan.longarticle.model.bo.MatchContent;
 import com.tzld.piaoquan.longarticle.model.po.*;
 import com.tzld.piaoquan.longarticle.model.vo.MatchVideoVo;
 import com.tzld.piaoquan.longarticle.service.local.KimiService;
@@ -12,6 +14,7 @@ import com.tzld.piaoquan.longarticle.utils.other.*;
 import org.apache.commons.lang3.StringUtils;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Transactional;
 import org.springframework.util.CollectionUtils;
 
 import java.util.*;
@@ -30,9 +33,16 @@ public class MatchVideoServiceImpl {
     @Autowired
     private CrawlerVideoMapper crawlerVideoMapper;
 
-    public void addMatchVideo(MatchVideoVo matchVideoVo) {
+    @Autowired
+    private LongArticlesTextMapper longArticlesTextMapper;
+
+    @Autowired
+    private CrawlerVideoServiceImpl crawlerVideoService;
+
+    @Transactional
+    public void addMatchVideo(MatchContent matchContent) {
         MatchVideoExample example = new MatchVideoExample();
-        example.createCriteria().andContentIdEqualTo(matchVideoVo.getArticleId()).andGhIdEqualTo(matchVideoVo.getGhId());
+        example.createCriteria().andContentIdEqualTo(matchContent.getSourceId()).andGhIdEqualTo(matchContent.getGhId());
         long l = matchVideoMapper.countByExample(example);
         if (l > 0) {
             return;
@@ -40,14 +50,27 @@ public class MatchVideoServiceImpl {
         String traceId = "search-" + UUID.randomUUID();
         MatchVideo matchVideo = new MatchVideo();
         matchVideo.setTraceId(traceId);
-        matchVideo.setContentId(matchVideoVo.getArticleId());
-        matchVideo.setAccountName(matchVideoVo.getAccountName());
+        matchVideo.setContentId(matchContent.getSourceId());
+        matchVideo.setAccountName(matchContent.getAccountName());
+        matchVideo.setGhId(matchContent.getGhId());
         matchVideo.setFlowPoolLevel(matchVideo.getFlowPoolLevel());
         matchVideo.setProcessTimes(1);
         matchVideo.setContentStatus(0);
         long timestamp = System.currentTimeMillis() / 1000;
         matchVideo.setContentStatusUpdateTime(Long.valueOf(timestamp).intValue());
+        matchVideo.setRequestTimestamp(Long.valueOf(timestamp).intValue());
         matchVideoMapper.insertSelective(matchVideo);
+        LongArticlesTextExample longArticlesTextExample = new LongArticlesTextExample();
+        longArticlesTextExample.createCriteria().andContentIdEqualTo(matchContent.getSourceId());
+        long l1 = longArticlesTextMapper.countByExample(longArticlesTextExample);
+        if (l1 == 0) {
+            LongArticlesText longArticlesText = new LongArticlesText();
+            longArticlesText.setArticleTitle(matchContent.getTitle());
+            longArticlesText.setArticleText(matchContent.getContent());
+            longArticlesText.setContentId(matchContent.getSourceId());
+            longArticlesText.setKimiStatus(0);
+            longArticlesTextMapper.insertSelective(longArticlesText);
+        }
     }
 
     public void matchContent() {
@@ -56,201 +79,42 @@ public class MatchVideoServiceImpl {
         crawlerVideoMapper.selectByExample(example);
     }
 
-    public Boolean existHistoryContent(String contentId) {
-        CrawlerVideoExample example = new CrawlerVideoExample();
-        example.createCriteria().andContentIdEqualTo(contentId).andDownloadStatusEqualTo(2);
-        long l = crawlerVideoMapper.countByExample(example);
-        return l >= MAX_NUM;
-    }
 
-    public void pushOss(String traceId) {
-        CrawlerVideoExample example = new CrawlerVideoExample();
-        example.createCriteria().andTraceIdEqualTo(traceId).andDownloadStatusEqualTo(0);
-        List<CrawlerVideo> crawlerVideoList = crawlerVideoMapper.selectByExampleWithBLOBs(example);
-        if (CollectionUtils.isEmpty(crawlerVideoList)) {
+    public void processMatchContent(MatchVideo matchVideo) {
+        //1.执行kimi任务
+        LongArticlesText kimiText = kimiService.getKimiText(matchVideo.getContentId());
+        if (kimiText == null) {
+            //TODO 查询信息重新生成kimi信息
             return;
         }
-        for (CrawlerVideo crawlerVideo : crawlerVideoList) {
-            String platform = crawlerVideo.getPlatform();
-            String outVideoId = crawlerVideo.getOutVideoId();
-            String videoPath = VideoDownloader.generateVideoPath(platform, outVideoId);
-            String coverPath = VideoDownloader.generateCoverPath(platform, outVideoId);
-            videoPath = VideoDownloader.downloadVideo(videoPath, platform, crawlerVideo.getVideoUrl(), "video");
-            coverPath = VideoDownloader.downloadCover(coverPath, platform, crawlerVideo.getCoverUrl());
-            if (StringUtils.isNotEmpty(videoPath) && StringUtils.isNotEmpty(coverPath)) {
-                String videoOssPath = OSSUploader.uploadToOSS(videoPath, "video");
-                String coverOssPath = OSSUploader.uploadToOSS(coverPath, "image");
-                if (StringUtils.isNotEmpty(videoOssPath) && StringUtils.isNotEmpty(coverOssPath)) {
-                    CrawlerVideo udpateCrawlerVideo = new CrawlerVideo();
-                    udpateCrawlerVideo.setVideoOssPath(videoOssPath);
-                    udpateCrawlerVideo.setCoverOssPath(coverOssPath);
-                    udpateCrawlerVideo.setId(crawlerVideo.getId());
-                    udpateCrawlerVideo.setDownloadStatus(2);
-                    crawlerVideoMapper.updateByPrimaryKeySelective(udpateCrawlerVideo);
-                }
+        if (kimiText.getKimiStatus() == 0) {
+            //TODO 加锁
+            kimiText = kimiService.getAndUpdateContent(matchVideo.getContentId());
+            if (kimiText == null) {
+                //TODO kimi结果获取失败
+                return;
             }
         }
-    }
+        boolean existCrawlerVideo = existCrawlerVideo(matchVideo.getContentId());
+        if(!existCrawlerVideo){
 
-    public boolean addVideo(String contentId, String traceId) {
-        LongArticlesText kimiText = kimiService.getKimiText(contentId);
-        if (kimiText == null) {
-            //TODO 报警  KIMI查询不到
-            return false;
-        }
-        List<JSONObject> video = getVideo(contentId, kimiText);
-        List<JSONObject> jsonObjects = SortUtil.titleSimilarityRank(kimiText.getKimiTitle(), video);
-        List<CrawlerVideo> crawlerVideoList = getCrawlerVideoList(jsonObjects, traceId, contentId);
-        if (!CollectionUtils.isEmpty(crawlerVideoList)) {
-            for (CrawlerVideo crawlerVideo : crawlerVideoList) {
-                crawlerVideoMapper.insertSelective(crawlerVideo);
-            }
         }
-        System.out.println();
-        System.out.println();
-        System.out.println(crawlerVideoList);
-        return false;
-    }
-
-    private List<CrawlerVideo> getCrawlerVideoList(List<JSONObject> jsonObjects, String traceId, String contentId) {
-        List<CrawlerVideo> crawlerVideoList = new ArrayList<>();
-        for (JSONObject jsonObject : jsonObjects) {
-            String platform = jsonObject.getStr("platform");
-            CrawlerVideo crawlerVideo = new CrawlerVideo();
-            crawlerVideo.setContentId(contentId);
-            crawlerVideo.setTraceId(traceId);
-            crawlerVideo.setPlatform(platform);
-            if (Objects.equals(platform, "dy_search")) {
-                crawlerVideo.setOutVideoId(jsonObject.getStr("channel_content_id"));
-                crawlerVideo.setVideoTitle(jsonObject.getStr("title"));
-                crawlerVideo.setPublishTime(jsonObject.getDate("publish_timestamp"));
-                List<JSONObject> videoUrlList = jsonObject.get("video_url_list", List.class);
-                if (!CollectionUtils.isEmpty(videoUrlList)) {
-                    crawlerVideo.setVideoUrl(videoUrlList.get(0).getStr("video_url"));
-                    crawlerVideo.setDuration(videoUrlList.get(0).getInt("video_duration"));
-                }
-                List<JSONObject> imageUrlList = jsonObject.get("image_url_list", List.class);
-                if (!CollectionUtils.isEmpty(imageUrlList)) {
-                    crawlerVideo.setCoverUrl(imageUrlList.get(0).getStr("image_url"));
-                }
-                crawlerVideo.setPlayCount(jsonObject.getInt("play_count"));
-                crawlerVideo.setLikeCount(jsonObject.getInt("like_count"));
-                crawlerVideo.setScore(jsonObject.getFloat("score"));
-                if (cheakCrawlerVideo(crawlerVideo)) {
-                    crawlerVideoList.add(crawlerVideo);
-                }
 
-                continue;
-            }
-            if (Objects.equals(platform, "baidu_search")) {
-                crawlerVideo.setOutVideoId(jsonObject.getStr("id"));
-                crawlerVideo.setVideoTitle(jsonObject.getStr("title"));
-                crawlerVideo.setPublishTime(jsonObject.getDate("publish_timestamp"));
-                crawlerVideo.setVideoUrl(jsonObject.getStr("playurl"));
-                crawlerVideo.setCoverUrl(jsonObject.getStr("poster"));
-                crawlerVideo.setPlayCount(jsonObject.getInt("play_cnt"));
-                crawlerVideo.setLikeCount(jsonObject.getInt("like_count") == null ? 0 : jsonObject.getInt("like_count"));
-                crawlerVideo.setDuration(jsonObject.getInt("duration"));
-                crawlerVideo.setScore(jsonObject.getFloat("score"));
-                if (cheakCrawlerVideo(crawlerVideo)) {
-                    crawlerVideoList.add(crawlerVideo);
-                }
-                continue;
-            }
+        //2.执行爬虫任务
+        int retry = 0;
+        long count = crawlerVideoService.countCrawlerVideo(matchVideo.getContentId());
+        if (count < 3) {
+            crawlerVideoService.addCrawlerVideo(matchVideo.getContentId(), kimiText);
         }
-        return crawlerVideoList;
     }
 
-    private boolean cheakCrawlerVideo(CrawlerVideo crawlerVideo) {
-        if (StringUtils.isEmpty(crawlerVideo.getOutVideoId())) {
-            return false;
-        }
-        if (StringUtils.isEmpty(crawlerVideo.getVideoUrl())) {
-            return false;
-        }
-        if (StringUtils.isEmpty(crawlerVideo.getCoverUrl())) {
-            return false;
-        }
-        if (StringUtils.isEmpty(crawlerVideo.getPlatform())) {
-            return false;
-        }
-        if (Objects.isNull(crawlerVideo.getPublishTime())) {
-            crawlerVideo.setPublishTime(new Date());
-        }
-        if (Objects.isNull(crawlerVideo.getDuration())) {
-            crawlerVideo.setDuration(0);
-        }
-        if (Objects.isNull(crawlerVideo.getPlayCount())) {
-            crawlerVideo.setPlayCount(0);
-        }
-        if (Objects.isNull(crawlerVideo.getLikeCount())) {
-            crawlerVideo.setLikeCount(0);
-        }
-        if (Objects.isNull(crawlerVideo.getShareCount())) {
-            crawlerVideo.setShareCount(0);
-        }
-        return true;
-    }
-
-    public List<JSONObject> getVideo(String contentId, LongArticlesText kimiText) {
-
-        List<JSONObject> res = new ArrayList<>();
-        List<JSONObject> kimiSummarys = searchVideo(kimiText.getKimiSummary().substring(0, 15), new ArrayList<>(), "");
-        if (!CollectionUtils.isEmpty(kimiSummarys)) {
-            res.addAll(kimiSummarys);
-        }
-        if (res.size() > 3) {
-            return res;
-        }
-        List<JSONObject> kimiTitles = searchVideo(kimiText.getKimiTitle().substring(0, 15), new ArrayList<>(), "");
-        if (!CollectionUtils.isEmpty(kimiTitles)) {
-            res.addAll(kimiTitles);
-        }
-        if (res.size() > 3) {
-            return res;
-        }
-        String kimiKeys = kimiText.getKimiKeys();
-        JSONArray jsonArray = JSONArray.parseArray(kimiKeys);
-        if (jsonArray == null || jsonArray.isEmpty()) {
-            return res;
-        }
-        for (int i = 0; i < jsonArray.size(); i++) {
-            String key = jsonArray.getString(i);
-            List<JSONObject> keys = searchVideo(key, new ArrayList<>(), "");
-            if (!CollectionUtils.isEmpty(keys)) {
-                res.addAll(keys);
-            }
-            if (res.size() > 3) {
-                return res;
-            }
-        }
-        return res;
+    /**
+     * Reports whether enough crawler videos with download status 2 already exist for the
+     * given content, i.e. whether their count has reached {@code MAX_NUM}.
+     *
+     * @param contentId identifier of the article content to look up
+     * @return {@code true} when the count of such rows is at least {@code MAX_NUM}
+     */
+    public boolean existCrawlerVideo(String contentId) {
+        CrawlerVideoExample downloadedQuery = new CrawlerVideoExample();
+        downloadedQuery.createCriteria()
+                .andContentIdEqualTo(contentId)
+                .andDownloadStatusEqualTo(2);
+        long downloadedCount = crawlerVideoMapper.countByExample(downloadedQuery);
+        return downloadedCount >= MAX_NUM;
+    }
+}
 
 
-    public List<JSONObject> searchVideo(String keyword, List<String> words, String traceId) {
-//        List<JSONObject> jsonObjects = DouyinSearch.douyinSearch(keyword, words, traceId);
-//        if (!CollectionUtils.isEmpty(jsonObjects)) {
-//            for (JSONObject jsonObject : jsonObjects) {
-//                jsonObject.put("platform", "dy_search");
-//            }
-//        }
-//        if (jsonObjects.size() >= 3) {
-//            return jsonObjects;
-//        }
-//        List<JSONObject> jsonObjects1 = HkspSearch.hkspSearch(keyword, words, traceId);
-//        if (CollectionUtils.isEmpty(jsonObjects1)) {
-//            return jsonObjects;
-//        }
-//        for (JSONObject jsonObject : jsonObjects1) {
-//            jsonObject.put("platform", "baidu_search");
-//        }
-//        if (CollectionUtils.isEmpty(jsonObjects)) {
-//            return jsonObjects1;
-//        }
-//        jsonObjects.addAll(jsonObjects1);
-//        return jsonObjects;
-        return null;
-    }
 
-}

+ 1 - 1
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/DouyinSearch.java

@@ -32,7 +32,7 @@ public class DouyinSearch {
 
     public static void main(String[] args) {
         List<String> sensitiveWords = new ArrayList<>();
-        List<JSONObject> list = douyinSearch("你好", sensitiveWords, "");
+        List<JSONObject> list = douyinSearch("北京女孩与父亲的法律纠纷引发对", sensitiveWords, "");
         for (JSONObject jsonObject : list){
             System.out.println(jsonObject);
             CrawlerVideo crawlerVideo = dyVideoProduce(jsonObject);

+ 1 - 2
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/HkspSearch.java

@@ -34,7 +34,7 @@ public class HkspSearch {
 
     public static void main(String[] args) {
         List<String> sensitiveWords = new ArrayList<>();
-        List<JSONObject> list = hkspSearch("你好", sensitiveWords, "");
+        List<JSONObject> list = hkspSearch("法律纠纷,教育投资,家庭责任", sensitiveWords, "");
         for (JSONObject jsonObject : list){
             CrawlerVideo crawlerVideo = baiduVideoProduce(jsonObject);
             System.out.println(jsonObject);
@@ -87,7 +87,6 @@ public class HkspSearch {
             System.out.println(response.body());
             JSONObject jsonResponse = JSONObject.parseObject(response.body());
             JSONArray dataList = jsonResponse.getJSONObject("data").getJSONArray("list");
-//            List<JSONObject> dataList = jsonResponse.getByPath("data.list", List.class);
 
             for (int i = 0; i < dataList.size(); i++) {
                 JSONObject data = dataList.getJSONObject(i);

+ 78 - 120
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/VideoDownloader.java

@@ -7,155 +7,113 @@ import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.http.message.BasicHeader;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.*;
+import java.net.HttpURLConnection;
+import java.net.URL;
 import java.util.Objects;
 import java.util.UUID;
 
 public class VideoDownloader {
 
-    public static String downloadCover(String filePath, String platform, String coverUrl) {
-        try {
-            HttpResponse response = sendRequest(platform, coverUrl, "cover");
-            if (response.getStatusLine().getStatusCode() != 200) {
-                return null;
-            }
-
-            try (InputStream inputStream = response.getEntity().getContent();
-                 FileOutputStream outputStream = new FileOutputStream(new File(filePath))) {
-                byte[] buffer = new byte[1024];
-                int bytesRead;
-                while ((bytesRead = inputStream.read(buffer)) != -1) {
-                    outputStream.write(buffer, 0, bytesRead);
-                }
-            }
-            return filePath;
-        } catch (Exception e) {
+    private static final String PROXY_HOST = "l901.kdltps.com";
+    private static final int PROXY_PORT = 15818;
+    private static final String USERNAME = "t11983523373311";
+    private static final String PASSWORD = "mtuhdr2z";
+    private static final int MAX_RETRIES = 3;
 
+    public static String downloadCover(String outVideoId, String platform, String coverUrl) {
+        String path = generateCoverPath(platform, outVideoId);
+        if (download(path, coverUrl, platform)) {
+            return path;
         }
         return "";
     }
 
-    public static HttpResponse sendRequest(String platform, String url, String downloadType) throws IOException {
-        CloseableHttpClient httpClient = HttpClients.createDefault();
-        HttpGet request = new HttpGet(url);
-        request.setHeaders(requestHeader(platform, url, downloadType));
-        return httpClient.execute(request);
-    }
-
-    public static org.apache.http.Header[] requestHeader(String platform, String url, String downloadType) {
-        switch (platform) {
-            case "xg_search":
-                // 根据 URL 设置不同的请求头
-                if (url.contains("v9-xg-web-pc.ixigua.com")) {
-                    return new org.apache.http.Header[]{
-                            new BasicHeader("Accept", "*/*"),
-                            new BasicHeader("Accept-Language", "zh-CN,zh;q=0.9"),
-                            new BasicHeader("Host", "v9-xg-web-pc.ixigua.com"),
-                            new BasicHeader("User-Agent", "Mozilla/5.0"),
-                            new BasicHeader("Origin", "https://www.ixigua.com/"),
-                            new BasicHeader("Referer", "https://www.ixigua.com/")
-                    };
-                } else if (url.contains("v3-xg-web-pc.ixigua.com")) {
-                    return new org.apache.http.Header[]{
-                            new BasicHeader("Accept", "*/*"),
-                            new BasicHeader("Accept-Language", "zh-CN,zh;q=0.9"),
-                            new BasicHeader("Host", "v3-xg-web-pc.ixigua.com"),
-                            new BasicHeader("User-Agent", "Mozilla/5.0"),
-                            new BasicHeader("Origin", "https://www.ixigua.com/"),
-                            new BasicHeader("Referer", "https://www.ixigua.com/")
-                    };
-                } else if (Objects.equals(downloadType, "cover")) {
-                    return new org.apache.http.Header[]{
-                            new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"),
-                            new BasicHeader("Accept-Language", "en,zh;q=0.9,zh-CN;q=0.8"),
-                            new BasicHeader("Cache-Control", "max-age=0"),
-                            new BasicHeader("Proxy-Connection", "keep-alive"),
-                            new BasicHeader("Upgrade-Insecure-Requests", "1"),
-                            new BasicHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36")
-                    };
-                }
-                // 其他 headers 根据 downloadType 处理
-                break;
-            case "baidu_search":
-                return new org.apache.http.Header[]{
-                        new BasicHeader("Accept", "*/*"),
-                        new BasicHeader("Accept-Language", "zh-CN,zh;q=0.9"),
-                        new BasicHeader("User-Agent", "Mozilla/5.0")
-                };
-            case "wx_search":
-                return new org.apache.http.Header[]{
-                        new BasicHeader("Accept", "*/*"),
-                        new BasicHeader("Accept-Language", "zh-CN,zh;q=0.9"),
-                        new BasicHeader("User-Agent", "Mozilla/5.0"),
-                        new BasicHeader("Origin", "https://mp.weixin.qq.com"),
-                        new BasicHeader("Referer", "https://mp.weixin.qq.com")
-                };
-            case "dy_search":
-                return new org.apache.http.Header[]{
-                        new BasicHeader("accept", "*/*"),
-                        new BasicHeader("accept-language", "en,zh;q=0.9,zh-CN;q=0.8"),
-                        new BasicHeader("priority", "i"),
-                        new BasicHeader("referer", "https://v11-coldf.douyinvod.com/"),
-                        new BasicHeader("user-agent", "Mozilla/5.0")
-                };
-            default:
-                return new org.apache.http.Header[]{};
+    public static String downloadVideo(String outVideoId, String platform, String videoUrl) {
+        String path = generateVideoPath(platform, outVideoId);
+        if (download(path, videoUrl, platform)) {
+            return path;
         }
-        return new org.apache.http.Header[]{};
+        return "";
     }
 
-    public static String downloadVideo(String filePath, String platform, String videoUrl, String downloadType) {
+    public static boolean download(String path, String videoUrl, String platform) { // Fetches videoUrl into the file at path; returns true only when a non-empty file was written.
         try {
-            HttpResponse response = sendRequest(platform, videoUrl, downloadType);
-            if (response.getStatusLine().getStatusCode() != 200 && response.getStatusLine().getStatusCode() != 206) {
-                System.out.println("Error: " + response.getStatusLine().getStatusCode());
-                return null;
-            }
 
-            try (InputStream inputStream = response.getEntity().getContent();
-                 FileOutputStream outputStream = new FileOutputStream(new File(filePath))) {
-                byte[] buffer = new byte[1024 * 1024];
-                int bytesRead;
-                while ((bytesRead = inputStream.read(buffer)) != -1) {
-                    outputStream.write(buffer, 0, bytesRead);
+            int retries = 0;
+            long fileSize = 0;
+
+            while (retries < MAX_RETRIES) { // MAX_RETRIES is declared outside this chunk
+                File file = new File(path);
+                if (file.exists()) {
+                    file.delete(); // every attempt starts from a clean file
+                }
+                HttpURLConnection connection = (HttpURLConnection) new URL(videoUrl).openConnection();
+                connection.setRequestMethod("GET");
+                connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0");
+                connection.setRequestProperty("Accept", "*/*");
+                connection.setRequestProperty("Range", "bytes=" + fileSize + "-"); // fileSize is never reassigned below, so this is always "bytes=0-"
+                connection.setRequestProperty("accept-language", "en,zh;q=0.9,zh-CN;q=0.8");
+                if (Objects.equals(platform, "dy_search")) { // only "dy_search" downloads send the douyinvod.com referer
+                    connection.setRequestProperty("referer", "https://v11-coldf.douyinvod.com/");
+                }
+
+
+                // Configure the proxy through JVM-wide system properties (set after openConnection(), before connect()). NOTE(review): http.proxyUser/http.proxyPassword do not appear among the documented java.net proxy properties, and https URLs presumably need https.proxyHost — TODO confirm the proxy and its credentials are actually applied.
+                System.setProperty("http.proxyHost", PROXY_HOST);
+                System.setProperty("http.proxyPort", String.valueOf(PROXY_PORT));
+                System.setProperty("http.proxyUser", USERNAME);
+                System.setProperty("http.proxyPassword", PASSWORD);
+
+                // Connect and read the response status
+                connection.connect();
+
+                int responseCode = connection.getResponseCode();
+                if (responseCode == HttpURLConnection.HTTP_OK || responseCode == HttpURLConnection.HTTP_PARTIAL) {
+                    // Stream the response body into the file
+                    try (InputStream inputStream = connection.getInputStream();
+                         OutputStream outputStream = new FileOutputStream(file, true)) { // append mode; the file was deleted at the top of this iteration, so writing still begins at byte 0
+                        byte[] buffer = new byte[1024 * 1024]; // 1MB buffer
+                        int bytesRead;
+                        while ((bytesRead = inputStream.read(buffer)) != -1) {
+                            outputStream.write(buffer, 0, bytesRead);
+                        }
+                    }
+
+                    // An empty file counts as a failed attempt and is retried
+                    if (file.length() == 0) {
+                        System.out.println("下载的文件为空,重试...");
+                        retries++;
+                        continue;
+                    } else {
+                        return true;
+                    }
+                } else {
+                    System.out.println("下载失败,HTTP 状态码:" + responseCode);
+                    retries++;
                 }
             }
-            return filePath;
+
+            System.out.println("下载失败,已达到最大重试次数:" + MAX_RETRIES);
+            return false;
         } catch (Exception e) {
-            System.out.println(e.getMessage());
+            e.printStackTrace(); // the try wraps the whole retry loop, so any exception ends all retries and falls through to return false
         }
-        return "";
+        return false;
     }
 
-    public static String generateVideoPath(String platform, String videoId) {
+    private static String generateVideoPath(String platform, String videoId) { // Builds a local .mp4 path for a video; visibility narrowed to private in this change.
         String index = String.format("%s-%s-%s", platform, videoId, UUID.randomUUID()); // a random UUID is mixed in, so repeated calls yield different names
         String md5Hash = DigestUtils.md5Hex(index);
         String fileName = String.format("%s.mp4", md5Hash);
-        String filePath = String.join(File.separator, System.getProperty("user.dir"), fileName);
-        return filePath; // returns the video file path
+        return String.join(File.separator, System.getProperty("user.dir"), fileName); // returns the video file path inside the working directory (user.dir)
     }
 
-    public static String generateCoverPath(String platform, String videoId) {
+    private static String generateCoverPath(String platform, String videoId) { // Builds a local .png path for a cover image; visibility narrowed to private in this change.
         String index = String.format("%s-%s-%s", platform, videoId, UUID.randomUUID()); // a random UUID is mixed in, so repeated calls yield different names
         String md5Hash = DigestUtils.md5Hex(index);
         String coverName = String.format("%s.png", md5Hash);
-        String coverPath = String.join(File.separator, System.getProperty("user.dir"), coverName);
-        return coverPath; // returns the cover image path
+        return String.join(File.separator, System.getProperty("user.dir"), coverName); // returns the cover image path inside the working directory (user.dir)
     }
 
-    public static void main(String[] args) {
-        // Example usage (this manual entry point is removed by the change)
-        String coverUrl = "https://example.com/cover.jpg";
-        String filePath = "path/to/cover.png";
-        downloadCover(filePath, "xg_search", coverUrl);
-        String videoUrl = "https://example.com/video.mp4";
-        String videoFilePath = "path/to/video.mp4";
-        downloadVideo(videoFilePath, "xg_search", videoUrl, "video");
-        String paths = generateVideoPath("xg_search", "videoId");
-        System.out.println("Generated paths: " + paths);
-
-    }
 }

+ 14 - 0
long-article-server/src/main/resources/application-dev.properties

@@ -0,0 +1,14 @@
+server.port=8080
+
+spring.datasource.username=crawler
+spring.datasource.password=crawler123456@
+spring.datasource.url=jdbc:mysql://rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com:3306/long_articles?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull&useSSL=false&allowMultiQueries=true
+
+spring.redis.database=2
+spring.redis.host=r-bp154bpw97gptefiqkpd.redis.rds.aliyuncs.com
+spring.redis.port=6379
+spring.redis.password=Qingqu2019
+
+apollo.meta: https://apolloconfig-internal.piaoquantv.com
+
+xxl.job.admin.addresses=http://xxl-job-internal.piaoquantv.com/xxl-job-admin

+ 1 - 1
long-article-server/src/main/resources/application.properties

@@ -1,4 +1,4 @@
-spring.profiles.active=test
+spring.profiles.active=dev
 spring.application.name=long-article-server
 
 spring.datasource.driver-class-name=com.mysql.jdbc.Driver