Просмотр исходного кода

好看视频爬取改为使用爬虫接口 download

wangyunpeng 4 дней назад
Родитель
Коммит
d451b27685

+ 5 - 0
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/controller/JobController.java

@@ -19,6 +19,11 @@ public class JobController {
         newMatchVideoJob.matchCrawlerVideoJob(null);
     }
 
+    @GetMapping("/uploadCrawlerVideoJob")
+    public void uploadCrawlerVideoJob() {
+        newMatchVideoJob.uploadCrawlerVideoJob(null);
+    }
+
     @GetMapping("/vectorMatchVideoJob")
     public void vectorMatchVideoJob(String flowPoolLevel) {
         newMatchVideoJob.vectorMatchVideoJob(flowPoolLevel);

+ 62 - 30
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/VideoDownloader.java

@@ -3,12 +3,16 @@ package com.tzld.piaoquan.longarticle.utils.other;
 import com.alibaba.fastjson.JSONArray;
 import com.alibaba.fastjson.JSONObject;
 import lombok.extern.slf4j.Slf4j;
+import okhttp3.*;
+import okhttp3.Authenticator;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
 import org.springframework.util.CollectionUtils;
+
+import javax.annotation.PostConstruct;
 import java.io.File;
 import java.io.InputStream;
 import java.io.OutputStream;
@@ -16,6 +20,7 @@ import java.net.*;
 import java.nio.file.Files;
 import java.util.Objects;
 import java.util.UUID;
+import java.util.concurrent.TimeUnit;
 
 import static com.tzld.piaoquan.longarticle.common.constants.ProxyConstant.*;
 
@@ -31,6 +36,33 @@ public class VideoDownloader {
 
     private static final int MAX_RETRIES = 2;
 
+    private OkHttpClient proxyHttpClient;
+    private OkHttpClient directHttpClient;
+
+    @PostConstruct
+    public void init() {
+        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_HOST, PROXY_PORT));
+        Authenticator authenticator = (route, response) -> {
+            String credential = Credentials.basic(USERNAME, PASSWORD);
+            return response.request().newBuilder()
+                    .header("Proxy-Authorization", credential)
+                    .build();
+        };
+        proxyHttpClient = new OkHttpClient.Builder()
+                .proxy(proxy)
+                .proxyAuthenticator(authenticator)
+                .connectTimeout(10, TimeUnit.SECONDS)
+                .readTimeout(60, TimeUnit.SECONDS)
+                .writeTimeout(60, TimeUnit.SECONDS)
+                .build();
+
+        directHttpClient = new OkHttpClient.Builder()
+                .connectTimeout(10, TimeUnit.SECONDS)
+                .readTimeout(60, TimeUnit.SECONDS)
+                .writeTimeout(60, TimeUnit.SECONDS)
+                .build();
+    }
+
     public String downloadCover(String outVideoId, String platform, String coverUrl) {
         String path = generateCoverPath(platform, outVideoId);
         if (download(path, coverUrl, platform) == 0) {
@@ -81,8 +113,18 @@ public class VideoDownloader {
     }
 
     public int download(String path, String videoUrl, String platform) {
+        // 先尝试代理下载,失败则直连下载
+        int result = doDownload(path, videoUrl, platform, true);
+        if (result != 0) {
+            log.info("download with proxy failed, retry without proxy, path={}", path);
+            result = doDownload(path, videoUrl, platform, false);
+        }
+        return result;
+    }
+
+    private int doDownload(String path, String videoUrl, String platform, boolean useProxy) {
         try {
-            log.info("download video path={}", path);
+            log.info("download path={} useProxy={}", path, useProxy);
             if (StringUtils.isEmpty(videoUrl)) {
                 return -1;
             }
@@ -90,42 +132,40 @@ public class VideoDownloader {
             if (file.exists()) {
                 file.delete();
             }
-            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_HOST, PROXY_PORT));
-            HttpURLConnection connection = (HttpURLConnection) new URL(videoUrl).openConnection(proxy);
-            connection.setConnectTimeout(10000);
-            // 设置读取超时时间(单位:毫秒)
-            connection.setReadTimeout(60000);
-            connection.setRequestMethod("GET");
-            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0");
-            connection.setRequestProperty("Accept", "*/*");
-            connection.setRequestProperty("accept-language", "en,zh;q=0.9,zh-CN;q=0.8");
+
+            Request.Builder builder = new Request.Builder()
+                    .url(videoUrl)
+                    .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0")
+                    .header("Accept", "*/*")
+                    .header("accept-language", "en,zh;q=0.9,zh-CN;q=0.8");
             if (Objects.equals(platform, "dy_search")) {
-                connection.setRequestProperty("referer", "https://v11-coldf.douyinvod.com/");
+                builder.header("referer", "https://v11-coldf.douyinvod.com/");
             }
-            // 连接并获取响应
-            connection.connect();
 
-            int responseCode = connection.getResponseCode();
-            if (responseCode == HttpURLConnection.HTTP_OK || responseCode == HttpURLConnection.HTTP_PARTIAL) {
-                long expectedSize = connection.getContentLengthLong();
-                InputStream inputStream = connection.getInputStream();
+            OkHttpClient client = useProxy ? proxyHttpClient : directHttpClient;
+            Response response = client.newCall(builder.build()).execute();
+
+            if (response.isSuccessful()) {
+                InputStream inputStream = response.body().byteStream();
                 OutputStream outputStream = Files.newOutputStream(file.toPath());
                 byte[] buffer = new byte[1024 * 1024];
                 int bytesRead;
-                log.info("download start path={} expectedSize={}", path, expectedSize);
+                log.info("download start path={}", path);
                 long size = 0;
                 while ((bytesRead = inputStream.read(buffer)) != -1) {
                     size += bytesRead;
                     outputStream.write(buffer, 0, bytesRead);
                 }
-                log.info("download end path={}", path);
+                outputStream.close();
+                inputStream.close();
+                log.info("download end path={} size={}", path, size);
                 if (file.length() != 0) {
                     return 0;
                 }
             }
-            return responseCode;
+            return response.code();
         } catch (Exception e) {
-            log.error("download error:{}", e.getMessage());
+            log.error("download error useProxy={}:{}", useProxy, e.getMessage());
         }
         return -1;
     }
@@ -153,17 +193,9 @@ public class VideoDownloader {
     }
 
     public static void main(String[] args) {
-        System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "");
-        Authenticator.setDefault(
-                new Authenticator() {
-                    public PasswordAuthentication getPasswordAuthentication() {
-                        return new PasswordAuthentication(
-                                USERNAME, PASSWORD.toCharArray());
-                    }
-                }
-        );
         VideoDownloader videoDownloader = new VideoDownloader();
         videoDownloader.downloadPath = "/download";
+        videoDownloader.init();
         videoDownloader.downloadVideo("2562802497357206175", "baidu_search", "https://vdept3.bdstatic.com/mda-qkt120240g2y8uka/cae_h264/1732792181901415854/mda-qkt120240g2y8uka.mp4?v_from_s=hkapp-haokan-suzhou&auth_key=1738759058-0-0-115429d241bdab4208d998b5f3f95507&bcevod_channel=searchbox_feed&cr=0&cd=0&pd=1&pt=3&logid=2258462520&vid=2562802497357206175&abtest=132219_1");
     }