Переглянути джерело

File download Add User-Agent header

ehlxr 1 рік тому
батько
коміт
6821f8f648

+ 32 - 21
etl-core/src/main/java/com/tzld/crawler/etl/service/impl/EtlServiceImpl.java

@@ -27,6 +27,7 @@ package com.tzld.crawler.etl.service.impl;
 import com.ctrip.framework.apollo.spring.annotation.ApolloJsonValue;
 import com.google.common.base.Stopwatch;
 import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import com.tzld.commons.aliyun.oss.AliyunOssManager;
 import com.tzld.crawler.etl.common.CustomValidator;
@@ -59,6 +60,7 @@ import org.springframework.beans.BeanUtils;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.dao.DuplicateKeyException;
 import org.springframework.stereotype.Service;
+import org.springframework.util.CollectionUtils;
 
 import javax.annotation.PostConstruct;
 import java.io.File;
@@ -117,6 +119,12 @@ public class EtlServiceImpl implements EtlService {
     private String ffmpegPath;
     @ApolloJsonValue("${fail.retry.times:{}}")
     private Map<String, Integer> failRetryTimes;
+    // @Value("${youget.path:you-get}")
+    // private String yougetPath;
+    // @Value("${youget.download.platform:}")
+    // private List<String> useYougetPlatfrom;
+    @Value("${random.ua.platform:}")
+    private List<String> randomUaPlatfom;
 
     private Executor pool;
 
@@ -327,10 +335,10 @@ public class EtlServiceImpl implements EtlService {
         String strategy = data.getStrategy();
         String audioUrl = data.getAudioUrl();
 
-        String videoPath = urlDownload(data.getVideoUrl(), "longvideo/crawler_local/video", title, data.getOutVideoId());
+        String videoPath = urlDownload(data.getVideoUrl(), "longvideo/crawler_local/video", title, data.getOutVideoId(), platform);
         // 音、视频合成
         if (!Strings.isNullOrEmpty(audioUrl)) {
-            String audioPath = urlDownload(data.getAudioUrl(), "longvideo/crawler_local/audio", title, data.getOutVideoId());
+            String audioPath = urlDownload(data.getAudioUrl(), "longvideo/crawler_local/audio", title, data.getOutVideoId(), platform);
             try {
                 retryFunc(t -> {
                     try {
@@ -348,7 +356,6 @@ public class EtlServiceImpl implements EtlService {
 
             // 清理合成音频之前的文件
             Files.deleteIfExists(Paths.get(new File(downloadPath + File.separator + videoPath).getPath()));
-            Files.deleteIfExists(Paths.get(new File(downloadPath + File.separator + audioPath).getPath()));
             videoPath += "_comp.mp4";
         }
 
@@ -372,7 +379,7 @@ public class EtlServiceImpl implements EtlService {
         Files.deleteIfExists(Paths.get(new File(tempFilePath).getPath()));
 
         // 视频封面下载、上传 OSS
-        String coverPath = urlDownload(data.getCoverUrl(), "longvideo/crawler_local/image", title, data.getOutVideoId());
+        String coverPath = urlDownload(data.getCoverUrl(), "longvideo/crawler_local/image", title, data.getOutVideoId(), platform);
         file2oss(downloadPath + File.separator + coverPath, coverPath, platform, strategy);
         data.setCoverOssPath(coverPath);
         tempFilePath = downloadPath + File.separator + coverPath;
@@ -382,7 +389,7 @@ public class EtlServiceImpl implements EtlService {
         Files.deleteIfExists(Paths.get(new File(tempFilePath).getPath()));
     }
 
-    private String urlDownload(String fileUrl, String typeDir, String title, String outVideoId) {
+    private String urlDownload(String fileUrl, String typeDir, String title, String outVideoId, String platform) {
         String fileName = MD5Util.md5(title) + outVideoId;
         String relFileDir = typeDir + File.separator
                 + env + File.separator
@@ -398,22 +405,26 @@ public class EtlServiceImpl implements EtlService {
 
         String filePath = absFileDir + File.separator + fileName;
         try {
-            retryFunc((fpath) -> {
-                try {
-                    // 下载文件
-                    FileUtils.download(fileUrl, fpath);
-                    return false;
-                } catch (CommonException e) {
-                    if (e.getCode() == ExceptionEnum.URL_FORBIDDEN.getCode()) {
-                        log.error("access to the url [{}] of remote server is prohibited.", fileUrl);
-                        metric(MetricTypeEnum.INVALID_URL, fileUrl);
-                        return false;
-                    }
-                    throw new RuntimeException(e);
-                } catch (Exception e) {
-                    throw new RuntimeException(e);
-                }
-            }, filePath, "download", String.format("download file from [%s] to [%s].", fileUrl, filePath));
+            retryFunc((param) -> {
+                        String fpath = param.getOrDefault("filePath", "");
+                        String platfrm = param.getOrDefault("platform", "");
+                        try {
+                            // 下载文件
+                            FileUtils.download(fileUrl, fpath,
+                                    !CollectionUtils.isEmpty(randomUaPlatfom) && randomUaPlatfom.contains(platfrm));
+                            return false;
+                        } catch (CommonException e) {
+                            if (e.getCode() == ExceptionEnum.URL_FORBIDDEN.getCode()) {
+                                log.error("access to the url [{}] of remote server is prohibited.", fileUrl);
+                                metric(MetricTypeEnum.INVALID_URL, fileUrl);
+                                return false;
+                            }
+                            throw new RuntimeException(e);
+                        } catch (Exception e) {
+                            throw new RuntimeException(e);
+                        }
+                    }, ImmutableMap.of("filePath", filePath, "platform", platform),
+                    "download", String.format("download file from [%s] to [%s].", fileUrl, filePath));
         } catch (Exception e) {
             metric(MetricTypeEnum.DOWNLOAD_FAILED, String.format("download file from [%s] error. 'cause of [%s]",
                     fileUrl, CommonUtils.getRootCause(e).getMessage()));

+ 159 - 1
etl-core/src/main/java/com/tzld/crawler/etl/util/FileUtils.java

@@ -30,10 +30,15 @@ import com.tzld.crawler.etl.common.exception.CommonException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.BufferedReader;
 import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.net.HttpURLConnection;
 import java.net.URL;
+import java.util.Random;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * @author ehlxr
@@ -56,16 +61,121 @@ public class FileUtils {
         }
     }
 
+    // The strings in this array are a selection of common browser User-Agent strings.
+    private static final String[] USER_AGENTS = {
+            // Chrome browser
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
+            // Firefox browser
+            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
+            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
+            // Safari browser
+            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
+            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
+            // Opera browser
+            "Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.2.15 Version/10.00",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64;rv:15.0) Gecko/20120427 Firefox/15.0a1 Opera/12.12",
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Ubuntu/12.04 Chromium/18.0.1025.168 Chrome/18.0.1025.168 Safari/535.24",
+            // MSIE browser
+            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
+            "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; ASU2JS; rv:11.0) like Gecko",
+            // Mobile browsers
+            "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3",
+            "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522+ (KHTML, like Gecko) Safari/419.3",
+            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36",
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
+
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:74.0) Gecko/20100101 Firefox/74.0",
+            "Mozilla/5.0 (X11; Linux i686; rv:74.0) Gecko/20100101 Firefox/74.0",
+            "Mozilla/5.0 (Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
+            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:74.0) Gecko/20100101 Firefox/74.0",
+            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
+            "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
+            "Mozilla/5.0 (Android 10; Mobile; rv:68.0) Gecko/68.0 Firefox/68.6.0",
+            "Mozilla/5.0 (Android 10; Mobile; LG-M255; rv:68.6.0) Gecko/68.6.0 Firefox/68.6.0",
+            "Mozilla/5.0 (iPhone; CPU iPhone OS 10_15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/24.1 Mobile/15E148 Safari/605.1.15",
+            "Mozilla/5.0 (iPad; CPU OS 10_15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/24.1 Mobile/15E148 Safari/605.1.15",
+            "Mozilla/5.0 (iPod touch; CPU iPhone OS 10_15_4 like Mac OS X) AppleWebKit/604.5.6 (KHTML, like Gecko) FxiOS/24.1 Mobile/15E148 Safari/605.1.15",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
+            "Mozilla/5.0 (X11; Linux i686; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
+            "Mozilla/5.0 (Linux x86_64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
+            "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
+            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
+            "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
+            "Mozilla/5.0 (Windows NT 10.0; WOW64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
+            "Mozilla/5.0 (Linux; Android 10; VOG-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36 OPR/55.2.2719",
+            "Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36 OPR/55.2.2719",
+            "Mozilla/5.0 (Linux; Android 10; SM-N975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36 OPR/55.2.2719",
+            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)",
+            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
+            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2)",
+            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322)",
+            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)",
+            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; WOW64; Trident/5.0)",
+            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; BOIE9;ENUS)",
+            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Win64; x64; Trident/5.0)",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
+            "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
+            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)",
+            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)",
+            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; x64; Trident/6.0)",
+            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)",
+            "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
+            "Mozilla/5.0 (Windows NT 6.3; ARM; Trident/7.0; Touch; rv:11.0) like Gecko"
+    };
+
+    public static String yougetDownload(String yougetPath, String fileUrl, String filePath) throws Exception {
+        ProcessBuilder processBuilder = new ProcessBuilder(yougetPath, "-d", fileUrl, "-O", filePath);
+        Process process = processBuilder.start();
+        process.waitFor();
+
+        // 创建 BufferedReader 以读取进程的输出
+        BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
+        String line;
+
+        // 创建正则表达式以匹配文件路径
+        Pattern pattern = Pattern.compile("Downloading\\s+(\\S*)");
+
+        // 读取并处理输出
+        while ((line = reader.readLine()) != null) {
+            // System.out.println(line);
+            // 如果输出行包含文件路径,打印路径
+            Matcher matcher = pattern.matcher(line);
+            if (matcher.find()) {
+                filePath = matcher.group(1);
+                System.out.println("Downloaded file path: " + filePath);
+                break;
+            }
+        }
+
+        return filePath;
+    }
+
     public static void download(String fileUrl, String filePath) throws Exception { // convenience overload of download(fileUrl, filePath, useUa)
+        download(fileUrl, filePath, false); // useUa=false: the three-argument overload then sets no User-Agent header
+    }
+
+    public static void download(String fileUrl, String filePath, boolean useUa) throws Exception {
         log.info("begin download [{}] to [{}]", fileUrl, filePath);
         URL url = new URL(fileUrl);
         HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+        if (useUa) {
+            conn.setRequestProperty("User-Agent", getRandomUserAgent());
+        }
         if (conn.getResponseCode() == HttpURLConnection.HTTP_FORBIDDEN) {
             throw new CommonException(ExceptionEnum.URL_FORBIDDEN);
         }
         conn.setConnectTimeout(5000);
         conn.setReadTimeout(5000);
-        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81");
         log.info("download file size is {} of url [{}]", formatFileSize(conn.getContentLength()), fileUrl);
 
         InputStream inputStream = conn.getInputStream();
@@ -79,4 +189,52 @@ public class FileUtils {
         outputStream.close();
         log.info("downloaded successfully [{}] to [{}]", fileUrl, filePath);
     }
+
+    public static void main(String[] args) throws Exception { // NOTE(review): ad-hoc manual check — downloads one hard-coded URL to a developer-local path with useUa=true; looks like leftover debugging code, confirm before shipping
+        // try {
+        download("http://mpvideo.qpic.cn/0bc3zyaagaaaieajowybojsvbtwdaphaaaya.f10002.mp4?dis_k=afa8996b6f4aac67ff2d6b3b7abaa4b4&dis_t=1694751571&play_scene=10120&auth_info=WsS8pdtVOTQL3MuqxRszQlg3FBNoCCQ4PQATPSseNWV8Sz4/BF1kW2kwH14QOSR4Ug==&auth_key=d33c33aa66ca8bd8a05204709a5f92b1&vid=wxv_3103954619094892547&format_id=10002&support_redirect=0&mmversion=false", "/Users/ehlxr/Downloads/" + System.currentTimeMillis(), true);
+        //
+        // } catch (Exception e) {
+        //     e.printStackTrace();
+        // }
+        // String s = "/Users/ehlxr/Downloads/" + System.currentTimeMillis();
+        // String filePath = yougetDownload("/Users/ehlxr/Desktop/you-get-0.4.1650/you-get", "http://mpvideo.qpic.cn/0bc3amabsaaa7mamoixezzrfaa6ddebqagia.f10002.mp4?dis_k=683cf1375dd8cd051ebc48c9861dbf65&dis_t=1694751341&play_scene=10120&auth_info=JO3Lo+01NkNXiZupx3kqLiQ0OB8XZmsYYXs4QSNiQDMCV3IxCDlrLDU9D2AcXT0ULg==&auth_key=33f6a5927b824d086369a12ff4a7a635&vid=wxv_2509438417202888705&format_id=10002&support_redirect=0&mmversion=false", s);
+        //
+        // File file = new File(filePath);
+        // System.out.println(file.exists());

+        // try {
+        //     // Create and start the process
+        //     ProcessBuilder pb = new ProcessBuilder("/Users/ehlxr/Desktop/you-get-0.4.1650/you-get", "-O", "/Users/ehlxr/Downloads/" + System.currentTimeMillis(), "http://mpvideo.qpic.cn/0bc3zyaagaaaieajowybojsvbtwdaphaaaya.f10002.mp4?dis_k=afa8996b6f4aac67ff2d6b3b7abaa4b4&dis_t=1694751571&play_scene=10120&auth_info=WsS8pdtVOTQL3MuqxRszQlg3FBNoCCQ4PQATPSseNWV8Sz4/BF1kW2kwH14QOSR4Ug==&auth_key=d33c33aa66ca8bd8a05204709a5f92b1&vid=wxv_3103954619094892547&format_id=10002&support_redirect=0&mmversion=false");
+        //     pb.redirectErrorStream(true);
+        //     Process process = pb.start();
+        //
+        //     // Create a BufferedReader to read the process output
+        //     BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
+        //     String line;
+        //
+        //     // Create a regular expression that matches the file path
+        //     Pattern pattern = Pattern.compile("Downloading (.*)", Pattern.CASE_INSENSITIVE);
+        //
+        //     // Read and process the output
+        //     while ((line = reader.readLine()) != null) {
+        //         // If the output line contains the file path, print the path
+        //         Matcher matcher = pattern.matcher(line);
+        //         if (matcher.find()) {
+        //             System.out.println("Downloaded file path: " + matcher.group(1));
+        //         }
+        //     }
+        //
+        //     // Wait for the process to finish
+        //     process.waitFor();
+        // } catch (Exception e) {
+        //     e.printStackTrace();
+        // }
+    }
+
+    /** Returns one pseudo-randomly chosen entry of {@code USER_AGENTS}; ThreadLocalRandom avoids allocating and seeding a new Random on every call. */
+    public static String getRandomUserAgent() {
+        int randomIndex = java.util.concurrent.ThreadLocalRandom.current().nextInt(USER_AGENTS.length);
+        return USER_AGENTS[randomIndex];
+    }
 }