|
@@ -30,10 +30,15 @@ import com.tzld.crawler.etl.common.exception.CommonException;
|
|
|
import org.slf4j.Logger;
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
+import java.io.BufferedReader;
|
|
|
import java.io.FileOutputStream;
|
|
|
import java.io.InputStream;
|
|
|
+import java.io.InputStreamReader;
|
|
|
import java.net.HttpURLConnection;
|
|
|
import java.net.URL;
|
|
|
+import java.util.Random;
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
|
|
/**
|
|
|
* @author ehlxr
|
|
@@ -56,16 +61,121 @@ public class FileUtils {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ // Pool of common browser User-Agent strings; getRandomUserAgent() returns one of them at random.
|
|
|
+ private static final String[] USER_AGENTS = {
|
|
|
+ // Chrome browser
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
|
|
|
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
|
|
|
+ // Firefox browser
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
|
|
|
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
|
|
|
+ "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
|
|
|
+ // Safari browser
|
|
|
+ "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
|
|
|
+ "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
|
|
|
+ // Opera browser
|
|
|
+ "Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.2.15 Version/10.00",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64;rv:15.0) Gecko/20120427 Firefox/15.0a1 Opera/12.12",
|
|
|
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Ubuntu/12.04 Chromium/18.0.1025.168 Chrome/18.0.1025.168 Safari/535.24",
|
|
|
+ // MSIE browser
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; ASU2JS; rv:11.0) like Gecko",
|
|
|
+ // Mobile browsers
|
|
|
+ "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3",
|
|
|
+ "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522+ (KHTML, like Gecko) Safari/419.3",
|
|
|
+ "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36",
|
|
|
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
|
|
|
+
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
|
|
|
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:74.0) Gecko/20100101 Firefox/74.0",
|
|
|
+ "Mozilla/5.0 (X11; Linux i686; rv:74.0) Gecko/20100101 Firefox/74.0",
|
|
|
+ "Mozilla/5.0 (Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
|
|
|
+ "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:74.0) Gecko/20100101 Firefox/74.0",
|
|
|
+ "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
|
|
|
+ "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
|
|
|
+ "Mozilla/5.0 (Android 10; Mobile; rv:68.0) Gecko/68.0 Firefox/68.6.0",
|
|
|
+ "Mozilla/5.0 (Android 10; Mobile; LG-M255; rv:68.6.0) Gecko/68.6.0 Firefox/68.6.0",
|
|
|
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 10_15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/24.1 Mobile/15E148 Safari/605.1.15",
|
|
|
+ "Mozilla/5.0 (iPad; CPU OS 10_15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/24.1 Mobile/15E148 Safari/605.1.15",
|
|
|
+ "Mozilla/5.0 (iPod touch; CPU iPhone OS 10_15_4 like Mac OS X) AppleWebKit/604.5.6 (KHTML, like Gecko) FxiOS/24.1 Mobile/15E148 Safari/605.1.15",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
|
|
|
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
|
|
|
+ "Mozilla/5.0 (X11; Linux i686; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
|
|
|
+ "Mozilla/5.0 (Linux x86_64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
|
|
|
+ "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
|
|
|
+ "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
|
|
|
+ "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:68.6.1) Gecko/20100101 Firefox/68.6.1",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; WOW64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
|
|
|
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
|
|
|
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.115",
|
|
|
+ "Mozilla/5.0 (Linux; Android 10; VOG-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36 OPR/55.2.2719",
|
|
|
+ "Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36 OPR/55.2.2719",
|
|
|
+ "Mozilla/5.0 (Linux; Android 10; SM-N975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36 OPR/55.2.2719",
|
|
|
+ "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)",
|
|
|
+ "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
|
|
|
+ "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2)",
|
|
|
+ "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; WOW64; Trident/5.0)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; BOIE9;ENUS)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Win64; x64; Trident/5.0)",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; x64; Trident/6.0)",
|
|
|
+ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
|
|
|
+ "Mozilla/5.0 (Windows NT 6.3; ARM; Trident/7.0; Touch; rv:11.0) like Gecko"
|
|
|
+ };
|
|
|
+
|
|
|
+ public static String yougetDownload(String yougetPath, String fileUrl, String filePath) throws Exception {
|
|
|
+ ProcessBuilder processBuilder = new ProcessBuilder(yougetPath, "-d", fileUrl, "-O", filePath);
|
|
|
+ Process process = processBuilder.start();
|
|
|
+ process.waitFor();
|
|
|
+
|
|
|
+ // 创建 BufferedReader 以读取进程的输出
|
|
|
+ BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
|
|
|
+ String line;
|
|
|
+
|
|
|
+ // 创建正则表达式以匹配文件路径
|
|
|
+ Pattern pattern = Pattern.compile("Downloading\\s+(\\S*)");
|
|
|
+
|
|
|
+ // 读取并处理输出
|
|
|
+ while ((line = reader.readLine()) != null) {
|
|
|
+ // System.out.println(line);
|
|
|
+ // 如果输出行包含文件路径,打印路径
|
|
|
+ Matcher matcher = pattern.matcher(line);
|
|
|
+ if (matcher.find()) {
|
|
|
+ filePath = matcher.group(1);
|
|
|
+ System.out.println("Downloaded file path: " + filePath);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return filePath;
|
|
|
+ }
|
|
|
+
|
|
|
public static void download(String fileUrl, String filePath) throws Exception {
|
|
|
+ download(fileUrl, filePath, false);
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void download(String fileUrl, String filePath, boolean useUa) throws Exception {
|
|
|
log.info("begin download [{}] to [{}]", fileUrl, filePath);
|
|
|
URL url = new URL(fileUrl);
|
|
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
|
|
+ if (useUa) {
|
|
|
+ conn.setRequestProperty("User-Agent", getRandomUserAgent());
|
|
|
+ }
|
|
|
if (conn.getResponseCode() == HttpURLConnection.HTTP_FORBIDDEN) {
|
|
|
throw new CommonException(ExceptionEnum.URL_FORBIDDEN);
|
|
|
}
|
|
|
conn.setConnectTimeout(5000);
|
|
|
conn.setReadTimeout(5000);
|
|
|
- conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81");
|
|
|
log.info("download file size is {} of url [{}]", formatFileSize(conn.getContentLength()), fileUrl);
|
|
|
|
|
|
InputStream inputStream = conn.getInputStream();
|
|
@@ -79,4 +189,52 @@ public class FileUtils {
|
|
|
outputStream.close();
|
|
|
log.info("downloaded successfully [{}] to [{}]", fileUrl, filePath);
|
|
|
}
|
|
|
+
|
|
|
+ public static void main(String[] args) throws Exception {
|
|
|
+ // try {
|
|
|
+ download("http://mpvideo.qpic.cn/0bc3zyaagaaaieajowybojsvbtwdaphaaaya.f10002.mp4?dis_k=afa8996b6f4aac67ff2d6b3b7abaa4b4&dis_t=1694751571&play_scene=10120&auth_info=WsS8pdtVOTQL3MuqxRszQlg3FBNoCCQ4PQATPSseNWV8Sz4/BF1kW2kwH14QOSR4Ug==&auth_key=d33c33aa66ca8bd8a05204709a5f92b1&vid=wxv_3103954619094892547&format_id=10002&support_redirect=0&mmversion=false", "/Users/ehlxr/Downloads/" + System.currentTimeMillis(), true);
|
|
|
+ //
|
|
|
+ // } catch (Exception e) {
|
|
|
+ // e.printStackTrace();
|
|
|
+ // }
|
|
|
+ // String s = "/Users/ehlxr/Downloads/" + System.currentTimeMillis();
|
|
|
+ // String filePath = yougetDownload("/Users/ehlxr/Desktop/you-get-0.4.1650/you-get", "http://mpvideo.qpic.cn/0bc3amabsaaa7mamoixezzrfaa6ddebqagia.f10002.mp4?dis_k=683cf1375dd8cd051ebc48c9861dbf65&dis_t=1694751341&play_scene=10120&auth_info=JO3Lo+01NkNXiZupx3kqLiQ0OB8XZmsYYXs4QSNiQDMCV3IxCDlrLDU9D2AcXT0ULg==&auth_key=33f6a5927b824d086369a12ff4a7a635&vid=wxv_2509438417202888705&format_id=10002&support_redirect=0&mmversion=false", s);
|
|
|
+ //
|
|
|
+ // File file = new File(filePath);
|
|
|
+ // System.out.println(file.exists());
|
|
|
+
|
|
|
+ // try {
|
|
|
+ // // 创建和启动进程
|
|
|
+ // ProcessBuilder pb = new ProcessBuilder("/Users/ehlxr/Desktop/you-get-0.4.1650/you-get", "-O", "/Users/ehlxr/Downloads/" + System.currentTimeMillis(), "http://mpvideo.qpic.cn/0bc3zyaagaaaieajowybojsvbtwdaphaaaya.f10002.mp4?dis_k=afa8996b6f4aac67ff2d6b3b7abaa4b4&dis_t=1694751571&play_scene=10120&auth_info=WsS8pdtVOTQL3MuqxRszQlg3FBNoCCQ4PQATPSseNWV8Sz4/BF1kW2kwH14QOSR4Ug==&auth_key=d33c33aa66ca8bd8a05204709a5f92b1&vid=wxv_3103954619094892547&format_id=10002&support_redirect=0&mmversion=false");
|
|
|
+ // pb.redirectErrorStream(true);
|
|
|
+ // Process process = pb.start();
|
|
|
+ //
|
|
|
+ // // 创建 BufferedReader 以读取进程的输出
|
|
|
+ // BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
|
|
|
+ // String line;
|
|
|
+ //
|
|
|
+ // // 创建正则表达式以匹配文件路径
|
|
|
+ // Pattern pattern = Pattern.compile("Downloading (.*)", Pattern.CASE_INSENSITIVE);
|
|
|
+ //
|
|
|
+ // // 读取并处理输出
|
|
|
+ // while ((line = reader.readLine()) != null) {
|
|
|
+ // // 如果输出行包含文件路径,打印路径
|
|
|
+ // Matcher matcher = pattern.matcher(line);
|
|
|
+ // if (matcher.find()) {
|
|
|
+ // System.out.println("Downloaded file path: " + matcher.group(1));
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // // 等待进程结束
|
|
|
+ // process.waitFor();
|
|
|
+ // } catch (Exception e) {
|
|
|
+ // e.printStackTrace();
|
|
|
+ // }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 此方法返回一个随机的用户代理字符串
|
|
|
+ public static String getRandomUserAgent() {
|
|
|
+ int randomIndex = new Random().nextInt(USER_AGENTS.length);
|
|
|
+ return USER_AGENTS[randomIndex];
|
|
|
+ }
|
|
|
}
|