Explorar el Código

增加小程序匹配抓取

xueyiming hace 6 meses
padre
commit
9c6824a545

+ 144 - 3
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/service/local/impl/CrawlerVideoServiceImpl.java

@@ -1,26 +1,167 @@
 package com.tzld.piaoquan.longarticle.service.local.impl;
 
+import com.alibaba.fastjson.JSON;
 import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import com.tzld.piaoquan.longarticle.model.po.CrawlerVideo;
 import com.tzld.piaoquan.longarticle.model.po.LongArticlesText;
+import com.tzld.piaoquan.longarticle.utils.other.DouyinSearch;
+import com.tzld.piaoquan.longarticle.utils.other.HkspSearch;
+import com.tzld.piaoquan.longarticle.utils.other.NlpUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.springframework.stereotype.Service;
+import org.springframework.util.CollectionUtils;
 
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
 import java.util.stream.Collectors;
 
 @Service
 public class CrawlerVideoServiceImpl {
 
-    private static final String default_account_id = "69637498";
+    // User id stamped onto every CrawlerVideo built by dyVideoProduce / baiduVideoProduce.
+    private static final String default_user_id = "69637498";
 
+    // Similarity cut-off: searchVideosFromWeb keeps a result only when its score is strictly greater.
+    private static final Double NLP_SIMILARITY_THRESHOLD = 0.45;
 
-    public void searchVideosFromWeb(LongArticlesText longArticlesText) {
+    // Words passed as the sensitiveWords argument to DouyinSearch.douyinSearch and
+    // HkspSearch.hkspSearch for every query issued by this service.
+    // NOTE(review): how those helpers use the words (presumably to drop matching results) is not
+    // visible from this class — confirm in the search utilities.
+    // NOTE(review): double-brace initialization creates an anonymous ArrayList subclass; a plain
+    // list factory would be cleaner, provided no callee relies on mutating this list.
+    private static final List<String> sensitiveWords = new ArrayList<String>() {{
+        add("人民");
+        add("必胜");
+        add("正义必胜");
+        add("中国");
+        add("老虎");
+        add("人生的扣子");
+        add("共产党");
+        add("总书记");
+        add("政");
+        add("习");
+    }};
+
+
+    public List<CrawlerVideo> searchVideosFromWeb(LongArticlesText longArticlesText) {
         String articleSummary = longArticlesText.getKimiSummary().substring(0, 15);
-        String oriTitle = longArticlesText.getKimiTitle();
+        String oriTitle = longArticlesText.getKimiTitle().substring(0, 15);
         String kimiKeys = longArticlesText.getKimiKeys();
         JSONArray jsonArray = JSONArray.parseArray(kimiKeys);
         String newKimiKeys = jsonArray.stream()
                 .map(Object::toString)
                 .collect(Collectors.joining(","));
+        List<JSONObject> res = new ArrayList<>();
+        List<JSONObject> list0 = searchVideo(articleSummary, sensitiveWords);
+        List<JSONObject> list1 = searchVideo(oriTitle, sensitiveWords);
+        List<JSONObject> list2 = searchVideo(newKimiKeys, sensitiveWords);
+        if (!CollectionUtils.isEmpty(list0)) {
+            res.addAll(list0);
+        }
+        if (!CollectionUtils.isEmpty(list1)) {
+            res.addAll(list1);
+        }
+        if (!CollectionUtils.isEmpty(list2)) {
+            res.addAll(list2);
+        }
+        if (CollectionUtils.isEmpty(res)) {
+            //TODO 搜索失败
+            return null;
+        }
+        res = res.stream().filter(f -> StringUtils.isNotEmpty(f.getString("title"))).collect(Collectors.toList());
+        List<String> titleList = res.stream().map(e -> e.getString("title")).collect(Collectors.toList());
+        List<Float> titleSimilarityWithNlp = getTitleSimilarityWithNlp(oriTitle, titleList);
+        if (CollectionUtils.isEmpty(titleSimilarityWithNlp) || titleSimilarityWithNlp.size() != res.size()) {
+            //TODO 评分失败
+            return null;
+        }
+        for (int i = 0; i < res.size(); i++) {
+            JSONObject jsonObject = res.get(i);
+            jsonObject.put("score", titleSimilarityWithNlp.get(i));
+        }
+        res = res.stream().filter(f -> f.getDouble("score") != null && f.getFloat("score") > NLP_SIMILARITY_THRESHOLD)
+                .collect(Collectors.toList());
+        if (CollectionUtils.isEmpty(res)) {
+            //没有符合评分要求的视频
+            return null;
+        }
+        List<CrawlerVideo> crawlerVideoList = new ArrayList<>();
+        for (JSONObject jsonObject : res) {
+            String platform = jsonObject.getString("platform");
+            if (StringUtils.isEmpty(platform)) {
+                continue;
+            }
+            if ("dy_search".equals(platform)) {
+                crawlerVideoList.add(dyVideoProduce(jsonObject));
+            }
+            if ("baidu_search".equals(platform)) {
+                crawlerVideoList.add(baiduVideoProduce(jsonObject));
+            }
+        }
+        return crawlerVideoList;
+    }
+
+    /**
+     * Scores candidate titles against the original title, asking the base NLP endpoint first and
+     * falling back to the Aliyun endpoint only when the first one yields nothing.
+     *
+     * @param oriTitle  title to compare against
+     * @param titleList candidate titles to score
+     * @return the first non-empty score list obtained, or {@code null} when both endpoints
+     *         return nothing
+     */
+    public List<Float> getTitleSimilarityWithNlp(String oriTitle, List<String> titleList) {
+        List<Float> scores = NlpUtils.baseNlpTitleSimilarity(oriTitle, titleList);
+        if (CollectionUtils.isEmpty(scores)) {
+            scores = NlpUtils.aliyunNlpTitleSimilarity(oriTitle, titleList);
+        }
+        return CollectionUtils.isEmpty(scores) ? null : scores;
+    }
 
 
+    /**
+     * Runs one keyword search, preferring Douyin and falling back to HkspSearch when Douyin
+     * returns nothing. Every returned object is tagged with a {@code "platform"} entry
+     * ({@code "dy_search"} or {@code "baidu_search"}) identifying which search produced it.
+     *
+     * @param text           keyword text to search for
+     * @param sensitiveWords words forwarded unchanged to the search helpers
+     * @return the tagged results of the first search that returned any, or {@code null} when
+     *         both searches come back empty
+     */
+    public List<JSONObject> searchVideo(String text, List<String> sensitiveWords) {
+        List<JSONObject> douyinHits = DouyinSearch.douyinSearch(text, sensitiveWords, "");
+        if (!CollectionUtils.isEmpty(douyinHits)) {
+            douyinHits.forEach(hit -> hit.put("platform", "dy_search"));
+            return douyinHits;
+        }
+
+        List<JSONObject> hkspHits = HkspSearch.hkspSearch(text, sensitiveWords, "");
+        if (CollectionUtils.isEmpty(hkspHits)) {
+            return null;
+        }
+        hkspHits.forEach(hit -> hit.put("platform", "baidu_search"));
+        return hkspHits;
+    }
+
+    /**
+     * Converts one Douyin search result into a {@link CrawlerVideo}.
+     *
+     * <p>The video URL and duration come from the first entry of {@code video_url_list} and the
+     * cover from the first entry of {@code image_url_list}; a result lacking either list fails
+     * with a runtime exception, as before.
+     *
+     * @param jsonObject Douyin result, already tagged with {@code "platform"} and {@code "score"}
+     *                   by the search pipeline
+     * @return the populated video, owned by the default user id
+     */
+    public CrawlerVideo dyVideoProduce(JSONObject jsonObject) {
+        CrawlerVideo crawlerVideo = new CrawlerVideo();
+        // Fix: this used to read key "c", which nothing in this class writes; searchVideo stores
+        // the source tag under "platform", the key every other converter reads.
+        crawlerVideo.setPlatform(jsonObject.getString("platform"));
+        crawlerVideo.setVideoTitle(jsonObject.getString("title"));
+        crawlerVideo.setOutVideoId(jsonObject.getString("channel_content_id"));
+        Long publishTimestamp = jsonObject.getLong("publish_timestamp");
+        if (publishTimestamp != null) {
+            // NOTE(review): passed to Date unchanged, i.e. treated as epoch milliseconds, whereas
+            // baiduVideoProduce multiplies its timestamp by 1000 — confirm the unit.
+            crawlerVideo.setPublishTime(new Date(publishTimestamp));
+        }
+        // Look the first video entry up once; it supplies both the URL and the duration.
+        JSONObject firstVideo = jsonObject.getJSONArray("video_url_list").getJSONObject(0);
+        crawlerVideo.setVideoUrl(firstVideo.getString("video_url"));
+        crawlerVideo.setCoverUrl(jsonObject.getJSONArray("image_url_list").getJSONObject(0).getString("image_url"));
+        crawlerVideo.setPlayCount(jsonObject.getInteger("play_count"));
+        crawlerVideo.setLikeCount(jsonObject.getInteger("like_count"));
+        crawlerVideo.setShareCount(jsonObject.getInteger("share_count"));
+        crawlerVideo.setDuration(firstVideo.getInteger("video_duration"));
+        crawlerVideo.setScore(jsonObject.getFloat("score"));
+        crawlerVideo.setUserId(default_user_id);
+        return crawlerVideo;
+    }
+
+    /**
+     * Converts one HkspSearch ("baidu_search") result into a {@link CrawlerVideo}.
+     *
+     * @param jsonObject search result, already tagged with {@code "platform"} and {@code "score"}
+     * @return the populated video, owned by the default user id; like count defaults to 0 when
+     *         the result has no {@code "like"} value
+     */
+    public CrawlerVideo baiduVideoProduce(JSONObject jsonObject) {
+        CrawlerVideo video = new CrawlerVideo();
+        video.setUserId(default_user_id);
+        video.setPlatform(jsonObject.getString("platform"));
+        video.setOutVideoId(jsonObject.getString("id"));
+        video.setVideoTitle(jsonObject.getString("title"));
+        video.setVideoUrl(jsonObject.getString("playurl"));
+        video.setCoverUrl(jsonObject.getString("poster"));
+
+        Long publishTime = jsonObject.getLong("publish_time");
+        if (publishTime != null) {
+            // Scaled by 1000 before building the Date (presumably seconds -> milliseconds).
+            video.setPublishTime(new Date(publishTime * 1000));
+        }
+
+        Integer likes = jsonObject.getInteger("like");
+        video.setLikeCount(likes == null ? 0 : likes);
+        video.setPlayCount(jsonObject.getInteger("playcnt"));
+        video.setDuration(jsonObject.getInteger("duration"));
+        video.setScore(jsonObject.getFloat("score"));
+        return video;
     }
 }

+ 26 - 1
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/DouyinSearch.java

@@ -4,15 +4,40 @@ import cn.hutool.http.HttpRequest;
 import cn.hutool.http.HttpResponse;
 import com.alibaba.fastjson.JSONArray;
 import com.alibaba.fastjson.JSONObject;
+import com.tzld.piaoquan.longarticle.model.po.CrawlerVideo;
+import org.apache.commons.lang3.StringUtils;
 
 import java.util.ArrayList;
+import java.util.Date;
 import java.util.List;
 
 public class DouyinSearch {
 
+    /**
+     * Builds a {@link CrawlerVideo} from one Douyin search result. URL and duration are taken
+     * from the first {@code video_url_list} entry, the cover from the first
+     * {@code image_url_list} entry.
+     *
+     * @param jsonObject one element returned by {@code douyinSearch}
+     * @return the populated video
+     */
+    public static CrawlerVideo dyVideoProduce(JSONObject jsonObject) {
+        CrawlerVideo video = new CrawlerVideo();
+        video.setPlatform(jsonObject.getString("platform"));
+        video.setVideoTitle(jsonObject.getString("title"));
+        video.setOutVideoId(jsonObject.getString("channel_content_id"));
+
+        Long publishTimestamp = jsonObject.getLong("publish_timestamp");
+        if (publishTimestamp != null) {
+            video.setPublishTime(new Date(publishTimestamp));
+        }
+
+        JSONObject firstVideo = jsonObject.getJSONArray("video_url_list").getJSONObject(0);
+        JSONObject firstImage = jsonObject.getJSONArray("image_url_list").getJSONObject(0);
+        video.setVideoUrl(firstVideo.getString("video_url"));
+        video.setCoverUrl(firstImage.getString("image_url"));
+        video.setPlayCount(jsonObject.getInteger("play_count"));
+        video.setLikeCount(jsonObject.getInteger("like_count"));
+        video.setDuration(firstVideo.getInteger("video_duration"));
+        video.setScore(jsonObject.getFloat("score"));
+        return video;
+    }
+
     public static void main(String[] args) {
+        // Manual smoke test: runs a live Douyin search for a fixed keyword with no sensitive
+        // words, then prints each raw result followed by its CrawlerVideo conversion.
         List<String> sensitiveWords = new ArrayList<>();
-        System.out.println(douyinSearch("你好", sensitiveWords, ""));
+        // NOTE(review): callers elsewhere null-check the search result; if douyinSearch can
+        // return null, this loop would throw — confirm.
+        List<JSONObject> list = douyinSearch("你好", sensitiveWords, "");
+        for (JSONObject jsonObject : list){
+            System.out.println(jsonObject);
+            CrawlerVideo crawlerVideo = dyVideoProduce(jsonObject);
+            System.out.println(crawlerVideo);
+        }
     }
 
     public static List<JSONObject> douyinSearch(String keyword, List<String> sensitiveWords, String traceId) {

+ 25 - 5
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/HkspSearch.java

@@ -4,22 +4,42 @@ import cn.hutool.http.HttpRequest;
 import cn.hutool.http.HttpResponse;
 import com.alibaba.fastjson.JSONArray;
 import com.alibaba.fastjson.JSONObject;
+import com.tzld.piaoquan.longarticle.model.po.CrawlerVideo;
 
 
 import java.io.IOException;
 import java.net.*;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
-import java.util.ArrayList;
-import java.util.Base64;
-import java.util.List;
-import java.util.UUID;
+import java.util.*;
 
 public class HkspSearch {
 
+
+    /**
+     * Builds a {@link CrawlerVideo} from one {@code hkspSearch} result. The like count falls
+     * back to 0 when the result carries no {@code "like"} value.
+     *
+     * @param jsonObject one element returned by {@code hkspSearch}
+     * @return the populated video
+     */
+    public static CrawlerVideo baiduVideoProduce(JSONObject jsonObject) {
+        CrawlerVideo video = new CrawlerVideo();
+        video.setPlatform(jsonObject.getString("platform"));
+        video.setOutVideoId(jsonObject.getString("id"));
+        video.setVideoTitle(jsonObject.getString("title"));
+        video.setVideoUrl(jsonObject.getString("playurl"));
+        video.setCoverUrl(jsonObject.getString("poster"));
+
+        Long publishTime = jsonObject.getLong("publish_time");
+        if (publishTime != null) {
+            // Scaled by 1000 before building the Date (presumably seconds -> milliseconds).
+            video.setPublishTime(new Date(publishTime * 1000));
+        }
+
+        Integer likes = jsonObject.getInteger("like");
+        video.setLikeCount(likes == null ? 0 : likes);
+        video.setPlayCount(jsonObject.getInteger("playcnt"));
+        video.setDuration(jsonObject.getInteger("duration"));
+        return video;
+    }
+
     public static void main(String[] args) {
+        // Manual smoke test: runs a live hkspSearch for a fixed keyword with no sensitive words,
+        // converts each result, then prints the raw JSON followed by the CrawlerVideo.
         List<String> sensitiveWords = new ArrayList<>();
-        System.out.println(hkspSearch("你好", sensitiveWords, ""));
+        // NOTE(review): callers elsewhere null-check the search result; if hkspSearch can
+        // return null, this loop would throw — confirm.
+        List<JSONObject> list = hkspSearch("你好", sensitiveWords, "");
+        for (JSONObject jsonObject : list){
+            CrawlerVideo crawlerVideo = baiduVideoProduce(jsonObject);
+            System.out.println(jsonObject);
+            System.out.println(crawlerVideo);
+        }
     }
 
     public static List<JSONObject> hkspSearch(String key, List<String> sensitiveWords, String traceId) {

+ 64 - 0
long-article-server/src/main/java/com/tzld/piaoquan/longarticle/utils/other/NlpUtils.java

@@ -0,0 +1,64 @@
+package com.tzld.piaoquan.longarticle.utils.other;
+
+import cn.hutool.http.HttpRequest;
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.util.CollectionUtils;
+
+import java.math.BigDecimal;
+import java.util.*;
+
+@Slf4j
+public class NlpUtils {
+
+    // Endpoint used by baseNlpTitleSimilarity.
+    // NOTE(review): hard-coded plain-HTTP IP address; consider moving to configuration.
+    private static final String nlp_base_url = "http://61.48.133.26:6060/nlp";
+
+    // Endpoint used by aliyunNlpTitleSimilarity.
+    private static final String nlp_aliyun_url = "http://47.98.136.48:6060/nlp";
+
+    /** Manual smoke test: scores three sample titles against a fixed title on the base endpoint. */
+    public static void main(String[] args) {
+        List<String> titleList = new ArrayList<>(Arrays.asList(
+                "在女儿出嫁的那神圣时刻,交接仪式上的父亲。。。",
+                "转载:““和老婆置气,我去女儿家避难。女。。。",
+                "“姨夫”和“姑父”哪个亲为何一个是父亲的父,一个是丈夫的夫"));
+        List<Float> scores = nlpTitleSimilarity(nlp_base_url, "这是我的女儿", titleList);
+        System.out.println(scores);
+    }
+
+    /**
+     * Scores {@code titleList} against {@code oriTitle} using the base NLP endpoint.
+     *
+     * @return whatever {@code nlpTitleSimilarity} returns for that endpoint (may be {@code null})
+     */
+    public static List<Float> baseNlpTitleSimilarity(String oriTitle, List<String> titleList) {
+        final String endpoint = nlp_base_url;
+        return nlpTitleSimilarity(endpoint, oriTitle, titleList);
+    }
+
+    /**
+     * Scores {@code titleList} against {@code oriTitle} using the Aliyun NLP endpoint.
+     *
+     * @return whatever {@code nlpTitleSimilarity} returns for that endpoint (may be {@code null})
+     */
+    public static List<Float> aliyunNlpTitleSimilarity(String oriTitle, List<String> titleList) {
+        final String endpoint = nlp_aliyun_url;
+        return nlpTitleSimilarity(endpoint, oriTitle, titleList);
+    }
+
+    public static List<Float> nlpTitleSimilarity(String url, String oriTitle, List<String> titleList) {
+        List<String> oriTitles = Collections.singletonList(oriTitle);
+
+        JSONObject param = new JSONObject();
+        JSONObject data = new JSONObject();
+        data.put("text_list_a", oriTitles);
+        data.put("text_list_b", titleList);
+        param.put("data", data);
+        param.put("function", "similarities_cross");
+        param.put("use_cache", false);
+        try {
+            String body = HttpRequest.post(url)
+                    .header("Content-Type", "application/json")
+                    .body(JSONObject.toJSONString(param))
+                    .execute()
+                    .body();
+            JSONObject jsonObject = JSONObject.parseObject(body);
+            JSONArray scoreListList = jsonObject.getJSONArray("score_list_list").getJSONArray(0);
+            if (!CollectionUtils.isEmpty(scoreListList)) {
+                return scoreListList.toJavaList(Float.class);
+            }
+        } catch (Exception e) {
+            log.error("NlpUtils nlpTitleSimilarity error");
+        }
+        return null;
+    }
+}