|
@@ -1,26 +1,167 @@
|
|
|
package com.tzld.piaoquan.longarticle.service.local.impl;
|
|
|
|
|
|
+import com.alibaba.fastjson.JSON;
|
|
|
import com.alibaba.fastjson.JSONArray;
|
|
|
+import com.alibaba.fastjson.JSONObject;
|
|
|
+import com.tzld.piaoquan.longarticle.model.po.CrawlerVideo;
|
|
|
import com.tzld.piaoquan.longarticle.model.po.LongArticlesText;
|
|
|
+import com.tzld.piaoquan.longarticle.utils.other.DouyinSearch;
|
|
|
+import com.tzld.piaoquan.longarticle.utils.other.HkspSearch;
|
|
|
+import com.tzld.piaoquan.longarticle.utils.other.NlpUtils;
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
|
import org.springframework.stereotype.Service;
|
|
|
+import org.springframework.util.CollectionUtils;
|
|
|
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.Date;
|
|
|
+import java.util.List;
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
@Service
|
|
|
public class CrawlerVideoServiceImpl {
|
|
|
|
|
|
- private static final String default_account_id = "69637498";
|
|
|
+ private static final String default_user_id = "69637498";
|
|
|
|
|
|
+ private static final Double NLP_SIMILARITY_THRESHOLD = 0.45;
|
|
|
|
|
|
- public void searchVideosFromWeb(LongArticlesText longArticlesText) {
|
|
|
+ private static final List<String> sensitiveWords = new ArrayList<String>() {{
|
|
|
+ add("人民");
|
|
|
+ add("必胜");
|
|
|
+ add("正义必胜");
|
|
|
+ add("中国");
|
|
|
+ add("老虎");
|
|
|
+ add("人生的扣子");
|
|
|
+ add("共产党");
|
|
|
+ add("总书记");
|
|
|
+ add("政");
|
|
|
+ add("习");
|
|
|
+ }};
|
|
|
+
|
|
|
+
|
|
|
+ public List<CrawlerVideo> searchVideosFromWeb(LongArticlesText longArticlesText) {
|
|
|
String articleSummary = longArticlesText.getKimiSummary().substring(0, 15);
|
|
|
- String oriTitle = longArticlesText.getKimiTitle();
|
|
|
+ String oriTitle = longArticlesText.getKimiTitle().substring(0, 15);
|
|
|
String kimiKeys = longArticlesText.getKimiKeys();
|
|
|
JSONArray jsonArray = JSONArray.parseArray(kimiKeys);
|
|
|
String newKimiKeys = jsonArray.stream()
|
|
|
.map(Object::toString)
|
|
|
.collect(Collectors.joining(","));
|
|
|
+ List<JSONObject> res = new ArrayList<>();
|
|
|
+ List<JSONObject> list0 = searchVideo(articleSummary, sensitiveWords);
|
|
|
+ List<JSONObject> list1 = searchVideo(oriTitle, sensitiveWords);
|
|
|
+ List<JSONObject> list2 = searchVideo(newKimiKeys, sensitiveWords);
|
|
|
+ if (!CollectionUtils.isEmpty(list0)) {
|
|
|
+ res.addAll(list0);
|
|
|
+ }
|
|
|
+ if (!CollectionUtils.isEmpty(list1)) {
|
|
|
+ res.addAll(list1);
|
|
|
+ }
|
|
|
+ if (!CollectionUtils.isEmpty(list2)) {
|
|
|
+ res.addAll(list2);
|
|
|
+ }
|
|
|
+ if (CollectionUtils.isEmpty(res)) {
|
|
|
+ //TODO 搜索失败
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ res = res.stream().filter(f -> StringUtils.isNotEmpty(f.getString("title"))).collect(Collectors.toList());
|
|
|
+ List<String> titleList = res.stream().map(e -> e.getString("title")).collect(Collectors.toList());
|
|
|
+ List<Float> titleSimilarityWithNlp = getTitleSimilarityWithNlp(oriTitle, titleList);
|
|
|
+ if (CollectionUtils.isEmpty(titleSimilarityWithNlp) || titleSimilarityWithNlp.size() != res.size()) {
|
|
|
+ //TODO 评分失败
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ for (int i = 0; i < res.size(); i++) {
|
|
|
+ JSONObject jsonObject = res.get(i);
|
|
|
+ jsonObject.put("score", titleSimilarityWithNlp.get(i));
|
|
|
+ }
|
|
|
+ res = res.stream().filter(f -> f.getDouble("score") != null && f.getFloat("score") > NLP_SIMILARITY_THRESHOLD)
|
|
|
+ .collect(Collectors.toList());
|
|
|
+ if (CollectionUtils.isEmpty(res)) {
|
|
|
+ //没有符合评分要求的视频
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ List<CrawlerVideo> crawlerVideoList = new ArrayList<>();
|
|
|
+ for (JSONObject jsonObject : res) {
|
|
|
+ String platform = jsonObject.getString("platform");
|
|
|
+ if (StringUtils.isEmpty(platform)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if ("dy_search".equals(platform)) {
|
|
|
+ crawlerVideoList.add(dyVideoProduce(jsonObject));
|
|
|
+ }
|
|
|
+ if ("baidu_search".equals(platform)) {
|
|
|
+ crawlerVideoList.add(baiduVideoProduce(jsonObject));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return crawlerVideoList;
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<Float> getTitleSimilarityWithNlp(String oriTitle, List<String> titleList) {
|
|
|
+ List<Float> baseScores = NlpUtils.baseNlpTitleSimilarity(oriTitle, titleList);
|
|
|
+ if (!CollectionUtils.isEmpty(baseScores)) {
|
|
|
+ return baseScores;
|
|
|
+ }
|
|
|
+ List<Float> aliyunScores = NlpUtils.aliyunNlpTitleSimilarity(oriTitle, titleList);
|
|
|
+ if (!CollectionUtils.isEmpty(aliyunScores)) {
|
|
|
+ return aliyunScores;
|
|
|
+ }
|
|
|
+ return null;
|
|
|
+ }
|
|
|
|
|
|
|
|
|
+ public List<JSONObject> searchVideo(String text, List<String> sensitiveWords) {
|
|
|
+ List<JSONObject> res;
|
|
|
+ res = DouyinSearch.douyinSearch(text, sensitiveWords, "");
|
|
|
+ if (!CollectionUtils.isEmpty(res)) {
|
|
|
+ for (JSONObject jsonObject : res) {
|
|
|
+ jsonObject.put("platform", "dy_search");
|
|
|
+ }
|
|
|
+ return res;
|
|
|
+ }
|
|
|
+ res = HkspSearch.hkspSearch(text, sensitiveWords, "");
|
|
|
+ if (!CollectionUtils.isEmpty(res)) {
|
|
|
+ for (JSONObject jsonObject : res) {
|
|
|
+ jsonObject.put("platform", "baidu_search");
|
|
|
+ }
|
|
|
+ return res;
|
|
|
+ }
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ public CrawlerVideo dyVideoProduce(JSONObject jsonObject) {
|
|
|
+ CrawlerVideo crawlerVideo = new CrawlerVideo();
|
|
|
+ crawlerVideo.setPlatform(jsonObject.getString("c"));
|
|
|
+ crawlerVideo.setVideoTitle(jsonObject.getString("title"));
|
|
|
+ crawlerVideo.setOutVideoId(jsonObject.getString("channel_content_id"));
|
|
|
+ if (jsonObject.getLong("publish_timestamp") != null) {
|
|
|
+ crawlerVideo.setPublishTime(new Date(jsonObject.getLong("publish_timestamp")));
|
|
|
+ }
|
|
|
+ crawlerVideo.setVideoUrl(jsonObject.getJSONArray("video_url_list").getJSONObject(0).getString("video_url"));
|
|
|
+ crawlerVideo.setCoverUrl(jsonObject.getJSONArray("image_url_list").getJSONObject(0).getString("image_url"));
|
|
|
+ crawlerVideo.setPlayCount(jsonObject.getInteger("play_count"));
|
|
|
+ crawlerVideo.setLikeCount(jsonObject.getInteger("like_count"));
|
|
|
+ crawlerVideo.setShareCount(jsonObject.getInteger("share_count"));
|
|
|
+ crawlerVideo.setDuration(jsonObject.getJSONArray("video_url_list").getJSONObject(0).getInteger("video_duration"));
|
|
|
+ crawlerVideo.setScore(jsonObject.getFloat("score"));
|
|
|
+ crawlerVideo.setUserId(default_user_id);
|
|
|
+ return crawlerVideo;
|
|
|
+ }
|
|
|
+
|
|
|
+ public CrawlerVideo baiduVideoProduce(JSONObject jsonObject) {
|
|
|
+ CrawlerVideo crawlerVideo = new CrawlerVideo();
|
|
|
+ crawlerVideo.setPlatform(jsonObject.getString("platform"));
|
|
|
+ crawlerVideo.setVideoTitle(jsonObject.getString("title"));
|
|
|
+ crawlerVideo.setOutVideoId(jsonObject.getString("id"));
|
|
|
+ if (jsonObject.getLong("publish_time") != null) {
|
|
|
+ crawlerVideo.setPublishTime(new Date(jsonObject.getLong("publish_time") * 1000));
|
|
|
+ }
|
|
|
+ crawlerVideo.setVideoUrl(jsonObject.getString("playurl"));
|
|
|
+ crawlerVideo.setCoverUrl(jsonObject.getString("poster"));
|
|
|
+ crawlerVideo.setLikeCount(jsonObject.getInteger("like") == null ? 0 : jsonObject.getInteger("like"));
|
|
|
+ crawlerVideo.setPlayCount(jsonObject.getInteger("playcnt"));
|
|
|
+ crawlerVideo.setDuration(jsonObject.getInteger("duration"));
|
|
|
+ crawlerVideo.setScore(jsonObject.getFloat("score"));
|
|
|
+ crawlerVideo.setUserId(default_user_id);
|
|
|
+ return crawlerVideo;
|
|
|
}
|
|
|
}
|