Переглянути джерело

Merge branch 'feature_similarity' of algorithm/recommend-server into master

dingyunpeng 6 місяців тому
батько
коміт
b6c825733b

+ 5 - 0
recommend-server-service/pom.xml

@@ -265,6 +265,11 @@
             <artifactId>spark-mllib_2.12</artifactId>
             <version>3.3.1</version>
         </dependency>
+        <dependency>
+            <groupId>com.tzld.piaoquan</groupId>
+            <artifactId>recommend-similarity</artifactId>
+            <version>1.0.0</version>
+        </dependency>
     </dependencies>
 
 

+ 3 - 0
recommend-server-service/src/main/java/com/tzld/piaoquan/recommend/server/service/WarmUpService.java

@@ -1,6 +1,8 @@
 package com.tzld.piaoquan.recommend.server.service;
 
 import com.tzld.piaoquan.recommend.server.repository.WxVideoStatusRepository;
+import com.tzld.piaoquan.recommend.server.util.SimilarityUtils;
+import com.tzld.piaoquan.recommend.similarity.word2vec.Segment;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Qualifier;
 import org.springframework.core.annotation.Order;
@@ -35,5 +37,6 @@ public class WarmUpService {
         com.tzld.piaoquan.recommend.server.service.score.ScorerUtils.warmUp();
         com.tzld.piaoquan.recommend.server.framework.score.ScorerUtils.warmUp();
         wxVideoStatusRepository.count();
+        SimilarityUtils.init();
     }
 }

+ 3 - 0
recommend-server-service/src/main/java/com/tzld/piaoquan/recommend/server/service/rank/RankService.java

@@ -50,6 +50,9 @@ public class RankService {
     @ApolloJsonValue("${region.recall.return.size:{}}")
     protected Map<String, Map<String, Integer>> regionRecallReturnSize;
 
+    @Value("${word2vec.exp:691}")
+    protected String word2vecExp;
+
     public RankResult rank(RankParam param) {
         if (param == null
                 || param.getRecallResult() == null

+ 29 - 0
recommend-server-service/src/main/java/com/tzld/piaoquan/recommend/server/service/rank/extractor/ExtractorUtils.java

@@ -5,6 +5,8 @@ import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+
+import com.tzld.piaoquan.recommend.server.util.SimilarityUtils;
 import org.xm.Similarity;
 public class ExtractorUtils {
 
@@ -59,6 +61,33 @@ public class ExtractorUtils {
         Double[] result = {(double) d1, d3, d4};
         return result;
     }
+
+    public static Double[] funcC34567ForTagsNew(String tags, String title) {
+        String[] tagsList = tags.split(",");
+        int d1 = 0;
+        List<String> d2 = new ArrayList<>();
+        double d3 = 0.0;
+        double d4 = 0.0;
+
+        for (String tag : tagsList) {
+            if (title.contains(tag)) {
+                d1++;
+                d2.add(tag);
+            }
+            double score = SimilarityUtils.word2VecSimilarity(tag, title);
+            if (score > d3) {
+                d3 = score;
+            }
+            d4 += score;
+        }
+
+        d4 = (tagsList.length > 0) ? d4 / tagsList.length : d4;
+
+        // 使用数组来返回多个值
+        Double[] result = {(double) d1, d3, d4};
+        return result;
+    }
+
     public static Double calDiv(double a, double b){
         if (a == 0 || b == 0){
             return 0D;

+ 22 - 17
recommend-server-service/src/main/java/com/tzld/piaoquan/recommend/server/service/rank/strategy/RankStrategy4RegionMergeModelV562.java

@@ -10,6 +10,7 @@ import com.tzld.piaoquan.recommend.server.service.rank.extractor.ExtractorUtils;
 import com.tzld.piaoquan.recommend.server.service.recall.strategy.*;
 import com.tzld.piaoquan.recommend.server.service.score.ScorerUtils;
 import com.tzld.piaoquan.recommend.server.util.CommonCollectionUtils;
+import com.tzld.piaoquan.recommend.server.util.JSONUtils;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.collections4.MapUtils;
 import org.apache.commons.math3.util.Pair;
@@ -32,17 +33,13 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
     @ApolloJsonValue("${rank.score.merge.weightv562:}")
     private Map<String, Double> mergeWeight;
 
-
-
-
-
     @Autowired
     private FeatureService featureService;
 
     Map<String, double[]> bucketsMap = new HashMap<>();
     Map<String, Double> bucketsLen = new HashMap<>();
 
-    @Value("${similarity.concurrent: false}")
+    @Value("${similarity.concurrent: true}")
     private boolean similarityConcurrent;
 
     @Override
@@ -70,13 +67,13 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
         List<Video> rovRecallRank = new ArrayList<>(v0);
         //-------------------return相似召回------------------
         List<Video> v6 = extractAndSort(param, ReturnVideoRecallStrategy.PUSH_FORM);
-        v6 = v6.stream().filter(r-> !setVideo.contains(r.getVideoId())).collect(Collectors.toList());
+        v6 = v6.stream().filter(r -> !setVideo.contains(r.getVideoId())).collect(Collectors.toList());
         v6 = v6.subList(0, Math.min(mergeWeight.getOrDefault("v6", 5.0).intValue(), v6.size()));
         rovRecallRank.addAll(v6);
         setVideo.addAll(v6.stream().map(Video::getVideoId).collect(Collectors.toSet()));
         //-------------------新地域召回------------------
         List<Video> v1 = extractAndSort(param, RegionRealtimeRecallStrategyV1.PUSH_FORM);
-        v1 = v1.stream().filter(r-> !setVideo.contains(r.getVideoId())).collect(Collectors.toList());
+        v1 = v1.stream().filter(r -> !setVideo.contains(r.getVideoId())).collect(Collectors.toList());
         v1 = v1.subList(0, Math.min(mergeWeight.getOrDefault("v1", 5.0).intValue(), v1.size()));
         rovRecallRank.addAll(v1);
         setVideo.addAll(v1.stream().map(Video::getVideoId).collect(Collectors.toSet()));
@@ -241,7 +238,12 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
                             String tags = c34567Map.getOrDefault(key, "");
                             if (!tags.isEmpty()) {
                                 Future<Pair<String, Double[]>> future = ThreadPoolFactory.defaultPool().submit(() -> {
-                                    Double[] doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                    Double[] doubles = null;
+                                    if (param.getAbExpCodes().contains(word2vecExp)) {
+                                        doubles = ExtractorUtils.funcC34567ForTagsNew(tags, title);
+                                    } else {
+                                        doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                    }
                                     return Pair.create(key, doubles);
                                 });
                                 futures.add(future);
@@ -263,7 +265,12 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
                         for (String key_time : Arrays.asList("tags_1d", "tags_3d", "tags_7d")) {
                             String tags = c34567Map.getOrDefault(name + "_" + key_time, "");
                             if (!tags.isEmpty()) {
-                                Double[] doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                Double[] doubles = null;
+                                if (param.getAbExpCodes().contains(word2vecExp)) {
+                                    doubles = ExtractorUtils.funcC34567ForTagsNew(tags, title);
+                                } else {
+                                    doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                }
                                 featureMap.put(name + "_" + key_time + "_matchnum", doubles[0]);
                                 featureMap.put(name + "_" + key_time + "_maxscore", doubles[1]);
                                 featureMap.put(name + "_" + key_time + "_avgscore", doubles[2]);
@@ -362,7 +369,6 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
         weightList.add(mergeWeight.getOrDefault("h1_ago_vov_w", 0.0));
 
 
-
         Map<String, Map<String, String>> vid2MapFeature = this.getVideoRedisFeature(vids, "redis:vid_hasreturn_rov:");
         Map<String, Map<String, String>> vid2VovFeatureMap = this.getVideoRedisFeature(vids, "redis:vid_vovhour4rank:");
         List<Video> result = new ArrayList<>();
@@ -411,7 +417,6 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
             item.getScoresMap().put("vov_thresh", vov_thresh);
 
 
-
             List<Double> featureList = new ArrayList<>(7);
             featureList.add(d2_ago_vov);
             featureList.add(d1_ago_vov);
@@ -424,13 +429,13 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
             // todo 线性加权 预测VoV
 
 
-            double vov_p = calculateScore(featureList, weightList,item,  vov_thresh, view_thresh, h1_ago_view,level50_vov,level_95_vov,beta_vov);
+            double vov_p = calculateScore(featureList, weightList, item, vov_thresh, view_thresh, h1_ago_view, level50_vov, level_95_vov, beta_vov);
 
 
             double hasReturnRovScore = Double.parseDouble(vid2MapFeature.getOrDefault(item.getVideoId() + "", new HashMap<>())
                     .getOrDefault("rate_n", "0"));
             item.getScoresMap().put("hasReturnRovScore", hasReturnRovScore);
-            score = fmRov * (1 + hasReturnRovScore)  * (1.0 + alpha_vov * vov_p);
+            score = fmRov * (1 + hasReturnRovScore) * (1.0 + alpha_vov * vov_p);
 
 
             item.getScoresMap().put("vov_p", vov_p);
@@ -457,8 +462,8 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
     }
 
 
-    private  double calculateScore(List<Double> featureList, List<Double> weightList,RankItem rankItem,
-                                        double vov_thresh, double view_thresh, double h1_ago_view,double level50_vov,double level_95_vov,double beta_vov) {
+    private double calculateScore(List<Double> featureList, List<Double> weightList, RankItem rankItem,
+                                  double vov_thresh, double view_thresh, double h1_ago_view, double level50_vov, double level_95_vov, double beta_vov) {
         // 检查 h1_ago_view 条件
         if (h1_ago_view == -2 || h1_ago_view == -1 || h1_ago_view < view_thresh) {
             rankItem.getScoresMap().put("origin_vov_p", 0d);
@@ -497,8 +502,8 @@ public class RankStrategy4RegionMergeModelV562 extends RankStrategy4RegionMergeM
         if (score < vov_thresh) {
             score = 0;
         } else {
-            double term1 = 1 / (1 + Math.exp(-1*beta_vov * (score - level50_vov)));
-            double term2 = 1 + Math.exp(-1*beta_vov * (level_95_vov - level50_vov));
+            double term1 = 1 / (1 + Math.exp(-1 * beta_vov * (score - level50_vov)));
+            double term2 = 1 + Math.exp(-1 * beta_vov * (level_95_vov - level50_vov));
             score = term1 * term2;
         }
         return score;

+ 14 - 3
recommend-server-service/src/main/java/com/tzld/piaoquan/recommend/server/service/rank/strategy/RankStrategy4RegionMergeModelV565.java

@@ -236,7 +236,12 @@ public class RankStrategy4RegionMergeModelV565 extends RankStrategy4RegionMergeM
                             String tags = c34567Map.getOrDefault(key, "");
                             if (!tags.isEmpty()) {
                                 Future<Pair<String, Double[]>> future = ThreadPoolFactory.defaultPool().submit(() -> {
-                                    Double[] doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                    Double[] doubles;
+                                    if (param.getAbExpCodes().contains(word2vecExp)) {
+                                        doubles = ExtractorUtils.funcC34567ForTagsNew(tags, title);
+                                    } else {
+                                        doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                    }
                                     return Pair.create(key, doubles);
                                 });
                                 futures.add(future);
@@ -258,7 +263,12 @@ public class RankStrategy4RegionMergeModelV565 extends RankStrategy4RegionMergeM
                         for (String key_time : Arrays.asList("tags_1d", "tags_3d", "tags_7d")) {
                             String tags = c34567Map.getOrDefault(name + "_" + key_time, "");
                             if (!tags.isEmpty()) {
-                                Double[] doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                Double[] doubles = null;
+                                if (param.getAbExpCodes().contains(word2vecExp)) {
+                                    doubles = ExtractorUtils.funcC34567ForTagsNew(tags, title);
+                                } else {
+                                    doubles = ExtractorUtils.funcC34567ForTags(tags, title);
+                                }
                                 featureMap.put(name + "_" + key_time + "_matchnum", doubles[0]);
                                 featureMap.put(name + "_" + key_time + "_maxscore", doubles[1]);
                                 featureMap.put(name + "_" + key_time + "_avgscore", doubles[2]);
@@ -451,7 +461,8 @@ public class RankStrategy4RegionMergeModelV565 extends RankStrategy4RegionMergeM
 
 
     private double calculateScore(List<Double> featureList, List<Double> weightList, RankItem rankItem,
-                                  double vov_thresh, double view_thresh, double h1_ago_view, double level50_vov, double level_95_vov, double beta_vov) {
+                                  double vov_thresh, double view_thresh, double h1_ago_view, double level50_vov, double level_95_vov,
+                                  double beta_vov) {
         // 检查 h1_ago_view 条件
         if (h1_ago_view == -2 || h1_ago_view == -1 || h1_ago_view < view_thresh) {
             rankItem.getScoresMap().put("origin_vov_p", 0d);

+ 63 - 0
recommend-server-service/src/main/java/com/tzld/piaoquan/recommend/server/util/SimilarityUtils.java

@@ -0,0 +1,63 @@
+package com.tzld.piaoquan.recommend.server.util;
+
+import com.tzld.piaoquan.recommend.similarity.word2vec.Segment;
+import com.tzld.piaoquan.recommend.similarity.word2vec.Word2Vec;
+import lombok.extern.slf4j.Slf4j;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+/**
+ * @author dyp
+ */
+@Slf4j
+public final class SimilarityUtils {
+
+    private static Word2Vec vec = new Word2Vec();
+
+    private static final AtomicBoolean modelLoaded = new AtomicBoolean(false);
+    private static final AtomicBoolean init = new AtomicBoolean(false);
+
+    private static final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
+
+    public static void init() {
+        if (init.compareAndSet(false, true)) {
+            Segment.getWords("1");
+            scheduler.scheduleAtFixedRate(() -> {
+                try {
+                    long start = System.currentTimeMillis();
+                    String endpoint = PropertiesUtil.getString("oss.endpoint");
+                    String bucketName = "art-recommend";
+                    String path = "similarity/word2vec/Google_word2vec_zhwiki210720_300d.bin";
+                    String accessKeyId = "LTAIP6x1l3DXfSxm";
+                    String accessKetSecret = "KbTaM9ars4OX3PMS6Xm7rtxGr1FLon";
+                    Word2Vec temp = new Word2Vec();
+                    temp.loadGoogleModelFromOss(endpoint, bucketName, path, accessKeyId, accessKetSecret);
+                    vec = temp;
+                    long end = System.currentTimeMillis();
+
+                    if (modelLoaded.compareAndSet(false, true)) {
+                        scheduler.shutdown();
+                        log.info("Model loaded successfully cost {}. Scheduled tasks cancelled.", end - start);
+                    }
+
+                } catch (IOException e) {
+                    log.error("loadGoogleModelFromOss error", e);
+                }
+            }, 0, 5, TimeUnit.MINUTES);
+        }
+    }
+
+
+    public static float word2VecSimilarity(String str1, String str2) {
+        List<String> words1 = Segment.getWords(str1);
+        List<String> words2 = Segment.getWords(str2);
+        return vec.sentenceSimilarity(words1, words2);
+    }
+
+
+}

+ 3 - 0
recommend-server-service/src/main/resources/application-dev.yml

@@ -1,6 +1,9 @@
 server:
   port: 8001
 
+oss:
+  endpoint: oss-cn-hangzhou.aliyuncs.com
+
 eureka:
   instance:
     prefer-ip-address: true #是否优先使用IP地址作为主机名的标识,默认false

+ 5 - 2
recommend-server-service/src/main/resources/application-pre.yml

@@ -1,6 +1,9 @@
 server:
   port: 8080
 
+oss:
+  endpoint: oss-cn-hangzhou-internal.aliyuncs.com
+
 eureka:
   instance:
     prefer-ip-address: true #是否优先使用IP地址作为主机名的标识,默认false
@@ -119,10 +122,10 @@ aliyun:
     endpoint: cn-hangzhou-intranet.log.aliyuncs.com
     accessKeyId: LTAIP6x1l3DXfSxm
     accessKeySecret: KbTaM9ars4OX3PMS6Xm7rtxGr1FLon
-    project: recommend-server
+    project: recommend-server-test
   timer:
     log:
-      project: recommend-server
+      project: recommend-server-test
       logStore: timer
   blacklist:
     filter:

+ 3 - 0
recommend-server-service/src/main/resources/application-prod.yml

@@ -1,6 +1,9 @@
 server:
   port: 8080
 
+oss:
+  endpoint: oss-cn-hangzhou-internal.aliyuncs.com
+
 eureka:
   instance:
     prefer-ip-address: true #是否优先使用IP地址作为主机名的标识,默认false

+ 3 - 0
recommend-server-service/src/main/resources/application-test.yml

@@ -1,6 +1,9 @@
 server:
   port: 8080
 
+oss:
+  endpoint: oss-cn-hangzhou-internal.aliyuncs.com
+
 eureka:
   instance:
     prefer-ip-address: true #是否优先使用IP地址作为主机名的标识,默认false