Pārlūkot izejas kodu

feat:修改label

zhaohaipeng 2 mēneši atpakaļ
vecāks
revīzija
921fff1af6

+ 31 - 13
src/main/java/examples/extractor/ExtractorUtils.java

@@ -2,18 +2,23 @@ package examples.extractor;
 
 import examples.utils.SimilarityUtils;
 
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.temporal.ChronoUnit;
 import java.util.Map;
 import java.time.LocalDateTime;
 import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
 import java.util.List;
+
 public class ExtractorUtils {
 
-    public static Double division(String s1, String s2, Map<String, String> maps){
+    public static Double division(String s1, String s2, Map<String, String> maps) {
         double rate = 0.0;
-        if (maps.containsKey(s1) && maps.containsKey(s2)){
+        if (maps.containsKey(s1) && maps.containsKey(s2)) {
             Double d1 = Double.valueOf(maps.get(s1));
-            if (isDoubleEqualToZero(d1)){
+            if (isDoubleEqualToZero(d1)) {
                 return rate;
             }
             Double d2 = Double.valueOf(maps.get(s2));
@@ -21,9 +26,10 @@ public class ExtractorUtils {
         }
         return rate;
     }
-    public static Double divisionDouble(Double d1, Double d2){
+
+    public static Double divisionDouble(Double d1, Double d2) {
         double rate = 0.0;
-        if (isDoubleEqualToZero(d1)){
+        if (isDoubleEqualToZero(d1)) {
             return rate;
         }
         rate = d2 / d1;
@@ -37,7 +43,6 @@ public class ExtractorUtils {
     }
 
 
-
     public static double calculateVariance(List<Double> numbers) {
         double average = numbers.stream()
                 .mapToDouble(Double::doubleValue)
@@ -68,7 +73,7 @@ public class ExtractorUtils {
 
         for (int i = 0; i < numbers.size() - 1; i++) {
             Double diff = 0.0;
-            if (!isDoubleEqualToZero(numbers.get(i))){
+            if (!isDoubleEqualToZero(numbers.get(i))) {
                 diff = (numbers.get(i + 1) - numbers.get(i)) / numbers.get(i);
             }
             differences.add(diff);
@@ -105,7 +110,7 @@ public class ExtractorUtils {
         if (bucket < 0) {
             bucket = 0;
         }
-        return (int)bucket;
+        return (int) bucket;
     }
 
     // 针对大于1的数字,进行分桶。
@@ -117,7 +122,7 @@ public class ExtractorUtils {
         if (bucket < 0) {
             bucket = 0;
         }
-        return (int)bucket;
+        return (int) bucket;
     }
 
     public static int findInsertPosition(double[] sortedArray, double target) {
@@ -171,10 +176,23 @@ public class ExtractorUtils {
     }
 
     public static double reciprocal(double num) {
-        if (num == 0) {
-            throw new IllegalArgumentException("不能对零取倒数");
-        }
-        return 1.0 / num;
+        return 1.0 / (num + 1); // Returns 1/(num + 1) rather than 1/num, so num == 0 gives 1.0 instead of throwing. NOTE(review): num == -1 now divides by zero and yields Infinity; confirm callers only pass non-negative values.
+    }
+
+    public static long getDaysBetween(long timestamp1, long timestamp2) { // Number of calendar days from timestamp1 to timestamp2 (both epoch seconds); negative when timestamp2 falls on an earlier date.
+        Instant instant1 = Instant.ofEpochSecond(timestamp1);
+        Instant instant2 = Instant.ofEpochSecond(timestamp2);
+
+        LocalDate date1 = instant1.atZone(ZoneId.systemDefault()).toLocalDate(); // Dates are resolved in the JVM default time zone, so the result depends on the machine this runs on.
+        LocalDate date2 = instant2.atZone(ZoneId.systemDefault()).toLocalDate();
+
+        return ChronoUnit.DAYS.between(date1, date2); // Time of day was dropped above, so this counts date boundaries crossed rather than elapsed 24-hour spans.
+    }
+
+    public static int getHourByTimestamp(long timestamp) { // Hour of day (0-23) for an epoch-second timestamp, evaluated in the JVM default time zone.
+        return LocalDateTime
+                .ofInstant(Instant.ofEpochSecond(timestamp), ZoneId.systemDefault())
+                .getHour();
     }
 
     public static void main(String[] args) {

+ 30 - 7
src/main/java/examples/extractor/v20250218/ExtractFeature20250218.java

@@ -5,6 +5,9 @@ import examples.extractor.RankExtractorFeature_20240530;
 import examples.utils.SimilarityUtils;
 import org.apache.commons.lang3.StringUtils;
 
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneId;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
@@ -15,7 +18,7 @@ public class ExtractFeature20250218 {
     private ExtractFeature20250218() {
     }
 
-    public static void handleB1ToB13(Map<String, Map<String, Object>> videoFeature, Map<String, Object> featureMap) {
+    public static void handleB1ToB11AndB13(Map<String, Map<String, Object>> videoFeature, Map<String, Object> featureMap) {
         List<String> times = Arrays.asList("1h", "3h", "6h", "12h", "24h", "72h", "168h");
         List<String> indexList = Arrays.asList("is_share", "share_cnt", "is_return_1", "return_n_uv", "str_one", "ros_one", "str", "ros", "str_plus", "ros_minus", "rovn");
         for (Map.Entry<String, Map<String, Object>> entry : videoFeature.entrySet()) {
@@ -36,7 +39,21 @@ public class ExtractFeature20250218 {
         }
     }
 
-    public static void handleVideoBasicFeature(Map<String, Object> videoFeature, Map<String, Object> featureMap) {
+    public static void handleB12(Map<String, Object> b12Feature, Map<String, Object> featureMap) { // Copies the B12 statistics for each window (7d/14d/30d/60d) into featureMap as "b12_<index>_<window>" doubles; keys absent from b12Feature count as 0.
+        List<String> times = Arrays.asList("7d", "14d", "30d", "60d");
+        List<String> indexList = Arrays.asList("is_share", "share_cnt", "is_return_1", "return_n_uv", "str_one", "ros_one", "str", "ros", "str_plus", "ros_minus", "rovn");
+        for (String time : times) {
+            for (String index : indexList) {
+                double value = Double.parseDouble(b12Feature.getOrDefault(index + "_" + time, "0").toString());
+                featureMap.put("b12_" + index + "_" + time, value);
+            }
+            double rovn = Double.parseDouble(b12Feature.getOrDefault("rovn_" + time, "0").toString());
+            double returnNUv = Double.parseDouble(b12Feature.getOrDefault("return_n_uv_" + time, "0").toString()); // Look up the per-window key, like every other read in this method; the bare "return_n_uv" key used before carried no window suffix and so could not vary per window.
+            featureMap.put("b12_rovn*log(r)_" + time, rovn * RankExtractorFeature_20240530.calLog(returnNUv)); // Derived cross feature: rovn scaled by calLog of the same window's return_n_uv.
+        }
+    }
+
+    public static void handleVideoBasicFeature(Map<String, Object> videoFeature, long ts, Map<String, Object> featureMap) {
         Object totalTime = videoFeature.getOrDefault("total_time", "0");
         Double width = Double.parseDouble(videoFeature.getOrDefault("width", "0d").toString());
         Double height = Double.parseDouble(videoFeature.getOrDefault("height", "0d").toString());
@@ -44,6 +61,9 @@ public class ExtractFeature20250218 {
         Object bit_rate = videoFeature.getOrDefault("bit_rate", "0d");
         String festiveLabel1 = videoFeature.getOrDefault("festive_label1", "").toString();
 
+        long createTs = Long.parseLong(videoFeature.getOrDefault("gmt_create_timestamp", "0").toString());
+
+        featureMap.put("create_ts_diff", ExtractorUtils.getDaysBetween(createTs, ts));
         featureMap.put("total_time", totalTime);
         featureMap.put("width", width);
         featureMap.put("height", height);
@@ -58,6 +78,9 @@ public class ExtractFeature20250218 {
             featureMap.put("is_greeting", 1);
         }
 
+        LocalDateTime now = LocalDateTime.ofInstant(Instant.ofEpochSecond(ts), ZoneId.systemDefault());
+        featureMap.put("hour_" + now.getHour(), "0.1");
+        featureMap.put("day_of_week" + now.getDayOfWeek(), "0.1");
     }
 
     public static void handleC1(Map<String, Object> c1Feature, Map<String, Object> featureMap) {
@@ -134,12 +157,12 @@ public class ExtractFeature20250218 {
 
     }
 
-    public static Map<String, Map<String, String[]>> handleC6ToC7(Map<String, Object> c6Feature, Map<String, Object> c7Feature) {
+    public static Map<String, Map<String, String[]>> handleC7ToC8(Map<String, Object> c7Feature, Map<String, Object> c8Feature) {
         Map<String, Map<String, String[]>> resultMap = new HashMap<>();
 
         Map<String, Map<String, Object>> featureMaps = new HashMap<>();
-        featureMaps.put("c6", c6Feature);
         featureMaps.put("c7", c7Feature);
+        featureMaps.put("c8", c8Feature);
         List<String> indexList = Arrays.asList("share", "return");
         for (Map.Entry<String, Map<String, Object>> entry : featureMaps.entrySet()) {
             String key = entry.getKey();
@@ -167,15 +190,15 @@ public class ExtractFeature20250218 {
         return resultMap;
     }
 
-    public static void useC6ToC7(Map<String, Map<String, String[]>> c67Map, String vid, Map<String, Object> featureMap) {
+    public static void useC7ToC8(Map<String, Map<String, String[]>> map, String vid, Map<String, Object> featureMap) {
         if (StringUtils.isBlank(vid)) {
             return;
         }
         for (String key : Arrays.asList("c6", "c7")) {
             for (String action : Arrays.asList("share", "return")) {
                 String featureKey = key + "_" + action;
-                if (c67Map.containsKey(featureKey)) {
-                    Map<String, String[]> cfMap = c67Map.get(featureKey);
+                if (map.containsKey(featureKey)) {
+                    Map<String, String[]> cfMap = map.get(featureKey);
                     String[] scores = cfMap.get(vid);
                     featureMap.put(featureKey + "_score", Double.parseDouble(scores[0]));
                     featureMap.put(featureKey + "_num", Double.parseDouble(scores[1]));

+ 105 - 94
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_41_originData_20250218.scala

@@ -5,6 +5,8 @@ import com.aliyun.odps.TableSchema
 import com.aliyun.odps.data.Record
 import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
 import examples.extractor.v20250218.ExtractFeature20250218
+import examples.extractor.ExtractorUtils
+import examples.utils.SimilarityUtils
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.sql.SparkSession
 import org.xm.Similarity
@@ -51,102 +53,110 @@ object makedata_recsys_41_originData_20250218 {
           partition = partition,
           transfer = func,
           numPartition = tablePart)
-        .map(record => {
-
-          val featureMap = new JSONObject()
-          val vid = if (record.isNull("vid")) "" else record.getString("vid")
-
-          // a 视频特征
-          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else JSON.parseObject(record.getString("b1_feature"))
-          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else JSON.parseObject(record.getString("b2_feature"))
-          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else JSON.parseObject(record.getString("b3_feature"))
-          val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else JSON.parseObject(record.getString("b3_feature"))
-          val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else JSON.parseObject(record.getString("b3_feature"))
-          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else JSON.parseObject(record.getString("b6_feature"))
-          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else JSON.parseObject(record.getString("b7_feature"))
-          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else JSON.parseObject(record.getString("b8_feature"))
-          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else JSON.parseObject(record.getString("b9_feature"))
-          val b10: JSONObject = if (record.isNull("b10_feature")) new JSONObject() else JSON.parseObject(record.getString("b10_feature"))
-          val b11: JSONObject = if (record.isNull("b11_feature")) new JSONObject() else JSON.parseObject(record.getString("b11_feature"))
-          val b12: JSONObject = if (record.isNull("b12_feature")) new JSONObject() else JSON.parseObject(record.getString("b12_feature"))
-          val b13: JSONObject = if (record.isNull("b13_feature")) new JSONObject() else JSON.parseObject(record.getString("b13_feature"))
-
-          // 用户特征
-          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else JSON.parseObject(record.getString("c1_feature"))
-          val c2: JSONObject = if (record.isNull("c2_feature")) new JSONObject() else JSON.parseObject(record.getString("c2_feature"))
-          val c3: JSONObject = if (record.isNull("c3_feature")) new JSONObject() else JSON.parseObject(record.getString("c3_feature"))
-          val c4: JSONObject = if (record.isNull("c4_feature")) new JSONObject() else JSON.parseObject(record.getString("c3_feature"))
-          val c5: JSONObject = if (record.isNull("c5_feature")) new JSONObject() else JSON.parseObject(record.getString("c3_feature"))
-          val c6: JSONObject = if (record.isNull("c6_feature")) new JSONObject() else JSON.parseObject(record.getString("c6_feature"))
-          val c7: JSONObject = if (record.isNull("c7_feature")) new JSONObject() else JSON.parseObject(record.getString("c7_feature"))
-          val c8: JSONObject = if (record.isNull("c8_feature")) new JSONObject() else JSON.parseObject(record.getString("c8_feature"))
-
-          // 视频基础信息 v1-待推荐视频,v2-头部视频
-          val v1: JSONObject = if (record.isNull("v1_feature")) new JSONObject() else JSON.parseObject(record.getString("v1_feature"))
-          val v2: JSONObject = if (record.isNull("v2_feature")) new JSONObject() else JSON.parseObject(record.getString("v2_feature"))
-
-          // CF特征
-          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else JSON.parseObject(record.getString("d1_feature"))
-          val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else JSON.parseObject(record.getString("d2_feature"))
-          val d3: JSONObject = if (record.isNull("d3_feature")) new JSONObject() else JSON.parseObject(record.getString("d3_feature"))
-
-          val bFeatureMap = new util.HashMap[String, util.Map[String, Object]]();
-          bFeatureMap.put("b1", b1);
-          bFeatureMap.put("b2", b2);
-          bFeatureMap.put("b3", b3);
-          bFeatureMap.put("b4", b4);
-          bFeatureMap.put("b5", b5);
-          bFeatureMap.put("b6", b6);
-          bFeatureMap.put("b7", b7);
-          bFeatureMap.put("b8", b8);
-          bFeatureMap.put("b9", b9);
-          bFeatureMap.put("b10", b10);
-          bFeatureMap.put("b11", b11);
-          bFeatureMap.put("b12", b12);
-          bFeatureMap.put("b13", b13);
-
-          ExtractFeature20250218.handleB1ToB13(bFeatureMap, featureMap);
-          ExtractFeature20250218.handleC1(c1, featureMap)
-          ExtractFeature20250218.handleC2ToC3(c2, c3, featureMap)
-          ExtractFeature20250218.handleC4(c4, featureMap)
-          ExtractFeature20250218.handleC5ToC6(c5, c6, v1, featureMap)
-
-          val c67Map = ExtractFeature20250218.handleC6ToC7(c6, c7)
-          ExtractFeature20250218.useC6ToC7(c67Map, vid, featureMap)
-
-          ExtractFeature20250218.handleD1(d1, featureMap)
-          ExtractFeature20250218.handleD2(d2, featureMap)
-          ExtractFeature20250218.handleD3(d3, featureMap)
-          ExtractFeature20250218.handleVideoBasicFeature(v1, featureMap)
-          ExtractFeature20250218.handleVideoSimilarity(v1, v2, featureMap)
-
-          //4 处理label信息。
-          val labels = new JSONObject
-          for (labelKey <- List(
-            "is_share", "share_cnt",
-            "is_return_1", "return_1_uv",
-            "is_return_n", "return_n_uv",
-            "is_return_noself", "return_1_uv_noself",
-            "is_return_n_noself", "return_n_uv_noself"
-          )) {
-            if (!record.isNull(labelKey)) {
-              labels.put(labelKey, record.getString(labelKey))
+        .mapPartitions(p => {
+          SimilarityUtils.init()
+          p.map(record => {
+
+            val featureMap = new JSONObject()
+            val vid = if (record.isNull("vid")) "" else record.getString("vid")
+            // vid 已经提取了
+            val ts = record.getString("ts").toLong
+            // a 视频特征
+            val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else JSON.parseObject(record.getString("b1_feature"))
+            val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else JSON.parseObject(record.getString("b2_feature"))
+            val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else JSON.parseObject(record.getString("b3_feature"))
+            val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else JSON.parseObject(record.getString("b4_feature")) // parse the column that was null-checked (previously read b3_feature by copy-paste)
+            val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else JSON.parseObject(record.getString("b5_feature")) // parse the column that was null-checked (previously read b3_feature by copy-paste)
+            val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else JSON.parseObject(record.getString("b6_feature"))
+            val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else JSON.parseObject(record.getString("b7_feature"))
+            val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else JSON.parseObject(record.getString("b8_feature"))
+            val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else JSON.parseObject(record.getString("b9_feature"))
+            val b10: JSONObject = if (record.isNull("b10_feature")) new JSONObject() else JSON.parseObject(record.getString("b10_feature"))
+            val b11: JSONObject = if (record.isNull("b11_feature")) new JSONObject() else JSON.parseObject(record.getString("b11_feature"))
+            val b12: JSONObject = if (record.isNull("b12_feature")) new JSONObject() else JSON.parseObject(record.getString("b12_feature"))
+            val b13: JSONObject = if (record.isNull("b13_feature")) new JSONObject() else JSON.parseObject(record.getString("b13_feature"))
+
+            // 用户特征
+            val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else JSON.parseObject(record.getString("c1_feature"))
+            val c2: JSONObject = if (record.isNull("c2_feature")) new JSONObject() else JSON.parseObject(record.getString("c2_feature"))
+            val c3: JSONObject = if (record.isNull("c3_feature")) new JSONObject() else JSON.parseObject(record.getString("c3_feature"))
+            val c4: JSONObject = if (record.isNull("c4_feature")) new JSONObject() else JSON.parseObject(record.getString("c4_feature")) // parse the column that was null-checked (previously read c3_feature by copy-paste)
+            val c5: JSONObject = if (record.isNull("c5_feature")) new JSONObject() else JSON.parseObject(record.getString("c5_feature")) // parse the column that was null-checked (previously read c3_feature by copy-paste)
+            val c6: JSONObject = if (record.isNull("c6_feature")) new JSONObject() else JSON.parseObject(record.getString("c6_feature"))
+            val c7: JSONObject = if (record.isNull("c7_feature")) new JSONObject() else JSON.parseObject(record.getString("c7_feature"))
+            val c8: JSONObject = if (record.isNull("c8_feature")) new JSONObject() else JSON.parseObject(record.getString("c8_feature"))
+
+            // 视频基础信息 v1-待推荐视频,v2-头部视频
+            val v1: JSONObject = if (record.isNull("v1_feature")) new JSONObject() else JSON.parseObject(record.getString("v1_feature"))
+            val v2: JSONObject = if (record.isNull("v2_feature")) new JSONObject() else JSON.parseObject(record.getString("v2_feature"))
+
+            // CF特征
+            val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else JSON.parseObject(record.getString("d1_feature"))
+            val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else JSON.parseObject(record.getString("d2_feature"))
+            val d3: JSONObject = if (record.isNull("d3_feature")) new JSONObject() else JSON.parseObject(record.getString("d3_feature"))
+
+            val bFeatureMap = new util.HashMap[String, util.Map[String, Object]]();
+            bFeatureMap.put("b1", b1);
+            bFeatureMap.put("b2", b2);
+            bFeatureMap.put("b3", b3);
+            bFeatureMap.put("b4", b4);
+            bFeatureMap.put("b5", b5);
+            bFeatureMap.put("b6", b6);
+            bFeatureMap.put("b7", b7);
+            bFeatureMap.put("b8", b8);
+            bFeatureMap.put("b9", b9);
+            bFeatureMap.put("b10", b10);
+            bFeatureMap.put("b11", b11);
+            bFeatureMap.put("b13", b13);
+
+            ExtractFeature20250218.handleB1ToB11AndB13(bFeatureMap, featureMap);
+            ExtractFeature20250218.handleB12(b12, featureMap)
+            ExtractFeature20250218.handleC1(c1, featureMap)
+            ExtractFeature20250218.handleC2ToC3(c2, c3, featureMap)
+            ExtractFeature20250218.handleC4(c4, featureMap)
+            ExtractFeature20250218.handleC5ToC6(c5, c6, v1, featureMap)
+
+            val c78Map = ExtractFeature20250218.handleC7ToC8(c7, c8)
+            ExtractFeature20250218.useC7ToC8(c78Map, vid, featureMap)
+
+            ExtractFeature20250218.handleD1(d1, featureMap)
+            ExtractFeature20250218.handleD2(d2, featureMap)
+            ExtractFeature20250218.handleD3(d3, featureMap)
+            ExtractFeature20250218.handleVideoBasicFeature(v1, ts, featureMap)
+            ExtractFeature20250218.handleVideoSimilarity(v1, v2, featureMap)
+
+
+            //4 处理label信息。
+            val labels = new JSONObject
+            for (labelKey <- List(
+              "is_share", "share_cnt",
+              "is_return_1", "return_1_uv",
+              "is_return_n", "return_n_uv",
+              "is_return_noself", "return_1_uv_noself",
+              "is_return_n_noself", "return_n_uv_noself"
+            )) {
+              if (!record.isNull(labelKey)) {
+                labels.put(labelKey, record.getString(labelKey))
+              }
             }
-          }
-          //5 处理log key表头。
-          val apptype = record.getString("apptype")
-          val pagesource = record.getString("pagesource")
-          val mid = record.getString("mid")
-          // vid 已经提取了
-          val ts = record.getString("ts")
-          val abcode = record.getString("abcode")
-          val level = if (record.isNull("level")) "0" else record.getString("level")
-          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
-          val labelKey = labels.toString()
-          val featureKey = featureMap.toString()
-          //6 拼接数据,保存。
-          logKey + "\t" + labelKey + "\t" + featureKey
 
+            //5 处理log key表头。
+            val logs = new JSONObject()
+            for (key <- List("apptype", "abcode", "mid", "vid", "page", "recommendpagetype", "level", "ts", "headvideoid")) {
+              if (!record.isNull(key)) {
+                logs.put(key, record.getString(key))
+              }
+            }
+
+            logs.put("hour", ExtractorUtils.getHourByTimestamp(ts))
+
+            val logKey = logs.toString()
+            val labelKey = labels.toString()
+            val featureKey = featureMap.toString()
+            //6 拼接数据,保存。
+            logKey + "\t" + labelKey + "\t" + featureKey
+
+          })
         })
 
       // 4 保存数据到hdfs
@@ -162,6 +172,7 @@ object makedata_recsys_41_originData_20250218 {
     }
   }
 
+
   def func(record: Record, schema: TableSchema): Record = { // Pass-through transfer function: returns each record unchanged; schema is not used.
     record
   }