Преглед изворни кода

样本重新制作: 小时级别特征。

zhangbo пре 1 година
родитељ
комит
fbed1de0b2

+ 34 - 28
src/main/java/examples/extractor/ExtractorUtils.java

@@ -34,31 +34,7 @@ public class ExtractorUtils {
         return Math.abs(value) < epsilon;
     }
 
-    public static double ceilLog(Double key) {
-        double bucket = Math.ceil(Math.log(key + 1.0) * 100);
-        if (bucket > 100L) {
-            bucket = 100L;
-        }
-        if (bucket < 0) {
-            bucket = 0;
-        }
-        return  (double)bucket;
-    }
 
-    public static double bucketRatioFeature(Double key) {
-//        long bucket = Math.round(Math.log((key + 1.0) * 10));
-//        if (bucket > 50L) {
-//            bucket = 50L;
-//        }
-        double bucket = Math.round(Math.pow(key, 0.5) * 100);
-        if (bucket > 100L) {
-            bucket = 100L;
-        }
-        if (bucket < 0) {
-            bucket = 0;
-        }
-        return (double)bucket;
-    }
 
     public static double calculateVariance(List<Double> numbers) {
         double average = numbers.stream()
@@ -113,11 +89,41 @@ public class ExtractorUtils {
         return subtractedDateTime.format(formatter);
     }
 
+    // Buckets a value expected to lie in [0, 1] (per the original note): ceil(key^0.2 * 100), clamped to [0, 300], returned as an integer bucket id.
+    public static Integer ceilLogRate(Double key) {
+        double bucket = Math.ceil(
+                Math.pow(key, 0.2) * 100
+        );
+        if (bucket > 300) {
+            bucket = 300;
+        }
+        if (bucket < 0) {
+            bucket = 0;
+        }
+        return (int)bucket;
+    }
+
+    // Buckets a value expected to be greater than 1 (per the original note): round(ln(key * 10 + 1) * 10), clamped to [0, 300], returned as an int bucket id.
+    public static int bucketCnt(Double key) {
+        long bucket = Math.round(Math.log((key * 10 + 1.0)) * 10);
+        if (bucket > 300) {
+            bucket = 300;
+        }
+        if (bucket < 0) {
+            bucket = 0;
+        }
+        return (int)bucket;
+    }
+
     public static void main(String[] args) {
-        System.out.println(bucketRatioFeature(0.1));
-        System.out.println(bucketRatioFeature(0.8));
-        System.out.println(bucketRatioFeature(0.01));
-        System.out.println(bucketRatioFeature(0.007));
+        System.out.println(ceilLogRate(0.0002));
+        System.out.println(ceilLogRate(0.01));
+        System.out.println(ceilLogRate(0.2));
+        System.out.println(ceilLogRate(4.));
+        System.out.println(bucketCnt(1.));
+        System.out.println(bucketCnt(20.));
+        System.out.println(bucketCnt(500.));
+        System.out.println(bucketCnt(50000.));
 
     }
 

+ 3 - 3
src/main/java/examples/extractor/RankExtractorItemFeature.java

@@ -146,7 +146,7 @@ public class RankExtractorItemFeature {
     public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
         Map<String, String> result = new HashMap<>();
         for (Map.Entry<String, Double> entry : maps.entrySet()){
-            Double value = ExtractorUtils.bucketRatioFeature(entry.getValue());
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
             result.put(entry.getKey(), String.valueOf(value));
         }
         return result;
@@ -159,7 +159,7 @@ public class RankExtractorItemFeature {
             if (!names.contains(entry.getKey())){
                 continue;
             }
-            Double value = ExtractorUtils.ceilLog(Double.valueOf(entry.getValue()));
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
             result.put(entry.getKey(), String.valueOf(value));
         }
         return result;
@@ -179,7 +179,7 @@ public class RankExtractorItemFeature {
             }
             Double num = entry.getValue().getOrDefault(dateHour, 0.0);
             if (!ExtractorUtils.isDoubleEqualToZero(num)){
-                result.put(entry.getKey(), String.valueOf(ExtractorUtils.ceilLog(num)));
+                result.put(entry.getKey(), String.valueOf(ExtractorUtils.bucketCnt(num)));
             }
         }
         return result;

+ 2 - 2
src/main/java/examples/extractor/RankExtractorUserFeature.java

@@ -83,7 +83,7 @@ public class RankExtractorUserFeature {
     public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
         Map<String, String> result = new HashMap<>();
         for (Map.Entry<String, Double> entry : maps.entrySet()){
-            Double value = ExtractorUtils.bucketRatioFeature(entry.getValue());
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
             result.put(entry.getKey(), String.valueOf(value));
         }
         return result;
@@ -95,7 +95,7 @@ public class RankExtractorUserFeature {
             if (!names.contains(entry.getKey())){
                 continue;
             }
-            Double value = ExtractorUtils.ceilLog(Double.valueOf(entry.getValue()));
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
             result.put(entry.getKey(), String.valueOf(value));
         }
         return result;

+ 26 - 30
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_strData.scala

@@ -53,38 +53,33 @@ object makedata_06_strData {
         numPartition = tablePart)
         .map(record => {
 
-          val originSecene = Set(
-            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city"
-          )
-          val originUser = Set(
+          val originFeatureName = Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
             "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
             "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
             "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
             "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
             "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
-            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"
-          )
-          val originItem = Set(
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
             "title", "tags", "total_time", "play_count_total",
             "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
             "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
             "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
             "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt"
           )
-          val originItemRealtime = Set(
-            "view_pv_list_1day","view_uv_list_1day","play_pv_list_1day","play_uv_list_1day",
-            "share_pv_list_1day", "share_uv_list_1day","return_uv_list_1day",
-            "p_view_uv_list_1day","p_view_pv_list_1day","p_return_uv_list_1day",
-            "share_uv_list_2day","share_pv_list_2day","share_uv_list_3day","share_pv_list_3day",
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
 
-            "view_uv_list_1h","view_pv_list_1h","play_uv_list_1h","play_pv_list_1h",
-            "share_uv_list_1h","share_pv_list_1h","return_uv_list_1h","p_return_uv_list_1h"
-          )
+          val itemRealtimeFeatureMap = getFeatureFromSet(Set(
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
 
-          val sceneFeatureMap = getFeatureFromSet(originSecene, record)
-          val userFeatureMap = getFeatureFromSet(originUser, record)
-          val itemFeatureMap = getFeatureFromSet(originItem, record)
-          val itemRealtimeFeatureMap = getFeatureFromSet(originItemRealtime, record).map(r => {
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          ), record).map(r => {
             val m = new java.util.HashMap[String, Double]()
             r._2.split(",").foreach(r => {
               m.put(r.split(":")(0), r.split(":")(1).toDouble)
@@ -106,16 +101,16 @@ object makedata_06_strData {
             "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
             "title", "tags"
           ), record)
-          val f2 = RankExtractorUserFeature.getUserRateFeature(userFeatureMap)
-          val f3 = RankExtractorUserFeature.cntFeatureChange(userFeatureMap,
+          val f2 = RankExtractorUserFeature.getUserRateFeature(originFeatureMap)
+          val f3 = RankExtractorUserFeature.cntFeatureChange(originFeatureMap,
             new util.HashSet[String](util.Arrays.asList(
               "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
               "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
               "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
               "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"))
           )
-          val f4 = RankExtractorItemFeature.getItemRateFeature(itemFeatureMap)
-          val f5 = RankExtractorItemFeature.cntFeatureChange(itemFeatureMap,
+          val f4 = RankExtractorItemFeature.getItemRateFeature(originFeatureMap)
+          val f5 = RankExtractorItemFeature.cntFeatureChange(originFeatureMap,
             new util.HashSet[String](util.Arrays.asList(
               "total_time", "play_count_total",
               "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
@@ -123,7 +118,8 @@ object makedata_06_strData {
               "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
               "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt")))
           val f6 = RankExtractorItemFeature.getItemRealtimeTrend(javaMap,
-            sceneFeatureMap.getOrElse("ctx_day", ""), sceneFeatureMap.getOrElse("ctx_hour", ""))
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", ""))
           val f7 = RankExtractorItemFeature.getItemRealtimeCnt(javaMap,
             new util.HashSet[String](util.Arrays.asList(
               "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
@@ -134,12 +130,12 @@ object makedata_06_strData {
               "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
               "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
             )),
-            sceneFeatureMap.getOrElse("ctx_day", ""),
-            sceneFeatureMap.getOrElse("ctx_hour", "")
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
           )
           val f8 = RankExtractorItemFeature.getItemRealtimeRate(javaMap,
-            sceneFeatureMap.getOrElse("ctx_day", ""),
-            sceneFeatureMap.getOrElse("ctx_hour", "")
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
           )
 
           // 1:特征聚合到map中
@@ -199,7 +195,7 @@ object makedata_06_strData {
           val labelMap = getFeatureFromSet(labels, record)
           labels.foreach(r => {
             if (labelMap.containsKey(r)) {
-              labelMap.put(r, labelMap.get(r).get)
+              labelNew.put(r, labelMap.get(r).get)
             }
           })
           //3:记录唯一key
@@ -208,7 +204,7 @@ object makedata_06_strData {
           val logtimestamp = record.getString("logtimestamp")
           val sessionid = record.getString("sessionid")
 
-          val logKey = (mid, videoid, logtimestamp, sessionid).productIterator.mkString("-")
+          val logKey = (mid, videoid, logtimestamp, sessionid).productIterator.mkString(":")
           val labelKey = labelNew.toString()
           val featureKey = resultNew.toString()