
feat:合并代码 (merge code)

zhaohaipeng, 10 months ago
commit 987239b917
76 changed files with 9654 additions and 937 deletions
  1. pom.xml (+86 -60)
  2. src/main/java/examples/dataloader/AdRedisFeatureConstructor.java (+198 -199)
  3. src/main/java/examples/dataloader/AdSampleConstructor.java (+223 -223)
  4. src/main/java/examples/dataloader/OfflineVlogFeatureGroup.java (+80 -0)
  5. src/main/java/examples/dataloader/OfflineVlogFeatureGroupV1.java (+81 -0)
  6. src/main/java/examples/dataloader/OfflineVlogFeatureGroupV2.java (+125 -0)
  7. src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractor.java (+68 -0)
  8. src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV1.java (+66 -0)
  9. src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV2.java (+66 -0)
  10. src/main/java/examples/dataloader/RequestContextOffline.java (+120 -0)
  11. src/main/java/examples/dataloader/redisBuilderMyself.java (+45 -0)
  12. src/main/java/examples/extractor/ExtractorUtils.java (+164 -0)
  13. src/main/java/examples/extractor/RankExtractorFeature_20240530.java (+33 -0)
  14. src/main/java/examples/extractor/RankExtractorItemFeature.java (+324 -0)
  15. src/main/java/examples/extractor/RankExtractorItemFeatureV2.java (+338 -0)
  16. src/main/java/examples/extractor/RankExtractorUserFeature.java (+104 -0)
  17. src/main/java/examples/extractor/RankExtractorUserFeatureV2.java (+110 -0)
  18. src/main/java/examples/sparksql/SparkAdCTRSampleLoader.java (+100 -107)
  19. src/main/java/examples/sparksql/SparkAdFeaToRedisLoader.java (+125 -124)
  20. src/main/java/examples/sparksql/SparkShareRatioSampleLoader.java (+98 -99)
  21. src/main/java/examples/sparksql/SparkVideoFeaToRedisLoader.java (+123 -124)
  22. src/main/resources/20240608_feature_name.txt (+274 -0)
  23. src/main/resources/20240609_bucket_274.txt (+0 -0)
  24. src/main/resources/20240609_bucket_274_old.txt (+2 -0)
  25. src/main/resources/20240622_ad_bucket_249.txt (+6 -0)
  26. src/main/resources/20240622_ad_feature_name.txt (+249 -0)
  27. src/main/scala/com/aliyun/odps/spark/examples/ana/ana_01_cidvidpk.scala (+125 -0)
  28. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_01_readtable2hdfs.scala (+79 -0)
  29. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_02_writeredis.scala (+249 -0)
  30. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_03_deleteredis.scala (+74 -0)
  31. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev1.scala (+85 -0)
  32. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev2.scala (+106 -0)
  33. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_05_sampleStatic.scala (+43 -0)
  34. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData.scala (+257 -0)
  35. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData_v3.scala (+260 -0)
  36. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_rosData.scala (+243 -0)
  37. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_strData.scala (+202 -0)
  38. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_08_item2redis.scala (+140 -0)
  39. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis.scala (+220 -0)
  40. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis_freq.scala (+167 -0)
  41. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_10_originData_v3.scala (+244 -0)
  42. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_11_strData_v3.scala (+187 -0)
  43. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3.scala (+215 -0)
  44. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3_noweight.scala (+216 -0)
  45. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529.scala (+278 -0)
  46. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529_check.scala (+256 -0)
  47. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_14_valueData_20240608.scala (+92 -0)
  48. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_15_bucket_20240608.scala (+92 -0)
  49. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609.scala (+127 -0)
  50. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609_check.scala (+132 -0)
  51. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_17_bucketDataPrint_20240617.scala (+300 -0)
  52. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_18_mergehour2day_20240617.scala (+43 -0)
  53. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_31_originData_20240620.scala (+388 -0)
  54. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_32_bucket_20240622.scala (+103 -0)
  55. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala (+118 -0)
  56. src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyDateUtils.scala (+246 -0)
  57. src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyHdfsUtils.scala (+148 -0)
  58. src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala (+40 -0)
  59. src/main/scala/com/aliyun/odps/spark/examples/myUtils/env.scala (+39 -0)
  60. src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala (+1 -1)
  61. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本 (+161 -0)
  62. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告 (+34 -0)
  63. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本【分析】 (+8 -0)
  64. zhangbo/00_copy.sh (+29 -0)
  65. zhangbo/01_train.sh (+16 -0)
  66. zhangbo/02_train_go.sh (+25 -0)
  67. zhangbo/03_predict.sh (+33 -0)
  68. zhangbo/04_upload.sh (+25 -0)
  69. zhangbo/05_update_everyday_2model.sh (+151 -0)
  70. zhangbo/05_update_everyday_str.sh (+107 -0)
  71. zhangbo/06_update_everyday_feature.sh (+124 -0)
  72. zhangbo/50_delete_hdfs.sh (+67 -0)
  73. zhangbo/train.sh (+28 -0)
  74. zhangbo/up.sh (+14 -0)
  75. zhangbo/up2.sh (+10 -0)
  76. zhangbo/utils.py (+99 -0)

+ 86 - 60
pom.xml

@@ -17,18 +17,26 @@
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
 
+    <parent>
+        <groupId>com.tzld.commons</groupId>
+        <artifactId>supom</artifactId>
+        <version>1.0.9</version>
+    </parent>
+
     <properties>
-        <spark.version>3.1.1</spark.version>
-        <oss.sdk.version>3.0.0</oss.sdk.version>
+        <spark.version>2.3.0</spark.version>
         <cupid.sdk.version>3.3.8-public</cupid.sdk.version>
-        <scala.version>2.12.10</scala.version>
-        <scala.binary.version>2.12</scala.binary.version>
-        <odps.version>0.28.4-public</odps.version>
+        <scala.version>2.11.8</scala.version>
+        <scala.binary.version>2.11</scala.binary.version>
+        <java.version>1.8</java.version>
+        <maven.compiler.source>${java.version}</maven.compiler.source>
+        <maven.compiler.target>${java.version}</maven.compiler.target>
         <emr.version>2.0.0</emr.version>
+        <odps.version>0.28.4-public</odps.version>
     </properties>
 
     <groupId>com.aliyun.odps</groupId>
-    <artifactId>spark-examples_${scala.binary.version}</artifactId>
+    <artifactId>spark-examples</artifactId>
     <version>1.0.0-SNAPSHOT</version>
     <packaging>jar</packaging>
 
@@ -39,6 +47,29 @@
             <version>3.12.0</version>
         </dependency>
 
+        <dependency>
+            <groupId>com.hankcs</groupId>
+            <artifactId>hanlp</artifactId>
+            <version>portable-1.8.2</version>
+        </dependency>
+<!--        <dependency>-->
+<!--            <groupId>com.medallia.word2vec</groupId>-->
+<!--            <artifactId>word2vec</artifactId>-->
+<!--            <version>0.1.42</version>-->
+<!--        </dependency>-->
+
+        <dependency>
+            <groupId>org.xm</groupId>
+            <artifactId>similarity</artifactId>
+            <version>1.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.alibaba</groupId>
+            <artifactId>fastjson</artifactId>
+            <version>1.2.83</version>
+        </dependency>
+
         <dependency>
             <groupId>com.tzld.piaoquan</groupId>
             <artifactId>recommend-server-client</artifactId>
@@ -48,17 +79,15 @@
         <dependency>
             <groupId>com.tzld.piaoquan</groupId>
             <artifactId>recommend-feature-client</artifactId>
-            <version>1.1.15</version>
+            <version>1.0.3</version>
         </dependency>
 
-
         <dependency>
             <groupId>com.tzld.piaoquan</groupId>
             <artifactId>ad-engine-commons</artifactId>
-            <version>1.1.0</version>
+            <version>1.0.0</version>
         </dependency>
 
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_${scala.binary.version}</artifactId>
@@ -66,36 +95,32 @@
             <scope>provided</scope>
             <exclusions>
                 <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
+                    <groupId>org.scala-lang</groupId>
+                    <artifactId>scala-library</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.scala-lang</groupId>
+                    <artifactId>scalap</artifactId>
                 </exclusion>
             </exclusions>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_${scala.binary.version}</artifactId>
             <version>${spark.version}</version>
             <scope>provided</scope>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-mllib_${scala.binary.version}</artifactId>
             <version>${spark.version}</version>
             <scope>provided</scope>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-streaming_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
         </dependency>
 
         <dependency>
@@ -103,26 +128,36 @@
             <artifactId>cupid-sdk</artifactId>
             <version>${cupid.sdk.version}</version>
             <scope>provided</scope>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
         </dependency>
 
         <dependency>
             <groupId>com.aliyun.odps</groupId>
             <artifactId>hadoop-fs-oss</artifactId>
             <version>${cupid.sdk.version}</version>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
         </dependency>
 
+        <dependency>
+            <groupId>com.aliyun.odps</groupId>
+            <artifactId>odps-spark-datasource_${scala.binary.version}</artifactId>
+            <version>${cupid.sdk.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-library</artifactId>
+            <version>${scala.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-actors</artifactId>
+            <version>${scala.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.aliyun.emr</groupId>
+            <artifactId>emr-maxcompute_2.11</artifactId>
+            <version>${emr.version}</version>
+        </dependency>
 
         <dependency>
             <groupId>org.springframework.boot</groupId>
@@ -134,34 +169,14 @@
             <artifactId>jedis</artifactId>
             <version>3.3.0</version>
         </dependency>
+
+
         <dependency>
             <groupId>org.projectlombok</groupId>
             <artifactId>lombok</artifactId>
             <version>1.18.24</version>
         </dependency>
 
-        <dependency>
-            <groupId>com.aliyun.odps</groupId>
-            <artifactId>odps-sdk-commons</artifactId>
-            <version>${odps.version}</version>
-        </dependency>
-
-        <dependency>
-            <groupId>com.aliyun.emr</groupId>
-            <artifactId>emr-mns_2.11</artifactId>
-            <version>${emr.version}</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>com.aliyun.mns</groupId>
-                    <artifactId>aliyun-sdk-mns</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
-            <groupId>com.aliyun.emr</groupId>
-            <artifactId>emr-maxcompute_2.11</artifactId>
-            <version>${emr.version}</version>
-        </dependency>
     </dependencies>
 
     <build>
@@ -240,6 +255,17 @@
                     </execution>
                 </executions>
             </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                    <!--<compilerId>scala</compilerId>-->
+                    <!-- <compilerVersion>2.12.10</compilerVersion>-->
+                </configuration>
+            </plugin>
         </plugins>
     </build>
 

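The pom.xml changes above move the build under the com.tzld.commons:supom parent, pin the stack to Scala 2.11.8 / Spark 2.3.0 with Java 8 enforced by maven-compiler-plugin, downgrade recommend-feature-client and ad-engine-commons, and add hanlp, org.xm:similarity, and com.alibaba:fastjson 1.2.83 while dropping the protobuf-java exclusions and the emr-mns/odps-sdk-commons dependencies. The Spark artifacts stay at provided scope, so they are expected to come from the MaxCompute/EMR runtime rather than the shaded jar. As a quick sanity check that the new fastjson dependency resolves, a minimal usage sketch (plain fastjson 1.x API, not code taken from this repository) could look like:

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;

public class FastjsonSmokeTest {
    public static void main(String[] args) {
        // Round-trip a small feature map through fastjson.
        JSONObject features = new JSONObject();
        features.put("ad_view_1day", "12");
        features.put("ad_click_1day", "3");

        String json = JSON.toJSONString(features);   // serialize
        JSONObject parsed = JSON.parseObject(json);   // parse back

        System.out.println(parsed.getString("ad_view_1day")); // prints 12
    }
}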
+ 198 - 199
src/main/java/examples/dataloader/AdRedisFeatureConstructor.java

@@ -1,199 +1,198 @@
-package examples.dataloader;
-
-
-import com.aliyun.odps.account.Account;
-import com.aliyun.odps.account.AliyunAccount;
-import com.aliyun.odps.data.Record;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdActionFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdRequestContext;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.UserAdFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdItemFeature;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-public class AdRedisFeatureConstructor {
-
-    private static final String BUCKET_NAME = "ali-recommend";
-    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
-
-    static {
-        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
-        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
-        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
-    }
-
-    ;
-
-    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
-
-
-    public static UserAdFeature constructUserFeature(Record record) {
-        UserAdFeature userFeature = new UserAdFeature();
-        userFeature.setMid(record.getString("mids"));
-
-        // 1day features
-        AdActionFeature userAd1dayActionFeature = new AdActionFeature();
-        userAd1dayActionFeature.setAdView(record.getString("ad_view_1day"));
-        userAd1dayActionFeature.setAdClick(record.getString("ad_click_1day"));
-        userAd1dayActionFeature.setAdConversion(record.getString("ad_conversion_1day"));
-        userAd1dayActionFeature.setCtr(record.getString("ad_ctr_1day"));
-        userAd1dayActionFeature.setCvr(record.getString("ad_cvr_1day"));
-        userFeature.setDay1_cnt_features(userAd1dayActionFeature);
-
-
-        // 3day features
-        AdActionFeature userAd3dayActionFeature = new AdActionFeature();
-        userAd3dayActionFeature.setAdView(record.getString("ad_view_3day"));
-        userAd3dayActionFeature.setAdClick(record.getString("ad_click_3day"));
-        userAd3dayActionFeature.setAdConversion(record.getString("ad_conversion_3day"));
-        userAd3dayActionFeature.setCtr(record.getString("ad_ctr_3day"));
-        userAd3dayActionFeature.setCvr(record.getString("ad_cvr_3day"));
-        userFeature.setDay3_cnt_features(userAd3dayActionFeature);
-
-
-        // 7day features
-        AdActionFeature userAd7dayActionFeature = new AdActionFeature();
-        userAd7dayActionFeature.setAdView(record.getString("ad_view_7day"));
-        userAd7dayActionFeature.setAdClick(record.getString("ad_click7day"));
-        userAd7dayActionFeature.setAdConversion(record.getString("ad_conversion_7day"));
-        userAd7dayActionFeature.setCtr(record.getString("ad_ctr_7day"));
-        userAd7dayActionFeature.setCvr(record.getString("ad_cvr_7day"));
-        userFeature.setDay7_cnt_features(userAd7dayActionFeature);
-
-
-        // 3month features
-        AdActionFeature userAd3MonthActionFeature = new AdActionFeature();
-        userAd3MonthActionFeature.setAdView(record.getString("ad_view_3month"));
-        userAd3MonthActionFeature.setAdClick(record.getString("ad_click_3month"));
-        userAd3MonthActionFeature.setAdConversion(record.getString("ad_conversion_3month"));
-        userAd3MonthActionFeature.setCtr(record.getString("ad_ctr_3month"));
-        userAd3MonthActionFeature.setCvr(record.getString("ad_cvr_3month"));
-        userFeature.setMonth3_cnt_features(userAd3MonthActionFeature);
-
-
-        return userFeature;
-    }
-
-
-    public static AdItemFeature constructItemFeature(Record record) {
-        AdItemFeature itemFeature = new AdItemFeature();
-        itemFeature.setAdId(record.getString("creativeid"));
-        // itemFeature.setAdCode(record.getString("adcode"));
-        itemFeature.setCampaignId(record.getString("campaignid"));
-        itemFeature.setAdvertiserId(record.getString("advertiserid"));
-        itemFeature.setCreativeId(record.getString("creativeid"));
-
-        // ad 维度特征
-        AdActionFeature adIdActionFeature1day = new AdActionFeature();
-        adIdActionFeature1day.setAdView(record.getString("view_ad_1day"));
-        adIdActionFeature1day.setAdClick(record.getString("click_ad_1day"));
-        adIdActionFeature1day.setAdConversion(record.getString("conversion_ad_1day"));
-        adIdActionFeature1day.setCtr(record.getString("ctr_ad_1day"));
-        adIdActionFeature1day.setCvr(record.getString("cvr_ad_1day"));
-        itemFeature.setDay1_cnt_features(adIdActionFeature1day);
-
-        AdActionFeature adIdActionFeature3day = new AdActionFeature();
-        adIdActionFeature3day.setAdView(record.getString("view_ad_3day"));
-        adIdActionFeature3day.setAdClick(record.getString("click_ad_3day"));
-        adIdActionFeature3day.setAdConversion(record.getString("conversion_ad_3day"));
-        adIdActionFeature3day.setCtr(record.getString("ctr_ad_3day"));
-        adIdActionFeature3day.setCvr(record.getString("cvr_ad_3day"));
-        itemFeature.setDay3_cnt_features(adIdActionFeature3day);
-
-        AdActionFeature adIdActionFeature7day = new AdActionFeature();
-        adIdActionFeature7day.setAdView(record.getString("view_ad_7day"));
-        adIdActionFeature7day.setAdClick(record.getString("click_ad_7day"));
-        adIdActionFeature7day.setAdConversion(record.getString("conversion_ad_7day"));
-        adIdActionFeature7day.setCtr(record.getString("ctr_ad_7day"));
-        adIdActionFeature7day.setCvr(record.getString("cvr_ad_7day"));
-        itemFeature.setDay7_cnt_features(adIdActionFeature7day);
-
-        AdActionFeature adIdActionFeature3month = new AdActionFeature();
-        adIdActionFeature3month.setAdView(record.getString("view_ad_3month"));
-        adIdActionFeature3month.setAdClick(record.getString("click_ad_3month"));
-        adIdActionFeature3month.setAdConversion(record.getString("conversion_ad_3month"));
-        adIdActionFeature3month.setCtr(record.getString("ctr_ad_3month"));
-        adIdActionFeature3month.setCvr(record.getString("cvr_ad_3month"));
-        itemFeature.setMonth3_cnt_features(adIdActionFeature3month);
-
-        // TODO creativeId等维度特征
-        // creative 维度特征
-        AdActionFeature creativeActionFeature1day = new AdActionFeature();
-        creativeActionFeature1day.setAdView(record.getString("view_creative_1day"));
-        creativeActionFeature1day.setAdClick(record.getString("click_creative_1day"));
-        creativeActionFeature1day.setAdConversion(record.getString("conversion_creative_1day"));
-        creativeActionFeature1day.setCtr(record.getString("ctr_creative_1day"));
-        creativeActionFeature1day.setCvr(record.getString("cvr_creative_1day"));
-        itemFeature.setCreative_1day_cnt_features(creativeActionFeature1day);
-
-        AdActionFeature creativeActionFeature3day = new AdActionFeature();
-        creativeActionFeature3day.setAdView(record.getString("view_creative_3day"));
-        creativeActionFeature3day.setAdClick(record.getString("click_creative_3day"));
-        creativeActionFeature3day.setAdConversion(record.getString("conversion_creative_3day"));
-        creativeActionFeature3day.setCtr(record.getString("ctr_creative_3day"));
-        creativeActionFeature3day.setCvr(record.getString("cvr_creative_3day"));
-        itemFeature.setCreative_3day_cnt_features(creativeActionFeature3day);
-
-        AdActionFeature creativeActionFeature7day = new AdActionFeature();
-        creativeActionFeature7day.setAdView(record.getString("view_creative_7day"));
-        creativeActionFeature7day.setAdClick(record.getString("click_creative_7day"));
-        creativeActionFeature7day.setAdConversion(record.getString("conversion_creative_7day"));
-        creativeActionFeature7day.setCtr(record.getString("ctr_creative_7day"));
-        creativeActionFeature7day.setCvr(record.getString("cvr_creative_7day"));
-        itemFeature.setCreative_7day_cnt_features(creativeActionFeature7day);
-
-
-        AdActionFeature creativeActionFeature3month = new AdActionFeature();
-        creativeActionFeature3month.setAdView(record.getString("view_creative_3month"));
-        creativeActionFeature3month.setAdClick(record.getString("click_creative_3month"));
-        creativeActionFeature3month.setAdConversion(record.getString("conversion_creative_3month"));
-        creativeActionFeature3month.setCtr(record.getString("ctr_creative_3month"));
-        creativeActionFeature3month.setCvr(record.getString("cvr_creative_3month"));
-        itemFeature.setCreative_3month_cnt_features(creativeActionFeature3month);
-
-
-
-        // TODO advertiser维度
-        // advertiser 维度特征
-        AdActionFeature advidActionFeature1day = new AdActionFeature();
-        advidActionFeature1day.setAdView(record.getString("view_advertiser_1day"));
-        advidActionFeature1day.setAdClick(record.getString("click_advertiser_1day"));
-        advidActionFeature1day.setAdConversion(record.getString("conversion_advertiser_1day"));
-        advidActionFeature1day.setCtr(record.getString("ctr_advertiser_1day"));
-        advidActionFeature1day.setCvr(record.getString("cvr_advertiser_1day"));
-        itemFeature.setAdvertiser_1day_cnt_features(advidActionFeature1day);
-
-        AdActionFeature advidActionFeature3day = new AdActionFeature();
-        advidActionFeature3day.setAdView(record.getString("view_advertiser_3day"));
-        advidActionFeature3day.setAdClick(record.getString("click_advertiser_3day"));
-        advidActionFeature3day.setAdConversion(record.getString("conversion_advertiser_3day"));
-        advidActionFeature3day.setCtr(record.getString("ctr_advertiser_3day"));
-        advidActionFeature3day.setCvr(record.getString("cvr_advertiser_3day"));
-        itemFeature.setAdvertiser_3day_cnt_features(advidActionFeature3day);
-
-        AdActionFeature advidActionFeature7day = new AdActionFeature();
-        advidActionFeature7day.setAdView(record.getString("view_advertiser_7day"));
-        advidActionFeature7day.setAdClick(record.getString("click_advertiser_7day"));
-        advidActionFeature7day.setAdConversion(record.getString("conversion_advertiser_7day"));
-        advidActionFeature7day.setCtr(record.getString("ctr_advertiser_7day"));
-        advidActionFeature7day.setCvr(record.getString("cvr_advertiser_7day"));
-        itemFeature.setAdvertiser_7day_cnt_features(advidActionFeature7day);
-
-
-        AdActionFeature advidActionFeature3month = new AdActionFeature();
-        advidActionFeature3month.setAdView(record.getString("view_advertiser_3month"));
-        advidActionFeature3month.setAdClick(record.getString("click_advertiser_3month"));
-        advidActionFeature3month.setAdConversion(record.getString("conversion_advertiser_3month"));
-        advidActionFeature3month.setCtr(record.getString("ctr_advertiser_3month"));
-        advidActionFeature3month.setCvr(record.getString("cvr_advertiser_3month"));
-        itemFeature.setAdvertiser_3month_cnt_features(advidActionFeature3month);
-
-
-
-        return itemFeature;
-    }
-
-
-}
+//package examples.dataloader;
+//
+//
+//import com.aliyun.odps.account.Account;
+//import com.aliyun.odps.account.AliyunAccount;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdActionFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdItemFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.UserAdFeature;
+//
+//
+//import java.util.HashMap;
+//import java.util.Map;
+//
+//public class AdRedisFeatureConstructor {
+//
+//    private static final String BUCKET_NAME = "ali-recommend";
+//    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
+//
+//    static {
+//        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
+//        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
+//        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
+//    }
+//
+//    ;
+//
+//    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
+//
+//
+//    public static UserAdFeature constructUserFeature(Record record) {
+//        UserAdFeature userFeature = new UserAdFeature();
+//        userFeature.setMid(record.getString("mids"));
+//
+//        // 1day features
+//        AdActionFeature userAd1dayActionFeature = new AdActionFeature();
+//        userAd1dayActionFeature.setOriginAdView(record.getString("ad_view_1day"));
+//        userAd1dayActionFeature.setOriginAdClick(record.getString("ad_click_1day"));
+//        userAd1dayActionFeature.setOriginAdConversion(record.getString("ad_conversion_1day"));
+//        userAd1dayActionFeature.setOriginCtr(record.getString("ad_ctr_1day"));
+//        userAd1dayActionFeature.setOriginCvr(record.getString("ad_cvr_1day"));
+//        userFeature.setDay1_cnt_features(userAd1dayActionFeature);
+//
+//
+//        // 3day features
+//        AdActionFeature userAd3dayActionFeature = new AdActionFeature();
+//        userAd1dayActionFeature.setOriginAdView(record.getString("ad_view_3day"));
+//        userAd1dayActionFeature.setOriginAdClick(record.getString("ad_click_3day"));
+//        userAd1dayActionFeature.setOriginAdConversion(record.getString("ad_conversion_3day"));
+//        userAd1dayActionFeature.setOriginCtr(record.getString("ad_ctr_3day"));
+//        userAd1dayActionFeature.setOriginCvr(record.getString("ad_cvr_3day"));
+//        userFeature.setDay3_cnt_features(userAd3dayActionFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature userAd7dayActionFeature = new AdActionFeature();
+//        userAd1dayActionFeature.setOriginAdView(record.getString("ad_view_7day"));
+//        userAd1dayActionFeature.setOriginAdClick(record.getString("ad_click7day"));
+//        userAd1dayActionFeature.setOriginAdConversion(record.getString("ad_conversion_7day"));
+//        userAd1dayActionFeature.setOriginCtr(record.getString("ad_ctr_7day"));
+//        userAd1dayActionFeature.setOriginCvr(record.getString("ad_cvr_7day"));
+//        userFeature.setDay7_cnt_features(userAd7dayActionFeature);
+//
+//
+//        // 3month features
+//        AdActionFeature userAd3MonthActionFeature = new AdActionFeature();
+//        userAd3MonthActionFeature.setOriginAdView(record.getString("ad_view_3month"));
+//        userAd3MonthActionFeature.setOriginAdClick(record.getString("ad_click_3month"));
+//        userAd3MonthActionFeature.setOriginAdConversion(record.getString("ad_conversion_3month"));
+//        userAd3MonthActionFeature.setOriginCtr(record.getString("ad_ctr_3month"));
+//        userAd3MonthActionFeature.setOriginCvr(record.getString("ad_cvr_3month"));
+//        userFeature.setMonth3_cnt_features(userAd3MonthActionFeature);
+//
+//
+//        return userFeature;
+//    }
+//
+//
+//    public static AdItemFeature constructItemFeature(Record record) {
+//        AdItemFeature itemFeature = new AdItemFeature();
+//        itemFeature.setAdId(record.getString("creativeid"));
+//        // itemFeature.setAdCode(record.getString("adcode"));
+//        itemFeature.setCampaignId(record.getString("campaignid"));
+//        itemFeature.setAdvertiserId(record.getString("advertiserid"));
+//        itemFeature.setCreativeId(record.getString("creativeid"));
+//
+//        // ad 维度特征
+//        AdActionFeature adIdActionFeature1day = new AdActionFeature();
+//        adIdActionFeature1day.setOriginAdView(record.getString("view_ad_1day"));
+//        adIdActionFeature1day.setOriginAdClick(record.getString("click_ad_1day"));
+//        adIdActionFeature1day.setOriginAdConversion(record.getString("conversion_ad_1day"));
+//        adIdActionFeature1day.setOriginCtr(record.getString("ctr_ad_1day"));
+//        adIdActionFeature1day.setOriginCvr(record.getString("cvr_ad_1day"));
+//        itemFeature.setDay1_cnt_features(adIdActionFeature1day);
+//
+//        AdActionFeature adIdActionFeature3day = new AdActionFeature();
+//        adIdActionFeature3day.setOriginAdView(record.getString("view_ad_3day"));
+//        adIdActionFeature3day.setOriginAdClick(record.getString("click_ad_3day"));
+//        adIdActionFeature3day.setOriginAdConversion(record.getString("conversion_ad_3day"));
+//        adIdActionFeature3day.setOriginCtr(record.getString("ctr_ad_3day"));
+//        adIdActionFeature3day.setOriginCvr(record.getString("cvr_ad_3day"));
+//        itemFeature.setDay3_cnt_features(adIdActionFeature3day);
+//
+//        AdActionFeature adIdActionFeature7day = new AdActionFeature();
+//        adIdActionFeature7day.setOriginAdView(record.getString("view_ad_7day"));
+//        adIdActionFeature7day.setOriginAdClick(record.getString("click_ad_7day"));
+//        adIdActionFeature7day.setOriginAdConversion(record.getString("conversion_ad_7day"));
+//        adIdActionFeature7day.setOriginCtr(record.getString("ctr_ad_7day"));
+//        adIdActionFeature7day.setOriginCvr(record.getString("cvr_ad_7day"));
+//        itemFeature.setDay7_cnt_features(adIdActionFeature7day);
+//
+//        AdActionFeature adIdActionFeature3month = new AdActionFeature();
+//        adIdActionFeature3month.setOriginAdView(record.getString("view_ad_3month"));
+//        adIdActionFeature3month.setOriginAdClick(record.getString("click_ad_3month"));
+//        adIdActionFeature3month.setOriginAdConversion(record.getString("conversion_ad_3month"));
+//        adIdActionFeature3month.setOriginCtr(record.getString("ctr_ad_3month"));
+//        adIdActionFeature3month.setOriginCvr(record.getString("cvr_ad_3month"));
+//        itemFeature.setMonth3_cnt_features(adIdActionFeature3month);
+//
+//        // TODO creativeId等维度特征
+//        // creative 维度特征
+//        AdActionFeature creativeActionFeature1day = new AdActionFeature();
+//        creativeActionFeature1day.setOriginAdView(record.getString("view_creative_1day"));
+//        creativeActionFeature1day.setOriginAdClick(record.getString("click_creative_1day"));
+//        creativeActionFeature1day.setOriginAdConversion(record.getString("conversion_creative_1day"));
+//        creativeActionFeature1day.setOriginCtr(record.getString("ctr_creative_1day"));
+//        creativeActionFeature1day.setOriginCvr(record.getString("cvr_creative_1day"));
+//        itemFeature.setCreative_1day_cnt_features(creativeActionFeature1day);
+//
+//        AdActionFeature creativeActionFeature3day = new AdActionFeature();
+//        creativeActionFeature3day.setOriginAdView(record.getString("view_creative_3day"));
+//        creativeActionFeature3day.setOriginAdClick(record.getString("click_creative_3day"));
+//        creativeActionFeature3day.setOriginAdConversion(record.getString("conversion_creative_3day"));
+//        creativeActionFeature3day.setOriginCtr(record.getString("ctr_creative_3day"));
+//        creativeActionFeature3day.setOriginCvr(record.getString("cvr_creative_3day"));
+//        itemFeature.setCreative_3day_cnt_features(creativeActionFeature3day);
+//
+//        AdActionFeature creativeActionFeature7day = new AdActionFeature();
+//        creativeActionFeature7day.setOriginAdView(record.getString("view_creative_7day"));
+//        creativeActionFeature7day.setOriginAdClick(record.getString("click_creative_7day"));
+//        creativeActionFeature7day.setOriginAdConversion(record.getString("conversion_creative_7day"));
+//        creativeActionFeature7day.setOriginCtr(record.getString("ctr_creative_7day"));
+//        creativeActionFeature7day.setOriginCvr(record.getString("cvr_creative_7day"));
+//        itemFeature.setCreative_7day_cnt_features(creativeActionFeature7day);
+//
+//
+//        AdActionFeature creativeActionFeature3month = new AdActionFeature();
+//        creativeActionFeature3month.setOriginAdView(record.getString("view_creative_3month"));
+//        creativeActionFeature3month.setOriginAdClick(record.getString("click_creative_3month"));
+//        creativeActionFeature3month.setOriginAdConversion(record.getString("conversion_creative_3month"));
+//        creativeActionFeature3month.setOriginCtr(record.getString("ctr_creative_3month"));
+//        creativeActionFeature3month.setOriginCvr(record.getString("cvr_creative_3month"));
+//        itemFeature.setCreative_3month_cnt_features(creativeActionFeature3month);
+//
+//
+//
+//        // TODO advertiser维度
+//        // advertiser 维度特征
+//        AdActionFeature advidActionFeature1day = new AdActionFeature();
+//        advidActionFeature1day.setOriginAdView(record.getString("view_advertiser_1day"));
+//        advidActionFeature1day.setOriginAdClick(record.getString("click_advertiser_1day"));
+//        advidActionFeature1day.setOriginAdConversion(record.getString("conversion_advertiser_1day"));
+//        advidActionFeature1day.setOriginCtr(record.getString("ctr_advertiser_1day"));
+//        advidActionFeature1day.setOriginCvr(record.getString("cvr_advertiser_1day"));
+//        itemFeature.setAdvertiser_1day_cnt_features(advidActionFeature1day);
+//
+//        AdActionFeature advidActionFeature3day = new AdActionFeature();
+//        advidActionFeature3day.setOriginAdView(record.getString("view_advertiser_3day"));
+//        advidActionFeature3day.setOriginAdClick(record.getString("click_advertiser_3day"));
+//        advidActionFeature3day.setOriginAdConversion(record.getString("conversion_advertiser_3day"));
+//        advidActionFeature3day.setOriginCtr(record.getString("ctr_advertiser_3day"));
+//        advidActionFeature3day.setOriginCvr(record.getString("cvr_advertiser_3day"));
+//        itemFeature.setAdvertiser_3day_cnt_features(advidActionFeature3day);
+//
+//        AdActionFeature advidActionFeature7day = new AdActionFeature();
+//        advidActionFeature7day.setOriginAdView(record.getString("view_advertiser_7day"));
+//        advidActionFeature7day.setOriginAdClick(record.getString("click_advertiser_7day"));
+//        advidActionFeature7day.setOriginAdConversion(record.getString("conversion_advertiser_7day"));
+//        advidActionFeature7day.setOriginCtr(record.getString("ctr_advertiser_7day"));
+//        advidActionFeature7day.setOriginCvr(record.getString("cvr_advertiser_7day"));
+//        itemFeature.setAdvertiser_7day_cnt_features(advidActionFeature7day);
+//
+//
+//        AdActionFeature advidActionFeature3month = new AdActionFeature();
+//        advidActionFeature3month.setOriginAdView(record.getString("view_advertiser_3month"));
+//        advidActionFeature3month.setOriginAdClick(record.getString("click_advertiser_3month"));
+//        advidActionFeature3month.setOriginAdConversion(record.getString("conversion_advertiser_3month"));
+//        advidActionFeature3month.setOriginCtr(record.getString("ctr_advertiser_3month"));
+//        advidActionFeature3month.setOriginCvr(record.getString("cvr_advertiser_3month"));
+//        itemFeature.setAdvertiser_3month_cnt_features(advidActionFeature3month);
+//
+//
+//
+//        return itemFeature;
+//    }
+//
+//
+//}
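The replacement version of AdRedisFeatureConstructor is committed fully commented out; its imports point at com.tzld.piaoquan.ad.engine.commons.base instead of com.tzld.piaoquan.recommend.feature.domain.ad.base, and the setters become setOriginAdView / setOriginAdClick / setOriginAdConversion / setOriginCtr / setOriginCvr. Note that in constructUserFeature the 3day and 7day blocks still write into userAd1dayActionFeature before attaching userAd3dayActionFeature and userAd7dayActionFeature, so those windows would stay empty if the class were re-enabled. A small helper along the lines below (a sketch only, assuming the setOrigin* setters keep the String parameters shown in the diff and that AdActionFeature and Record are the classes referenced above) would remove both the repetition and the copy-paste risk:

// Sketch: build one AdActionFeature per (dimension, time-window) pair from
// explicit column names, instead of five hand-copied setter calls each time.
private static AdActionFeature toActionFeature(Record record,
                                               String viewCol, String clickCol, String convCol,
                                               String ctrCol, String cvrCol) {
    AdActionFeature f = new AdActionFeature();
    f.setOriginAdView(record.getString(viewCol));
    f.setOriginAdClick(record.getString(clickCol));
    f.setOriginAdConversion(record.getString(convCol));
    f.setOriginCtr(record.getString(ctrCol));
    f.setOriginCvr(record.getString(cvrCol));
    return f;
}

// Usage for the 3-day user window (currently mis-assigned to the 1-day object above):
// userFeature.setDay3_cnt_features(toActionFeature(record,
//         "ad_view_3day", "ad_click_3day", "ad_conversion_3day",
//         "ad_ctr_3day", "ad_cvr_3day"));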

+ 223 - 223
src/main/java/examples/dataloader/AdSampleConstructor.java

@@ -1,223 +1,223 @@
-package examples.dataloader;
-
-
-import com.aliyun.odps.account.Account;
-import com.aliyun.odps.account.AliyunAccount;
-import com.aliyun.odps.data.Record;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdActionFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdRequestContext;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.UserAdFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdItemFeature;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-public class AdSampleConstructor {
-
-    private static final String BUCKET_NAME = "ali-recommend";
-    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
-
-    static {
-        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
-        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
-        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
-    }
-
-    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
-
-
-    public static AdRequestContext constructRequestContext(Record record) {
-        AdRequestContext requestContext = new AdRequestContext();
-        requestContext.setApptype(record.getString("apptype"));
-        requestContext.setMachineinfoBrand(record.getString("machineinfo_brand"));
-        requestContext.setMachineinfoModel(record.getString("machineinfo_model"));
-        requestContext.setMachineinfoSdkversion(record.getString("machineinfo_sdkversion"));
-        requestContext.setMachineinfoWchatversion(record.getString("machineinfo_wechatversion"));
-
-
-        requestContext.setDay(record.getString("ctx_day"));
-        requestContext.setWeek(record.getString("ctx_week"));
-        requestContext.setHour(record.getString("ctx_hour"));
-        requestContext.setRegion(record.getString("province"));
-        requestContext.setCity(record.getString("city"));
-        return requestContext;
-    }
-
-
-    public static UserAdFeature constructUserFeature(Record record) {
-        UserAdFeature userFeature = new UserAdFeature();
-        userFeature.setMid(record.getString("machinecode"));
-
-        // 1day features
-        AdActionFeature user1dayActionFeature = new AdActionFeature();
-        user1dayActionFeature.setAdView(record.getString("user_view_1day"));
-        user1dayActionFeature.setAdClick(record.getString("user_click_1day"));
-        user1dayActionFeature.setAdConversion(record.getString("user_conversion_1day"));
-        user1dayActionFeature.setCtr(record.getString("user_ctr_1day"));
-        user1dayActionFeature.setCvr(record.getString("user_cvr_1day"));
-        userFeature.setDay1_cnt_features(user1dayActionFeature);
-
-        // 3day features
-        AdActionFeature user3dayActionFeature = new AdActionFeature();
-        user3dayActionFeature.setAdView(record.getString("user_view_3day"));
-        user3dayActionFeature.setAdClick(record.getString("user_click_3day"));
-        user3dayActionFeature.setAdConversion(record.getString("user_conversion_3day"));
-        user3dayActionFeature.setCtr(record.getString("user_ctr_3day"));
-        user3dayActionFeature.setCvr(record.getString("user_cvr_3day"));
-        userFeature.setDay3_cnt_features(user3dayActionFeature);
-
-
-        // 7day features
-        AdActionFeature user7dayActionFeature = new AdActionFeature();
-        user7dayActionFeature.setAdView(record.getString("user_view_7day"));
-        user7dayActionFeature.setAdClick(record.getString("user_click7day"));
-        user7dayActionFeature.setAdConversion(record.getString("user_conversion_7day"));
-        user7dayActionFeature.setCtr(record.getString("user_ctr_7day"));
-        user7dayActionFeature.setCvr(record.getString("user_cvr_7day"));
-        userFeature.setDay7_cnt_features(user7dayActionFeature);
-
-        // 3month features
-        AdActionFeature user3MonthActionFeature = new AdActionFeature();
-        user3MonthActionFeature.setAdView(record.getString("user_view_3month"));
-        user3MonthActionFeature.setAdClick(record.getString("user_click_3month"));
-        user3MonthActionFeature.setAdConversion(record.getString("user_conversion_3month"));
-        user3MonthActionFeature.setCtr(record.getString("user_ctr_3month"));
-        user3MonthActionFeature.setCvr(record.getString("user_cvr_3month"));
-        userFeature.setMonth3_cnt_features(user3MonthActionFeature);
-
-        return userFeature;
-    }
-
-
-    public static AdItemFeature constructItemFeature(Record record) {
-        AdItemFeature itemFeature = new AdItemFeature();
-
-
-        itemFeature.setAdId(record.getString("adid"));
-        // itemFeature.setAdCode(record.getString("adcode"));
-        itemFeature.setAdvertiserId(record.getString("advertiserid"));
-        itemFeature.setCampaignId(record.getString("campaignid"));
-        itemFeature.setCreativeId(record.getString("creativeid"));
-
-        // 1day features
-        AdActionFeature user1dayActionFeature = new AdActionFeature();
-        user1dayActionFeature.setAdView(record.getString("ad_view_1day"));
-        user1dayActionFeature.setAdClick(record.getString("ad_click_1day"));
-        user1dayActionFeature.setAdConversion(record.getString("ad_conversion_1day"));
-        user1dayActionFeature.setCtr(record.getString("ad_ctr_1day"));
-        user1dayActionFeature.setCvr(record.getString("ad_cvr_1day"));
-        itemFeature.setDay1_cnt_features(user1dayActionFeature);
-
-        // 3day features
-        AdActionFeature user3dayActionFeature = new AdActionFeature();
-        user3dayActionFeature.setAdView(record.getString("ad_view_3day"));
-        user3dayActionFeature.setAdClick(record.getString("ad_click_3day"));
-        user3dayActionFeature.setAdConversion(record.getString("ad_conversion_3day"));
-        user3dayActionFeature.setCtr(record.getString("ad_ctr_3day"));
-        user3dayActionFeature.setCvr(record.getString("ad_cvr_3day"));
-        itemFeature.setDay3_cnt_features(user3dayActionFeature);
-
-
-        // 7day features
-        AdActionFeature user7dayActionFeature = new AdActionFeature();
-        user7dayActionFeature.setAdView(record.getString("ad_view_7day"));
-        user7dayActionFeature.setAdClick(record.getString("ad_click_7day"));
-        user7dayActionFeature.setAdConversion(record.getString("ad_conversion_7day"));
-        user7dayActionFeature.setCtr(record.getString("ad_ctr_7day"));
-        user7dayActionFeature.setCvr(record.getString("ad_cvr_7day"));
-        itemFeature.setDay7_cnt_features(user7dayActionFeature);
-
-        // 3month features
-        AdActionFeature user3MonthActionFeature = new AdActionFeature();
-        user3MonthActionFeature.setAdView(record.getString("ad_view_3month"));
-        user3MonthActionFeature.setAdClick(record.getString("ad_click_3month"));
-        user3MonthActionFeature.setAdConversion(record.getString("ad_conversion_3month"));
-        user3MonthActionFeature.setCtr(record.getString("ad_ctr_3month"));
-        user3MonthActionFeature.setCvr(record.getString("ad_cvr_3month"));
-        itemFeature.setMonth3_cnt_features(user3MonthActionFeature);
-
-
-        //TODO  CREATIVE 维度  需要在样本中补齐
-        AdActionFeature creative1dayFeature = new AdActionFeature();
-        creative1dayFeature.setAdView(record.getString("view_creative_1day"));
-        creative1dayFeature.setAdClick(record.getString("click_creative_1day"));
-        creative1dayFeature.setAdConversion(record.getString("conversion_creative_1day"));
-        creative1dayFeature.setCtr(record.getString("ctr_creative_1day"));
-        creative1dayFeature.setCvr(record.getString("cvr_creative_1day"));
-        itemFeature.setCreative_1day_cnt_features(creative1dayFeature);
-
-        // 3day features
-        AdActionFeature creative3dayFeature = new AdActionFeature();
-        creative3dayFeature.setAdView(record.getString("view_creative_3day"));
-        creative3dayFeature.setAdClick(record.getString("click_creative_3day"));
-        creative3dayFeature.setAdConversion(record.getString("conversion_creative_3day"));
-        creative3dayFeature.setCtr(record.getString("ctr_creative_3day"));
-        creative3dayFeature.setCvr(record.getString("cvr_creative_3day"));
-        itemFeature.setCreative_3day_cnt_features(creative3dayFeature);
-
-
-        // 7day features
-        AdActionFeature creative7dayFeature = new AdActionFeature();
-        creative7dayFeature.setAdView(record.getString("view_creative_7day"));
-        creative7dayFeature.setAdClick(record.getString("click_creative_7day"));
-        creative7dayFeature.setAdConversion(record.getString("conversion_creative_7day"));
-        creative7dayFeature.setCtr(record.getString("ctr_creative_7day"));
-        creative7dayFeature.setCvr(record.getString("cvr_creative_7day"));
-        itemFeature.setCreative_7day_cnt_features(creative7dayFeature);
-
-        // 3month features
-        AdActionFeature creative3MonthFeature = new AdActionFeature();
-        creative3MonthFeature.setAdView(record.getString("view_creative_3month"));
-        creative3MonthFeature.setAdClick(record.getString("click_creative_3month"));
-        creative3MonthFeature.setAdConversion(record.getString("conversion_creative_3month"));
-        creative3MonthFeature.setCtr(record.getString("ctr_creative_3month"));
-        creative3MonthFeature.setCvr(record.getString("cvr_creative_3month"));
-        itemFeature.setCreative_3month_cnt_features(creative3MonthFeature);
-
-
-        // advertiser id
-        // 1day features
-        AdActionFeature advertiser1dayFeature = new AdActionFeature();
-        advertiser1dayFeature.setAdView(record.getString("advertiser_view_1day"));
-        advertiser1dayFeature.setAdClick(record.getString("advertiser_click_1day"));
-        advertiser1dayFeature.setAdConversion(record.getString("advertiser_conversion_1day"));
-        advertiser1dayFeature.setCtr(record.getString("advertiser_ctr_1day"));
-        advertiser1dayFeature.setCvr(record.getString("advertiser_cvr_1day"));
-        itemFeature.setAdvertiser_1day_cnt_features(advertiser1dayFeature);
-
-        // 3day features
-        AdActionFeature advertiser3dayFeature = new AdActionFeature();
-        advertiser3dayFeature.setAdView(record.getString("advertiser_view_3day"));
-        advertiser3dayFeature.setAdClick(record.getString("advertiser_click_3day"));
-        advertiser3dayFeature.setAdConversion(record.getString("advertiser_conversion_3day"));
-        advertiser3dayFeature.setCtr(record.getString("advertiser_ctr_3day"));
-        advertiser3dayFeature.setCvr(record.getString("advertiser_cvr_3day"));
-        itemFeature.setAdvertiser_3day_cnt_features(advertiser3dayFeature);
-
-
-        // 7day features
-        AdActionFeature advertiser7dayFeature = new AdActionFeature();
-        advertiser7dayFeature.setAdView(record.getString("advertiser_view_7day"));
-        advertiser7dayFeature.setAdClick(record.getString("advertiser_click_7day"));
-        advertiser7dayFeature.setAdConversion(record.getString("advertiser_conversion_7day"));
-        advertiser7dayFeature.setCtr(record.getString("advertiser_ctr_7day"));
-        advertiser7dayFeature.setCvr(record.getString("advertiser_cvr_7day"));
-        itemFeature.setAdvertiser_7day_cnt_features(advertiser7dayFeature);
-
-        // 3month features
-        AdActionFeature advertiser3monthFeature = new AdActionFeature();
-        advertiser3monthFeature.setAdView(record.getString("advertiser_view_3month"));
-        advertiser3monthFeature.setAdClick(record.getString("advertiser_view_3month"));
-        advertiser3monthFeature.setAdConversion(record.getString("advertiser_conversion_3month"));
-        advertiser3monthFeature.setCtr(record.getString("advertiser_ctr_3month"));
-        advertiser3monthFeature.setCvr(record.getString("advertiser_cvr_3month"));
-        itemFeature.setAdvertiser_3month_cnt_features(advertiser3monthFeature);
-
-
-
-        return itemFeature;
-    }
-
-
-}
+//package examples.dataloader;
+//
+//
+//import com.aliyun.odps.account.Account;
+//import com.aliyun.odps.account.AliyunAccount;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdActionFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdRequestContext;
+//import com.tzld.piaoquan.ad.engine.commons.base.UserAdFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdItemFeature;
+//
+//
+//import java.util.HashMap;
+//import java.util.Map;
+//
+//public class AdSampleConstructor {
+//
+//    private static final String BUCKET_NAME = "ali-recommend";
+//    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
+//
+//    static {
+//        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
+//        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
+//        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
+//    }
+//
+//    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
+//
+//
+//    public static AdRequestContext constructRequestContext(Record record) {
+//        AdRequestContext requestContext = new AdRequestContext();
+//        requestContext.setApptype(record.getString("apptype"));
+//        requestContext.setMachineinfoBrand(record.getString("machineinfo_brand"));
+//        requestContext.setMachineinfoModel(record.getString("machineinfo_model"));
+//        requestContext.setMachineinfoSdkversion(record.getString("machineinfo_sdkversion"));
+//        requestContext.setMachineinfoWchatversion(record.getString("machineinfo_wechatversion"));
+//
+//
+//        requestContext.setDay(record.getString("ctx_day"));
+//        requestContext.setWeek(record.getString("ctx_week"));
+//        requestContext.setHour(record.getString("ctx_hour"));
+//        requestContext.setRegion(record.getString("province"));
+//        requestContext.setCity(record.getString("city"));
+//        return requestContext;
+//    }
+//
+//
+//    public static UserAdFeature constructUserFeature(Record record) {
+//        UserAdFeature userFeature = new UserAdFeature();
+//        userFeature.setMid(record.get("machinecode").toString());
+//
+//        // 1day features
+//        AdActionFeature user1dayActionFeature = new AdActionFeature();
+//        user1dayActionFeature.setAdView(record.getString("user_view_1day"));
+//        user1dayActionFeature.setAdClick(record.getString("user_click_1day"));
+//        user1dayActionFeature.setAdConversion(record.getString("user_conversion_1day"));
+//        user1dayActionFeature.setCtr(record.getString("user_ctr_1day"));
+//        user1dayActionFeature.setCvr(record.getString("user_cvr_1day"));
+//        userFeature.setDay1_cnt_features(user1dayActionFeature);
+//
+//        // 3day features
+//        AdActionFeature user3dayActionFeature = new AdActionFeature();
+//        user3dayActionFeature.setAdView(record.getString("user_view_3day"));
+//        user3dayActionFeature.setAdClick(record.getString("user_click_3day"));
+//        user3dayActionFeature.setAdConversion(record.getString("user_conversion_3day"));
+//        user3dayActionFeature.setCtr(record.getString("user_ctr_3day"));
+//        user3dayActionFeature.setCvr(record.getString("user_cvr_3day"));
+//        userFeature.setDay3_cnt_features(user3dayActionFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature user7dayActionFeature = new AdActionFeature();
+//        user7dayActionFeature.setAdView(record.getString("user_view_7day"));
+//        user7dayActionFeature.setAdClick(record.getString("user_click7day"));
+//        user7dayActionFeature.setAdConversion(record.getString("user_conversion_7day"));
+//        user7dayActionFeature.setCtr(record.getString("user_ctr_7day"));
+//        user7dayActionFeature.setCvr(record.getString("user_cvr_7day"));
+//        userFeature.setDay7_cnt_features(user7dayActionFeature);
+//
+//        // 3month features
+//        AdActionFeature user3MonthActionFeature = new AdActionFeature();
+//        user3MonthActionFeature.setAdView(record.getString("user_view_3month"));
+//        user3MonthActionFeature.setAdClick(record.getString("user_click_3month"));
+//        user3MonthActionFeature.setAdConversion(record.getString("user_conversion_3month"));
+//        user3MonthActionFeature.setCtr(record.getString("user_ctr_3month"));
+//        user3MonthActionFeature.setCvr(record.getString("user_cvr_3month"));
+//        userFeature.setMonth3_cnt_features(user3MonthActionFeature);
+//
+//        return userFeature;
+//    }
+//
+//
+//    public static AdItemFeature constructItemFeature(Record record) {
+//        AdItemFeature itemFeature = new AdItemFeature();
+//
+//
+//        itemFeature.setAdId(record.getString("adid"));
+//        // itemFeature.setAdCode(record.getString("adcode"));
+//        itemFeature.setAdvertiserId(record.getString("advertiserid"));
+//        itemFeature.setCampaignId(record.getString("campaignid"));
+//        itemFeature.setCreativeId(record.getString("creativeid"));
+//
+//        // 1day features
+//        AdActionFeature user1dayActionFeature = new AdActionFeature();
+//        user1dayActionFeature.setAdView(record.getString("ad_view_1day"));
+//        user1dayActionFeature.setAdClick(record.getString("ad_click_1day"));
+//        user1dayActionFeature.setAdConversion(record.getString("ad_conversion_1day"));
+//        user1dayActionFeature.setCtr(record.getString("ad_ctr_1day"));
+//        user1dayActionFeature.setCvr(record.getString("ad_cvr_1day"));
+//        itemFeature.setDay1_cnt_features(user1dayActionFeature);
+//
+//        // 3day features
+//        AdActionFeature user3dayActionFeature = new AdActionFeature();
+//        user3dayActionFeature.setAdView(record.getString("ad_view_3day"));
+//        user3dayActionFeature.setAdClick(record.getString("ad_click_3day"));
+//        user3dayActionFeature.setAdConversion(record.getString("ad_conversion_3day"));
+//        user3dayActionFeature.setCtr(record.getString("ad_ctr_3day"));
+//        user3dayActionFeature.setCvr(record.getString("ad_cvr_3day"));
+//        itemFeature.setDay3_cnt_features(user3dayActionFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature user7dayActionFeature = new AdActionFeature();
+//        user7dayActionFeature.setAdView(record.getString("ad_view_7day"));
+//        user7dayActionFeature.setAdClick(record.getString("ad_click_7day"));
+//        user7dayActionFeature.setAdConversion(record.getString("ad_conversion_7day"));
+//        user7dayActionFeature.setCtr(record.getString("ad_ctr_7day"));
+//        user7dayActionFeature.setCvr(record.getString("ad_cvr_7day"));
+//        itemFeature.setDay7_cnt_features(user7dayActionFeature);
+//
+//        // 3month features
+//        AdActionFeature user3MonthActionFeature = new AdActionFeature();
+//        user3MonthActionFeature.setAdView(record.getString("ad_view_3month"));
+//        user3MonthActionFeature.setAdClick(record.getString("ad_click_3month"));
+//        user3MonthActionFeature.setAdConversion(record.getString("ad_conversion_3month"));
+//        user3MonthActionFeature.setCtr(record.getString("ad_ctr_3month"));
+//        user3MonthActionFeature.setCvr(record.getString("ad_cvr_3month"));
+//        itemFeature.setMonth3_cnt_features(user3MonthActionFeature);
+//
+//
+//        //TODO: CREATIVE-dimension features still need to be back-filled in the samples
+//        AdActionFeature creative1dayFeature = new AdActionFeature();
+//        creative1dayFeature.setAdView(record.getString("view_creative_1day"));
+//        creative1dayFeature.setAdClick(record.getString("click_creative_1day"));
+//        creative1dayFeature.setAdConversion(record.getString("conversion_creative_1day"));
+//        creative1dayFeature.setCtr(record.getString("ctr_creative_1day"));
+//        creative1dayFeature.setCvr(record.getString("cvr_creative_1day"));
+//        itemFeature.setCreative_1day_cnt_features(creative1dayFeature);
+//
+//        // 3day features
+//        AdActionFeature creative3dayFeature = new AdActionFeature();
+//        creative3dayFeature.setAdView(record.getString("view_creative_3day"));
+//        creative3dayFeature.setAdClick(record.getString("click_creative_3day"));
+//        creative3dayFeature.setAdConversion(record.getString("conversion_creative_3day"));
+//        creative3dayFeature.setCtr(record.getString("ctr_creative_3day"));
+//        creative3dayFeature.setCvr(record.getString("cvr_creative_3day"));
+//        itemFeature.setCreative_3day_cnt_features(creative3dayFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature creative7dayFeature = new AdActionFeature();
+//        creative7dayFeature.setAdView(record.getString("view_creative_7day"));
+//        creative7dayFeature.setAdClick(record.getString("click_creative_7day"));
+//        creative7dayFeature.setAdConversion(record.getString("conversion_creative_7day"));
+//        creative7dayFeature.setCtr(record.getString("ctr_creative_7day"));
+//        creative7dayFeature.setCvr(record.getString("cvr_creative_7day"));
+//        itemFeature.setCreative_7day_cnt_features(creative7dayFeature);
+//
+//        // 3month features
+//        AdActionFeature creative3MonthFeature = new AdActionFeature();
+//        creative3MonthFeature.setAdView(record.getString("view_creative_3month"));
+//        creative3MonthFeature.setAdClick(record.getString("click_creative_3month"));
+//        creative3MonthFeature.setAdConversion(record.getString("conversion_creative_3month"));
+//        creative3MonthFeature.setCtr(record.getString("ctr_creative_3month"));
+//        creative3MonthFeature.setCvr(record.getString("cvr_creative_3month"));
+//        itemFeature.setCreative_3month_cnt_features(creative3MonthFeature);
+//
+//
+//        // advertiser id
+//        // 1day features
+//        AdActionFeature advertiser1dayFeature = new AdActionFeature();
+//        advertiser1dayFeature.setAdView(record.getString("advertiser_view_1day"));
+//        advertiser1dayFeature.setAdClick(record.getString("advertiser_click_1day"));
+//        advertiser1dayFeature.setAdConversion(record.getString("advertiser_conversion_1day"));
+//        advertiser1dayFeature.setCtr(record.getString("advertiser_ctr_1day"));
+//        advertiser1dayFeature.setCvr(record.getString("advertiser_cvr_1day"));
+//        itemFeature.setAdvertiser_1day_cnt_features(advertiser1dayFeature);
+//
+//        // 3day features
+//        AdActionFeature advertiser3dayFeature = new AdActionFeature();
+//        advertiser3dayFeature.setAdView(record.getString("advertiser_view_3day"));
+//        advertiser3dayFeature.setAdClick(record.getString("advertiser_click_3day"));
+//        advertiser3dayFeature.setAdConversion(record.getString("advertiser_conversion_3day"));
+//        advertiser3dayFeature.setCtr(record.getString("advertiser_ctr_3day"));
+//        advertiser3dayFeature.setCvr(record.getString("advertiser_cvr_3day"));
+//        itemFeature.setAdvertiser_3day_cnt_features(advertiser3dayFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature advertiser7dayFeature = new AdActionFeature();
+//        advertiser7dayFeature.setAdView(record.getString("advertiser_view_7day"));
+//        advertiser7dayFeature.setAdClick(record.getString("advertiser_click_7day"));
+//        advertiser7dayFeature.setAdConversion(record.getString("advertiser_conversion_7day"));
+//        advertiser7dayFeature.setCtr(record.getString("advertiser_ctr_7day"));
+//        advertiser7dayFeature.setCvr(record.getString("advertiser_cvr_7day"));
+//        itemFeature.setAdvertiser_7day_cnt_features(advertiser7dayFeature);
+//
+//        // 3month features
+//        AdActionFeature advertiser3monthFeature = new AdActionFeature();
+//        advertiser3monthFeature.setAdView(record.getString("advertiser_view_3month"));
+//        advertiser3monthFeature.setAdClick(record.getString("advertiser_click_3month"));
+//        advertiser3monthFeature.setAdConversion(record.getString("advertiser_conversion_3month"));
+//        advertiser3monthFeature.setCtr(record.getString("advertiser_ctr_3month"));
+//        advertiser3monthFeature.setCvr(record.getString("advertiser_cvr_3month"));
+//        itemFeature.setAdvertiser_3month_cnt_features(advertiser3monthFeature);
+//
+//
+//
+//        return itemFeature;
+//    }
+//
+//
+//}

+ 80 - 0
src/main/java/examples/dataloader/OfflineVlogFeatureGroup.java

@@ -0,0 +1,80 @@
+package examples.dataloader;
+
+public enum OfflineVlogFeatureGroup {
+
+    machineinfo_brand,
+    machineinfo_model,
+    machineinfo_platform,
+    machineinfo_system,
+    u_1day_exp_cnt,
+    u_1day_click_cnt,
+    u_1day_share_cnt,
+    u_1day_return_cnt,
+    u_ctr_1day,
+    u_str_1day,
+    u_rov_1day,
+    u_ros_1day,
+
+    u_3day_exp_cnt,
+    u_3day_click_cnt,
+    u_3day_share_cnt,
+    u_3day_return_cnt,
+    u_ctr_3day,
+    u_str_3day,
+    u_rov_3day,
+    u_ros_3day,
+
+
+    total_time,
+
+    play_count_total,
+    i_1day_exp_cnt,
+    i_1day_click_cnt,
+    i_1day_share_cnt,
+    i_1day_return_cnt,
+    i_ctr_1day,
+    i_str_1day,
+    i_rov_1day,
+    i_ros_1day,
+
+    i_3day_exp_cnt,
+    i_3day_click_cnt,
+    i_3day_share_cnt,
+    i_3day_return_cnt,
+    i_ctr_3day,
+    i_str_3day,
+    i_rov_3day,
+    i_ros_3day,
+
+    ctx_week,
+    ctx_hour,
+    ctx_region,
+    ctx_city,
+    ;
+
+
+    private final byte[] idBytes;
+    private final byte[] nameBytes;
+
+    OfflineVlogFeatureGroup() {
+        this.idBytes = String.valueOf(ordinal()).getBytes();
+        this.nameBytes = name().toLowerCase().getBytes();
+    }
+
+    public final int getId() {
+        return ordinal();
+    }
+
+    public final String getGroupName() {
+        return name().toLowerCase();
+    }
+
+    public final byte[] getGroupNameBytes() {
+        return getGroupName().getBytes();
+    }
+
+    public final byte[] getIdBytes() {
+        return idBytes;
+    }
+
+}
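
A minimal usage sketch (editor's illustration, not part of the commit; the demo class name is hypothetical). The enum ordinal doubles as the feature-group id, so the declaration order above is effectively a data contract:

    import examples.dataloader.OfflineVlogFeatureGroup;

    public class OfflineVlogFeatureGroupDemo {
        public static void main(String[] args) {
            OfflineVlogFeatureGroup g = OfflineVlogFeatureGroup.u_1day_exp_cnt;
            System.out.println(g.getId());        // 4 -- the enum ordinal
            System.out.println(g.getGroupName()); // "u_1day_exp_cnt"
            // look a group up by name, as the extractors below do
            System.out.println(OfflineVlogFeatureGroup.valueOf("ctx_hour").getId());
        }
    }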

+ 81 - 0
src/main/java/examples/dataloader/OfflineVlogFeatureGroupV1.java

@@ -0,0 +1,81 @@
+package examples.dataloader;
+
+public enum OfflineVlogFeatureGroupV1 {
+
+    machineinfo_brand,
+    machineinfo_model,
+    machineinfo_platform,
+    machineinfo_system,
+    u_1day_exp_cnt,
+    u_1day_click_cnt,
+    u_1day_share_cnt,
+    u_1day_return_cnt,
+    u_1day_ctr,
+    u_1day_str,
+    u_1day_rov,
+    u_1day_ros,
+
+    u_3day_exp_cnt,
+    u_3day_click_cnt,
+    u_3day_share_cnt,
+    u_3day_return_cnt,
+    u_3day_ctr,
+    u_3day_str,
+    u_3day_rov,
+    u_3day_ros,
+
+
+    total_time,
+
+    play_count_total,
+    i_1day_exp_cnt,
+    i_1day_click_cnt,
+    i_1day_share_cnt,
+    i_1day_return_cnt,
+    i_1day_ctr,
+    i_1day_str,
+    i_1day_rov,
+    i_1day_ros,
+
+    i_3day_exp_cnt,
+    i_3day_click_cnt,
+    i_3day_share_cnt,
+    i_3day_return_cnt,
+    i_3day_ctr,
+    i_3day_str,
+    i_3day_rov,
+    i_3day_ros,
+
+    ctx_week,
+    ctx_hour,
+    ctx_region,
+    ctx_city,
+
+    ;
+
+
+    private final byte[] idBytes;
+    private final byte[] nameBytes;
+
+    OfflineVlogFeatureGroupV1() {
+        this.idBytes = String.valueOf(ordinal()).getBytes();
+        this.nameBytes = name().toLowerCase().getBytes();
+    }
+
+    public final int getId() {
+        return ordinal();
+    }
+
+    public final String getGroupName() {
+        return name().toLowerCase();
+    }
+
+    public final byte[] getGroupNameBytes() {
+        return getGroupName().getBytes();
+    }
+
+    public final byte[] getIdBytes() {
+        return idBytes;
+    }
+
+}

+ 125 - 0
src/main/java/examples/dataloader/OfflineVlogFeatureGroupV2.java

@@ -0,0 +1,125 @@
+package examples.dataloader;
+
+public enum OfflineVlogFeatureGroupV2 {
+
+    machineinfo_brand,
+    machineinfo_model,
+    machineinfo_platform,
+    machineinfo_system,
+    u_1day_exp_cnt,
+    u_1day_click_cnt,
+    u_1day_share_cnt,
+    u_1day_return_cnt,
+    u_1day_ctr,
+    u_1day_str,
+    u_1day_rov,
+    u_1day_ros,
+
+    u_3day_exp_cnt,
+    u_3day_click_cnt,
+    u_3day_share_cnt,
+    u_3day_return_cnt,
+    u_3day_ctr,
+    u_3day_str,
+    u_3day_rov,
+    u_3day_ros,
+
+    total_time,
+
+    play_count_total,
+    i_1day_exp_cnt,
+    i_1day_click_cnt,
+    i_1day_share_cnt,
+    i_1day_return_cnt,
+    i_1day_ctr,
+    i_1day_str,
+    i_1day_rov,
+    i_1day_ros,
+
+    i_3day_exp_cnt,
+    i_3day_click_cnt,
+    i_3day_share_cnt,
+    i_3day_return_cnt,
+    i_3day_ctr,
+    i_3day_str,
+    i_3day_rov,
+    i_3day_ros,
+
+    ctx_week,
+    ctx_hour,
+    ctx_region,
+    ctx_city,
+    view_pv_list_1day,
+    view_uv_list_1day,
+    play_pv_list_1day,
+    play_uv_list_1day,
+    share_pv_list_1day,
+    share_uv_list_1day,
+    return_uv_list_1day,
+    p_view_uv_list_1day,
+    p_view_pv_list_1day,
+    p_return_uv_list_1day,
+    share_uv_list_2day,
+    share_pv_list_2day,
+    share_uv_list_3day,
+    share_pv_list_3day,
+    view_uv_list_1h,
+    view_pv_list_1h,
+    play_uv_list_1h,
+    play_pv_list_1h,
+    share_uv_list_1h,
+    share_pv_list_1h,
+    return_uv_list_1h,
+    p_return_uv_list_1h,
+    i_1day_ctr_rt,
+    i_1day_str_rt,
+    i_1day_ros_rt,
+    i_1day_rov_rt,
+    i_1h_ctr_rt,
+    i_1h_str_rt,
+    i_1h_ros_rt,
+    i_1h_rov_rt,
+    u_7day_exp_cnt,
+    u_7day_click_cnt,
+    u_7day_share_cnt,
+    u_7day_return_cnt,
+    i_7day_exp_cnt,
+    i_7day_click_cnt,
+    i_7day_share_cnt,
+    i_7day_return_cnt,
+    u_7day_ctr,
+    u_7day_str,
+    u_7day_rov,
+    u_7day_ros,
+    i_7day_ctr,
+    i_7day_str,
+    i_7day_rov,
+    i_7day_ros
+    ;
+
+
+    private final byte[] idBytes;
+    private final byte[] nameBytes;
+
+    OfflineVlogFeatureGroupV2() {
+        this.idBytes = String.valueOf(ordinal()).getBytes();
+        this.nameBytes = name().toLowerCase().getBytes();
+    }
+
+    public final int getId() {
+        return ordinal();
+    }
+
+    public final String getGroupName() {
+        return name().toLowerCase();
+    }
+
+    public final byte[] getGroupNameBytes() {
+        return getGroupName().getBytes();
+    }
+
+    public final byte[] getIdBytes() {
+        return idBytes;
+    }
+
+}

+ 68 - 0
src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractor.java

@@ -0,0 +1,68 @@
+package examples.dataloader;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.tzld.piaoquan.recommend.feature.domain.video.base.RequestContextBytesFeature;
+import com.tzld.piaoquan.recommend.feature.domain.video.base.UserBytesFeature;
+import com.tzld.piaoquan.recommend.feature.domain.video.base.VideoBytesFeature;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesGroup;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesUtils;
+import com.tzld.piaoquan.recommend.feature.model.sample.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class OfflineVlogShareLRFeatureExtractor {
+
+    public ListMultimap<FeatureGroup, BaseFeature> featureMap = ArrayListMultimap.create();
+
+    final private BytesUtils utils;
+    final private int groupCount = OfflineVlogFeatureGroup.values().length;
+    public OfflineVlogShareLRFeatureExtractor() {
+        BytesGroup[] groups = new BytesGroup[OfflineVlogFeatureGroup.values().length];
+        // iterate the enum values directly; each group's id is its ordinal
+        for (OfflineVlogFeatureGroup g : OfflineVlogFeatureGroup.values()) {
+            groups[g.ordinal()] = new BytesGroup(g.ordinal(), g.getGroupName(), g.getGroupNameBytes());
+        }
+        this.utils = new BytesUtils(groups);
+    }
+    public void makeFeature(Map<String, Object> maps){
+        for (Map.Entry<String, Object> entry : maps.entrySet()){
+            OfflineVlogFeatureGroup ovf = OfflineVlogFeatureGroup.valueOf(entry.getKey());
+            Object value = entry.getValue();
+            if (value instanceof String){
+                this.makeFea(ovf, ((String)value).getBytes());
+            }else if (value instanceof Double){
+                this.makeFea(ovf, String.valueOf((Double)value).getBytes());
+            }else if (value instanceof Integer){
+                //todo
+            }else{
+                // fall back to stringifying unknown value types instead of an unsafe String cast
+                this.makeFea(ovf, String.valueOf(value).getBytes());
+            }
+        }
+    }
+    public void makeFeature4String(Map<String, String> maps){
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            OfflineVlogFeatureGroup ovf = OfflineVlogFeatureGroup.valueOf(entry.getKey());
+            String value = entry.getValue();
+            this.makeFea(ovf, value.getBytes());
+        }
+    }
+
+    private FeatureGroup makeGroup(OfflineVlogFeatureGroup group) {
+        FeatureGroup.Builder g = FeatureGroup.newBuilder();
+        g.setType("1");
+        g.setName(group.getGroupName());
+        g.setId(group.ordinal());
+        return g.build();
+    }
+    void makeFea(OfflineVlogFeatureGroup group, byte[] value) {
+        FeatureGroup featureGroup = this.makeGroup(group);
+        BaseFeature feature = this.utils.makeFea(group.ordinal(), value);
+        this.featureMap.put(featureGroup, feature);
+    }
+
+}
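
A minimal usage sketch for the extractor (editor's illustration, not part of the commit; the demo class name is hypothetical, and it assumes the com.tzld.piaoquan recommend-feature classes imported above are on the classpath). Map keys must match OfflineVlogFeatureGroup constant names exactly, or valueOf throws IllegalArgumentException:

    import examples.dataloader.OfflineVlogShareLRFeatureExtractor;

    import java.util.HashMap;
    import java.util.Map;

    public class OfflineVlogShareLRFeatureExtractorDemo {
        public static void main(String[] args) {
            Map<String, String> raw = new HashMap<>();
            raw.put("machineinfo_brand", "xiaomi");  // string feature (sample value)
            raw.put("u_1day_share_cnt", "7");        // pre-bucketed count, passed as a string
            raw.put("ctx_hour", "20");

            OfflineVlogShareLRFeatureExtractor extractor = new OfflineVlogShareLRFeatureExtractor();
            extractor.makeFeature4String(raw);

            // one BaseFeature per input entry, keyed by its FeatureGroup
            System.out.println(extractor.featureMap.size()); // 3
        }
    }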

+ 66 - 0
src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV1.java

@@ -0,0 +1,66 @@
+package examples.dataloader;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesGroup;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesUtils;
+import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
+import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
+
+import java.util.Map;
+
+public class OfflineVlogShareLRFeatureExtractorV1 {
+
+    public ListMultimap<FeatureGroup, BaseFeature> featureMap = ArrayListMultimap.create();
+
+    final private BytesUtils utils;
+    final private int groupCount = OfflineVlogFeatureGroupV1.values().length;
+    public OfflineVlogShareLRFeatureExtractorV1() {
+        BytesGroup[] groups = new BytesGroup[OfflineVlogFeatureGroupV1.values().length];
+        // iterate the enum values directly; each group's id is its ordinal
+        for (OfflineVlogFeatureGroupV1 g : OfflineVlogFeatureGroupV1.values()) {
+            groups[g.ordinal()] = new BytesGroup(g.ordinal(), g.getGroupName(), g.getGroupNameBytes());
+        }
+        this.utils = new BytesUtils(groups);
+    }
+    public void makeFeature(Map<String, Object> maps){
+        for (Map.Entry<String, Object> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV1 ovf = OfflineVlogFeatureGroupV1.valueOf(entry.getKey());
+            Object value = entry.getValue();
+            if (value instanceof String){
+                this.makeFea(ovf, ((String)value).getBytes());
+            }else if (value instanceof Double){
+                this.makeFea(ovf, String.valueOf((Double)value).getBytes());
+            }else if (value instanceof Integer){
+                //todo
+            }else{
+                // fall back to stringifying unknown value types instead of an unsafe String cast
+                this.makeFea(ovf, String.valueOf(value).getBytes());
+            }
+        }
+    }
+    public void makeFeature4String(Map<String, String> maps){
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV1 ovf = OfflineVlogFeatureGroupV1.valueOf(entry.getKey());
+            String value = entry.getValue();
+            this.makeFea(ovf, value.getBytes());
+        }
+    }
+
+    private FeatureGroup makeGroup(OfflineVlogFeatureGroupV1 group) {
+        FeatureGroup.Builder g = FeatureGroup.newBuilder();
+        g.setType("1");
+        g.setName(group.getGroupName());
+        g.setId(group.ordinal());
+        return g.build();
+    }
+    void makeFea(OfflineVlogFeatureGroupV1 group, byte[] value) {
+        FeatureGroup featureGroup = this.makeGroup(group);
+        BaseFeature feature = this.utils.makeFea(group.ordinal(), value);
+        this.featureMap.put(featureGroup, feature);
+    }
+    
+}

+ 66 - 0
src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV2.java

@@ -0,0 +1,66 @@
+package examples.dataloader;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesGroup;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesUtils;
+import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
+import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
+
+import java.util.Map;
+
+public class OfflineVlogShareLRFeatureExtractorV2 {
+
+    public ListMultimap<FeatureGroup, BaseFeature> featureMap = ArrayListMultimap.create();
+
+    final private BytesUtils utils;
+    final private int groupCount = OfflineVlogFeatureGroupV2.values().length;
+    public OfflineVlogShareLRFeatureExtractorV2() {
+        BytesGroup[] groups = new BytesGroup[OfflineVlogFeatureGroupV2.values().length];
+        // iterate the enum values directly; each group's id is its ordinal
+        for (OfflineVlogFeatureGroupV2 g : OfflineVlogFeatureGroupV2.values()) {
+            groups[g.ordinal()] = new BytesGroup(g.ordinal(), g.getGroupName(), g.getGroupNameBytes());
+        }
+        this.utils = new BytesUtils(groups);
+    }
+    public void makeFeature(Map<String, Object> maps){
+        for (Map.Entry<String, Object> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV2 ovf = OfflineVlogFeatureGroupV2.valueOf(entry.getKey());
+            Object value = entry.getValue();
+            if (value instanceof String){
+                this.makeFea(ovf, ((String)value).getBytes());
+            }else if (value instanceof Double){
+                this.makeFea(ovf, String.valueOf((Double)value).getBytes());
+            }else if (value instanceof Integer){
+                //todo
+            }else{
+                // fall back to stringifying unknown value types instead of an unsafe String cast
+                this.makeFea(ovf, String.valueOf(value).getBytes());
+            }
+        }
+    }
+    public void makeFeature4String(Map<String, String> maps){
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV2 ovf = OfflineVlogFeatureGroupV2.valueOf(entry.getKey());
+            String value = entry.getValue();
+            this.makeFea(ovf, value.getBytes());
+        }
+    }
+
+    private FeatureGroup makeGroup(OfflineVlogFeatureGroupV2 group) {
+        FeatureGroup.Builder g = FeatureGroup.newBuilder();
+        g.setType("1");
+        g.setName(group.getGroupName());
+        g.setId(group.ordinal());
+        return g.build();
+    }
+    void makeFea(OfflineVlogFeatureGroupV2 group, byte[] value) {
+        FeatureGroup featureGroup = this.makeGroup(group);
+        BaseFeature feature = this.utils.makeFea(group.ordinal(), value);
+        this.featureMap.put(featureGroup, feature);
+    }
+    
+}

+ 120 - 0
src/main/java/examples/dataloader/RequestContextOffline.java

@@ -0,0 +1,120 @@
+package examples.dataloader;
+
+import com.aliyun.odps.data.Record;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class RequestContextOffline {
+    public Map<String, Object> featureMap = new HashMap<>();
+
+    public void putUserFeature(Record record){
+        setKVinMap(record, "machineinfo_brand", "string", "");
+        setKVinMap(record, "machineinfo_model", "string", "");
+        setKVinMap(record, "machineinfo_platform", "string","");
+        setKVinMap(record, "machineinfo_system", "string","");
+
+        setKVinMap(record, "u_1day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "u_1day_click_cnt", "double", "cnt");
+        setKVinMap(record, "u_1day_share_cnt", "double", "cnt");
+        setKVinMap(record, "u_1day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "u_ctr_1day", "double", "rate");
+        setKVinMap(record, "u_str_1day", "double", "rate");
+        setKVinMap(record, "u_rov_1day", "double", "rate");
+        setKVinMap(record, "u_ros_1day", "double", "rate");
+
+        setKVinMap(record, "u_3day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "u_3day_click_cnt", "double", "cnt");
+        setKVinMap(record, "u_3day_share_cnt", "double", "cnt");
+        setKVinMap(record, "u_3day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "u_ctr_3day", "double", "rate");
+        setKVinMap(record, "u_str_3day", "double", "rate");
+        setKVinMap(record, "u_rov_3day", "double", "rate");
+        setKVinMap(record, "u_ros_3day", "double", "rate");
+    }
+    public void putItemFeature(Record record){
+        // setKVinMap(record, "i_title_len", "double", "cnt");
+        setKVinMap(record, "total_time", "double", "cnt");
+        // setKVinMap(record, "i_days_since_upload", "double", "cnt");
+        setKVinMap(record, "play_count_total", "double", "cnt");
+
+        setKVinMap(record, "i_1day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "i_1day_click_cnt", "double", "cnt");
+        setKVinMap(record, "i_1day_share_cnt", "double", "cnt");
+        setKVinMap(record, "i_1day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "i_ctr_1day", "double", "rate");
+        setKVinMap(record, "i_str_1day", "double", "rate");
+        setKVinMap(record, "i_rov_1day", "double", "rate");
+        setKVinMap(record, "i_ros_1day", "double", "rate");
+
+        setKVinMap(record, "i_3day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "i_3day_click_cnt", "double", "cnt");
+        setKVinMap(record, "i_3day_share_cnt", "double", "cnt");
+        setKVinMap(record, "i_3day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "i_ctr_3day", "double", "rate");
+        setKVinMap(record, "i_str_3day", "double", "rate");
+        setKVinMap(record, "i_rov_3day", "double", "rate");
+        setKVinMap(record, "i_ros_3day", "double", "rate");
+    }
+
+    public void putSceneFeature(Record record){
+        setKVinMap(record, "ctx_week", "string", "");
+        setKVinMap(record, "ctx_hour", "string", "");
+        setKVinMap(record, "ctx_region", "string","");
+        setKVinMap(record, "ctx_city", "string","");
+    }
+    public void setKVinMap(Record record, String key, String instance, String cntOrRate){
+        if (!Arrays.stream(record.getColumns()).map(r-> r.getName()).collect(Collectors.toSet()).contains(key)){
+            return;
+        }
+        String value;
+        try{
+            value = record.getString(key);
+        }catch (Exception e){
+            value = String.valueOf(record.getBigint(key));
+        }
+
+        if (value == null){
+            return;
+        }
+        String ins = instance.toLowerCase();
+        switch (ins){
+            case "string":
+                featureMap.put(key, value);
+                return;
+            case "double":
+                if ("cnt".equals(cntOrRate)){
+                    featureMap.put(key, String.valueOf(this.bucketRatioFeature(Double.valueOf(value))));
+                }else if ("rate".equals(cntOrRate)){
+                    featureMap.put(key, String.valueOf(this.ceilLog(Double.valueOf(value))));
+                }
+                return;
+            case "int":
+                return;
+            case "long":
+                return;
+            default:
+                return;
+        }
+    }
+
+
+    public double ceilLog(Double key) {
+        return Math.ceil(Math.log(key + 1.0));
+    }
+
+    public double bucketRatioFeature(Double key) {
+        long bucket = Math.round(Math.log((key + 1.0) * 50.0));
+        if (bucket > 50L) {
+            bucket = 50L;
+        }
+
+        return (double)bucket;
+    }
+}
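
A small sketch of the two bucketing transforms used by setKVinMap (editor's illustration, not part of the commit; the demo class name is hypothetical): "cnt" columns go through bucketRatioFeature and "rate" columns through ceilLog, and both land in featureMap as strings:

    import examples.dataloader.RequestContextOffline;

    public class RequestContextOfflineDemo {
        public static void main(String[] args) {
            RequestContextOffline ctx = new RequestContextOffline();
            // "cnt" columns: log-scaled bucket, capped at 50
            System.out.println(ctx.bucketRatioFeature(100.0)); // round(ln(101 * 50)) = 9.0
            // "rate" columns: ceiling of ln(x + 1)
            System.out.println(ctx.ceilLog(100.0));            // ceil(ln(101)) = 5.0
        }
    }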

+ 45 - 0
src/main/java/examples/dataloader/redisBuilderMyself.java

@@ -0,0 +1,45 @@
+package examples.dataloader;
+
+import org.springframework.data.redis.connection.RedisConnectionFactory;
+import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+import redis.clients.jedis.JedisPoolConfig;
+import org.springframework.data.redis.connection.jedis.JedisClientConfiguration;
+import org.springframework.data.redis.core.RedisTemplate;
+
+
+public class redisBuilderMyself {
+
+
+    public static JedisConnectionFactory redisConnectionFactory() {
+
+        RedisStandaloneConfiguration config = new RedisStandaloneConfiguration(
+                "r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com", 6379);
+        config.setPassword("Wqsd@2019");
+
+        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
+        // Maximum number of connections; size for the workload, but do not exceed the instance spec's connection limit.
+        jedisPoolConfig.setMaxTotal(30);
+        // Maximum number of idle connections; size for the workload, but do not exceed the instance spec's connection limit.
+        jedisPoolConfig.setMaxIdle(20);
+        // Disable testOnBorrow/testOnReturn to avoid the extra PING commands they generate.
+        jedisPoolConfig.setTestOnBorrow(false);
+        jedisPoolConfig.setTestOnReturn(false);
+
+        JedisClientConfiguration jedisClientConfiguration = JedisClientConfiguration.builder().usePooling().poolConfig(
+                jedisPoolConfig).build();
+
+        return new JedisConnectionFactory(config, jedisClientConfiguration);
+    }
+
+
+
+
+    public static RedisTemplate<String, String> redisTemplate(RedisConnectionFactory connectionFactory) {
+        RedisTemplate<String, String> template = new RedisTemplate<>();
+        template.setConnectionFactory(connectionFactory);
+        return template;
+    }
+}
+
+

+ 164 - 0
src/main/java/examples/extractor/ExtractorUtils.java

@@ -0,0 +1,164 @@
+package examples.extractor;
+
+import java.util.Map;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.List;
+public class ExtractorUtils {
+
+    public static Double division(String s1, String s2, Map<String, String> maps){
+        double rate = 0.0;
+        if (maps.containsKey(s1) && maps.containsKey(s2)){
+            Double d1 = Double.valueOf(maps.get(s1));
+            if (isDoubleEqualToZero(d1)){
+                return rate;
+            }
+            Double d2 = Double.valueOf(maps.get(s2));
+            rate = d2 / d1;
+        }
+        return rate;
+    }
+    public static Double divisionDouble(Double d1, Double d2){
+        double rate = 0.0;
+        if (isDoubleEqualToZero(d1)){
+            return rate;
+        }
+        rate = d2 / d1;
+        return rate;
+    }
+
+    public static boolean isDoubleEqualToZero(double value) {
+        final double epsilon = 1e-10; // a tiny tolerance for floating-point comparison
+        // treat the value as zero if it falls within the tolerance
+        return Math.abs(value) < epsilon;
+    }
+
+
+
+    public static double calculateVariance(List<Double> numbers) {
+        double average = numbers.stream()
+                .mapToDouble(Double::doubleValue)
+                .average()
+                .orElse(0.0);
+
+        double squaredDiffSum = numbers.stream()
+                .mapToDouble(Double::doubleValue)
+                .map(x -> Math.pow(x - average, 2))
+                .average()
+                .orElse(0.0);
+
+        return squaredDiffSum;
+    }
+
+    public static double calculateAverage(List<Double> numbers) {
+        if (numbers == null || numbers.isEmpty()) {
+            return 0.0;
+        }
+        return numbers.stream()
+                .mapToDouble(Number::doubleValue)
+                .average()
+                .orElse(0.0);
+    }
+
+    public static List<Double> calculateDifferences(List<Double> numbers) {
+        List<Double> differences = new ArrayList<>();
+
+        for (int i = 0; i < numbers.size() - 1; i++) {
+            Double diff = 0.0;
+            if (!isDoubleEqualToZero(numbers.get(i))){
+                diff = (numbers.get(i + 1) - numbers.get(i)) / numbers.get(i);
+            }
+            differences.add(diff);
+        }
+
+        return differences;
+    }
+
+    public static List<String> generateHourStrings(String timeString, int N) {
+        LocalDateTime dateTime = LocalDateTime.parse(timeString, DateTimeFormatter.ofPattern("yyyyMMddHH"));
+        List<String> hourStrings = new ArrayList<>();
+        for (int i = 0; i < N; i++) {
+            hourStrings.add(dateTime.minusHours(i).format(DateTimeFormatter.ofPattern("yyyyMMddHH")));
+        }
+
+        return hourStrings;
+    }
+
+    public static String subtractHours(String inputDateTime, int hoursToSubtract) {
+        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHH");
+        LocalDateTime dateTime = LocalDateTime.parse(inputDateTime, formatter);
+        LocalDateTime subtractedDateTime = dateTime.minusHours(hoursToSubtract);
+        return subtractedDateTime.format(formatter);
+    }
+
+    // Bucketize rate-like values in the [0, 1] range.
+    public static Integer ceilLogRate(Double key) {
+        double bucket = Math.ceil(
+                Math.pow(key, 0.2) * 100
+        );
+        if (bucket > 300) {
+            bucket = 300;
+        }
+        if (bucket < 0) {
+            bucket = 0;
+        }
+        return (int)bucket;
+    }
+
+    // Bucketize count-like values greater than 1.
+    public static int bucketCnt(Double key) {
+        long bucket = Math.round(Math.log((key * 10 + 1.0)) * 10);
+        if (bucket > 300) {
+            bucket = 300;
+        }
+        if (bucket < 0) {
+            bucket = 0;
+        }
+        return (int)bucket;
+    }
+
+    public static int findInsertPosition(double[] sortedArray, double target) {
+        int low = 0;
+        int high = sortedArray.length - 1;
+
+        while (low <= high) {
+            int mid = low + (high - low) / 2;
+            double midValue = sortedArray[mid];
+
+            if (midValue < target) {
+                low = mid + 1;
+            } else if (midValue > target) {
+                high = mid - 1;
+            } else {
+                // found an equal value; scan right past duplicates to find the insertion point
+                while (mid < sortedArray.length - 1 && sortedArray[mid + 1] == target) {
+                    mid++;
+                }
+                return mid + 1; // insert just after the last duplicate
+            }
+        }
+
+        return low; // low is the insertion point
+    }
+
+    public static void main(String[] args) {
+        double[] sortedArray = {1.0, 2.0, 4.0, 4.0, 6.0};
+        double target = 0.0;
+        System.out.println(findInsertPosition(sortedArray, target));
+
+
+//        System.out.println(ceilLogRate(0.0002));
+//        System.out.println(ceilLogRate(0.01));
+//        System.out.println(ceilLogRate(0.2));
+//        System.out.println(ceilLogRate(4.));
+//        System.out.println(bucketCnt(1.));
+//        System.out.println(bucketCnt(20.));
+//        System.out.println(bucketCnt(500.));
+//        System.out.println(bucketCnt(50000.));
+
+//        System.out.println(generateHourStrings("2024011603", 5));
+
+    }
+
+}
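
A worked example for the time-window helpers (editor's illustration, not part of the commit; the demo class name is hypothetical). generateHourStrings walks backwards hour by hour from a yyyyMMddHH string, and calculateDifferences turns a series into hour-over-hour relative changes:

    import examples.extractor.ExtractorUtils;

    import java.util.Arrays;
    import java.util.List;

    public class ExtractorUtilsDemo {
        public static void main(String[] args) {
            // [2024011603, 2024011602, 2024011601]
            System.out.println(ExtractorUtils.generateHourStrings("2024011603", 3));

            List<Double> series = Arrays.asList(2.0, 4.0, 8.0);
            // relative hour-over-hour changes: [(4-2)/2, (8-4)/4] = [1.0, 1.0]
            System.out.println(ExtractorUtils.calculateDifferences(series));
            System.out.println(ExtractorUtils.calculateAverage(series));  // 4.666...
            System.out.println(ExtractorUtils.calculateVariance(series)); // population variance, about 6.222
        }
    }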

+ 33 - 0
src/main/java/examples/extractor/RankExtractorFeature_20240530.java

@@ -0,0 +1,33 @@
+package examples.extractor;
+
+public class RankExtractorFeature_20240530 {
+
+    public static Double calDiv(double a, double b){
+        if (a == 0 || b == 0){
+            return 0D;
+        }
+        return a / b;
+    }
+    public static Double calLog(double a){
+        if (a <= 0){
+            return 0D;
+        }
+        return Math.log(a + 1.0);
+    }
+
+    public static void main(String[] args) {
+        System.out.println(Math.log(10));
+        System.out.println(Math.log(100));
+        System.out.println(Math.log(1000));
+        System.out.println(Math.log(10000));
+        System.out.println(Math.log(100000));
+
+        System.out.println(Math.log10(10));
+        System.out.println(Math.log10(100));
+        System.out.println(Math.log10(1000));
+        System.out.println(Math.log10(10000));
+        System.out.println(Math.log10(100000));
+    }
+}
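
A short sketch of the guard behaviour (editor's illustration, not part of the commit; the demo class name is hypothetical): calDiv returns 0 whenever either operand is 0, and calLog is ln(a + 1) for positive a and 0 otherwise:

    import examples.extractor.RankExtractorFeature_20240530;

    public class RankExtractorFeatureDemo {
        public static void main(String[] args) {
            System.out.println(RankExtractorFeature_20240530.calDiv(3.0, 4.0)); // 0.75
            System.out.println(RankExtractorFeature_20240530.calDiv(3.0, 0.0)); // 0.0 (zero guard, no Infinity/NaN)
            System.out.println(RankExtractorFeature_20240530.calLog(0.0));      // 0.0
        }
    }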
+
+

+ 324 - 0
src/main/java/examples/extractor/RankExtractorItemFeature.java

@@ -0,0 +1,324 @@
+package examples.extractor;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class RankExtractorItemFeature {
+    public static Map<String, String> getItemRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_str",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_rov",d);
+        }
+        d = ExtractorUtils.division("i_1day_share_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_str",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_rov",d);
+        }
+        d = ExtractorUtils.division("i_3day_share_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_str",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_rov",d);
+        }
+        d = ExtractorUtils.division("i_7day_share_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_str",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_rov",d);
+        }
+        d = ExtractorUtils.division("i_3month_share_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ros",d);
+        }
+
+
+        return rateFeatureChange(result);
+    }
+
+    public static Map<String, String> getItemRealtimeTrend(Map<String, Map<String, Double>> maps, String date, String hour){
+        Map<String, Double> result1 = new HashMap<>();
+        Map<String, Double> result2 = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result1);
+        }
+        int N = 6;
+
+        List<String> hourStrs = ExtractorUtils.generateHourStrings(date + hour, N);
+
+        String key;
+
+        key = "share_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "share_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+        Map<String, String> r1 = cntFeatureChange4Double(result1);
+        Map<String, String> r2 = rateFeatureChange(result2);
+        r1.putAll(r2);
+
+        return r1;
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+    public static Map<String, String> cntFeatureChange4Double(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps,
+                                                       Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> getItemRealtimeCnt(Map<String, Map<String, Double>> maps,
+                                                         Set<String> names,
+                                                         String date, String hour){
+        Map<String, String> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return result;
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+        for (Map.Entry<String, Map<String, Double>> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            Double num = entry.getValue().getOrDefault(dateHour, 0.0);
+            if (!ExtractorUtils.isDoubleEqualToZero(num)){
+                result.put(entry.getKey(), String.valueOf(ExtractorUtils.bucketCnt(num)));
+            }
+        }
+        return result;
+    }
+
+    public static Map<String, String> getItemRealtimeRate(Map<String, Map<String, Double>> maps,
+                                                         String date, String hour){
+        Map<String, Double> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result);
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+
+        double d, d1, d2;
+        String k1, k2;
+
+        k1 = "view_pv_list_1day";
+        k2 = "play_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "share_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_rov_rt", d);
+            }
+        }
+
+        //---
+        k1 = "view_pv_list_1h";
+        k2 = "play_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "share_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_rov_rt", d);
+            }
+        }
+
+
+        return rateFeatureChange(result);
+    }
+
+    public static void main(String[] args) {
+        String s1 = "share_uv_list_1day";
+        String s2 = "2024011300:2,2024011301:2,2024011304:2,2024011309:3,2024011311:3,2024011314:4,2024011315:4,2024011321:1,2024011323:1,2024011400:1,2024011401:1,2024011404:1,2024011406:1,2024011407:1,2024011408:1,2024011410:1,2024011423:1,2024011302:2,2024011305:2,2024011312:4,2024011313:4,2024011317:4,2024011318:4,2024011319:3,2024011320:1,2024011403:1,2024011409:1,2024011411:1,2024011419:1,2024011420:1,2024011422:1,2024011303:2,2024011306:2,2024011307:2,2024011308:2,2024011310:3,2024011316:4,2024011322:1,2024011402:1,2024011405:1,2024011421:1";
+        Map<String, Double> m1 = new HashMap<>();
+        Map<String, Map<String, Double>> maps = new HashMap<>();
+        for (String s : s2.split(",")){
+            String s3 = s.split(":")[0];
+            String s4 = s.split(":")[1];
+            m1.put(s3, Double.valueOf(s4));
+        }
+        maps.put(s1, m1);
+
+        String date = "20240114";
+        String hour = "20";
+        System.out.println(getItemRealtimeTrend(maps, date, hour));
+    }
+}
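
A minimal usage sketch for getItemRateFeature (editor's illustration, not part of the commit; the demo class name is hypothetical): the rate features are derived from the raw count columns and then bucketed through ExtractorUtils.ceilLogRate before being returned as strings:

    import examples.extractor.RankExtractorItemFeature;

    import java.util.HashMap;
    import java.util.Map;

    public class RankExtractorItemFeatureDemo {
        public static void main(String[] args) {
            Map<String, String> counts = new HashMap<>();
            counts.put("i_1day_exp_cnt", "1000");
            counts.put("i_1day_click_cnt", "120");
            counts.put("i_1day_share_cnt", "30");
            counts.put("i_1day_return_cnt", "15");

            // ctr = 120/1000, str = 30/1000, rov = 15/1000, ros = 15/30,
            // each bucketed via ceilLogRate and emitted as a string
            System.out.println(RankExtractorItemFeature.getItemRateFeature(counts));
        }
    }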

+ 338 - 0
src/main/java/examples/extractor/RankExtractorItemFeatureV2.java

@@ -0,0 +1,338 @@
+package examples.extractor;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class RankExtractorItemFeatureV2 {
+    public static Map<String, String> getItemRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_str",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_rov",d);
+        }
+        d = ExtractorUtils.division("i_1day_share_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_str",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_rov",d);
+        }
+        d = ExtractorUtils.division("i_3day_share_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_str",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_rov",d);
+        }
+        d = ExtractorUtils.division("i_7day_share_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_str",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_rov",d);
+        }
+        d = ExtractorUtils.division("i_3month_share_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ros",d);
+        }
+
+
+        Map<String, String> result2 = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result.entrySet()){
+            result2.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+        return result2;
+    }
+
+    public static Map<String, String> getItemRealtimeTrend(Map<String, Map<String, Double>> maps, String date, String hour){
+        Map<String, Double> result1 = new HashMap<>();
+        Map<String, Double> result2 = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result1);
+        }
+        int N = 6;
+
+        List<String> hourStrs = ExtractorUtils.generateHourStrings(date + hour, N);
+
+        String key;
+
+        key = "share_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "share_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result1.entrySet()){
+            result.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+        for (Map.Entry<String, Double> entry : result2.entrySet()){
+            result.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+
+        return result;
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+    public static Map<String, String> cntFeatureChange4Double(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.bucketCnt(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps,
+                                                       Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
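+    // Picks the requested features' counter for the current date+hour, skipping zero values.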
+    public static Map<String, String> getItemRealtimeCnt(Map<String, Map<String, Double>> maps,
+                                                         Set<String> names,
+                                                         String date, String hour){
+        Map<String, String> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return result;
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+        for (Map.Entry<String, Map<String, Double>> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            Double num = entry.getValue().getOrDefault(dateHour, 0.0);
+            if (!ExtractorUtils.isDoubleEqualToZero(num)){
+                result.put(entry.getKey(), String.valueOf(num));
+            }
+        }
+        return result;
+    }
+
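+    // Builds ctr/str/ros/rov ratios for the current hour from the 1-day and 1-hour counters, keeping non-zero values only.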
+    public static Map<String, String> getItemRealtimeRate(Map<String, Map<String, Double>> maps,
+                                                         String date, String hour){
+        Map<String, Double> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result);
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+
+        double d, d1, d2;
+        String k1, k2;
+
+        k1 = "view_pv_list_1day";
+        k2 = "play_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "share_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_rov_rt", d);
+            }
+        }
+
+        //---
+        k1 = "view_pv_list_1h";
+        k2 = "play_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "share_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_rov_rt", d);
+            }
+        }
+        Map<String, String> result2 = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result.entrySet()){
+            result2.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+
+        return result2;
+    }
+
+    public static void main(String[] args) {
+        String s1 = "share_uv_list_1day";
+        String s2 = "2024011300:2,2024011301:2,2024011304:2,2024011309:3,2024011311:3,2024011314:4,2024011315:4,2024011321:1,2024011323:1,2024011400:1,2024011401:1,2024011404:1,2024011406:1,2024011407:1,2024011408:1,2024011410:1,2024011423:1,2024011302:2,2024011305:2,2024011312:4,2024011313:4,2024011317:4,2024011318:4,2024011319:3,2024011320:1,2024011403:1,2024011409:1,2024011411:1,2024011419:1,2024011420:1,2024011422:1,2024011303:2,2024011306:2,2024011307:2,2024011308:2,2024011310:3,2024011316:4,2024011322:1,2024011402:1,2024011405:1,2024011421:1";
+        Map<String, Double> m1 = new HashMap<>();
+        Map<String, Map<String, Double>> maps = new HashMap<>();
+        for (String s : s2.split(",")){
+            String s3 = s.split(":")[0];
+            String s4 = s.split(":")[1];
+            m1.put(s3, Double.valueOf(s4));
+        }
+        maps.put(s1, m1);
+
+        String date = "20240114";
+        String hour = "20";
+        System.out.println(getItemRealtimeTrend(maps, date, hour));
+    }
+}
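
For reference, getItemRealtimeTrend above relies on three small statistical helpers on ExtractorUtils (calculateAverage, calculateVariance, calculateDifferences). The sketch below is not the committed ExtractorUtils code; it only illustrates the behaviour this class assumes of those helpers (mean, population variance, and first-order differences of an hourly series), and the class name ExtractorUtilsSketch is hypothetical.

import java.util.ArrayList;
import java.util.List;

// Minimal sketch of the assumed helper behaviour; not the committed ExtractorUtils implementation.
class ExtractorUtilsSketch {

    // Arithmetic mean of the hourly values; 0.0 for an empty series.
    static double calculateAverage(List<Double> xs) {
        if (xs == null || xs.isEmpty()) return 0.0;
        double sum = 0.0;
        for (double x : xs) sum += x;
        return sum / xs.size();
    }

    // Population variance around the mean; 0.0 for an empty series.
    static double calculateVariance(List<Double> xs) {
        if (xs == null || xs.isEmpty()) return 0.0;
        double mean = calculateAverage(xs);
        double acc = 0.0;
        for (double x : xs) acc += (x - mean) * (x - mean);
        return acc / xs.size();
    }

    // First-order differences [x1-x0, x2-x1, ...], i.e. the hour-over-hour trend.
    static List<Double> calculateDifferences(List<Double> xs) {
        List<Double> diffs = new ArrayList<>();
        for (int i = 1; i < xs.size(); i++) {
            diffs.add(xs.get(i) - xs.get(i - 1));
        }
        return diffs;
    }
}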

+ 104 - 0
src/main/java/examples/extractor/RankExtractorUserFeature.java

@@ -0,0 +1,104 @@
+package examples.extractor;
+
+
+import java.util.*;
+
+public class RankExtractorUserFeature {
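+    // Computes user-level ctr/str/rov/ros ratios over 1day/3day/7day/3month windows and log-buckets them via rateFeatureChange.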
+    public static Map<String, String> getUserRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_str",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_rov",d);
+        }
+        d = ExtractorUtils.division("u_1day_share_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_str",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_rov",d);
+        }
+        d = ExtractorUtils.division("u_3day_share_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_str",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_rov",d);
+        }
+        d = ExtractorUtils.division("u_7day_share_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_str",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_rov",d);
+        }
+        d = ExtractorUtils.division("u_3month_share_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ros",d);
+        }
+
+        return rateFeatureChange(result);
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps, Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+}
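
A quick, hypothetical usage sketch for getUserRateFeature: the keys below are the counters the method reads, while the numeric values are made up. It assumes the caller sits in (or imports) the examples.extractor package.

import java.util.HashMap;
import java.util.Map;

// Hypothetical driver; illustrates the expected call pattern only.
public class UserRateFeatureDemo {
    public static void main(String[] args) {
        Map<String, String> counts = new HashMap<>();
        counts.put("u_1day_exp_cnt", "1200");    // sample values, not real data
        counts.put("u_1day_click_cnt", "300");
        counts.put("u_1day_share_cnt", "40");
        counts.put("u_1day_return_cnt", "15");

        // Returns log-bucketed ratio features such as u_1day_ctr / u_1day_str / u_1day_rov / u_1day_ros.
        Map<String, String> features = RankExtractorUserFeature.getUserRateFeature(counts);
        System.out.println(features);
    }
}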

+ 110 - 0
src/main/java/examples/extractor/RankExtractorUserFeatureV2.java

@@ -0,0 +1,110 @@
+package examples.extractor;
+
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+public class RankExtractorUserFeatureV2 {
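+    // Same ratios as RankExtractorUserFeature, but the raw double values are kept as strings instead of being log-bucketed.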
+    public static Map<String, String> getUserRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_str",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_rov",d);
+        }
+        d = ExtractorUtils.division("u_1day_share_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_str",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_rov",d);
+        }
+        d = ExtractorUtils.division("u_3day_share_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_str",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_rov",d);
+        }
+        d = ExtractorUtils.division("u_7day_share_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_str",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_rov",d);
+        }
+        d = ExtractorUtils.division("u_3month_share_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ros",d);
+        }
+        Map<String, String> result2 = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result.entrySet()){
+            result2.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+        return result2;
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps, Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+}

+ 100 - 107
src/main/java/examples/sparksql/SparkAdCTRSampleLoader.java

@@ -1,107 +1,100 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-import com.google.common.collect.ListMultimap;
-
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.*;
-import com.tzld.piaoquan.recommend.feature.domain.ad.feature.VlogAdCtrLRFeatureExtractor;
-import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
-import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
-import com.tzld.piaoquan.recommend.feature.model.sample.GroupedFeature;
-import com.tzld.piaoquan.recommend.feature.model.sample.LRSamples;
-import examples.dataloader.AdSampleConstructor;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-
-public class SparkAdCTRSampleLoader {
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String table = "alg_ad_view_sample";
-        String hdfsPath = "/dw/recommend/model/ad_ctr_samples/" + partition;
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
-        readData.saveAsTextFile(hdfsPath);
-    }
-
-
-    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
-        @Override
-        public String call(Record record, TableSchema schema) throws Exception {
-            String labelName = "adclick_ornot";
-            String ret = singleParse(record, labelName);
-            return ret;
-        }
-    }
-
-
-    // Processing logic for a single log record
-    public static String singleParse(Record record, String labelName) {
-        // Parse the data
-        String label = record.getString(labelName);
-        if (label == null || label.equals("1")) {
-            label = "0";
-        } else {
-            label = "1";
-        }
-
-        // Initialize the feature objects from the SQL record
-        AdRequestContext requestContext = AdSampleConstructor.constructRequestContext(record);
-        UserAdFeature userFeature = AdSampleConstructor.constructUserFeature(record);
-        AdItemFeature itemFeature = AdSampleConstructor.constructItemFeature(record);
-
-        // Convert to bytes features
-        AdRequestContextBytesFeature adRequestContextBytesFeature = new AdRequestContextBytesFeature(requestContext);
-        UserAdBytesFeature userBytesFeature = new UserAdBytesFeature(userFeature);
-        AdItemBytesFeature adItemBytesFeature = new AdItemBytesFeature(itemFeature);
-
-        // Feature extraction
-        VlogAdCtrLRFeatureExtractor bytesFeatureExtractor;
-        bytesFeatureExtractor = new VlogAdCtrLRFeatureExtractor();
-
-        LRSamples lrSamples = bytesFeatureExtractor.single(userBytesFeature, adItemBytesFeature, adRequestContextBytesFeature);
-
-        return parseSamplesToString2(label, lrSamples);
-    }
-
-
-
-    // Build the sample string
-    public static String parseSamplesToString2(String label, LRSamples lrSamples) {
-        ArrayList<String> featureList = new ArrayList<String>();
-        for (int i = 0; i < lrSamples.getFeaturesCount(); i++) {
-            GroupedFeature groupedFeature = lrSamples.getFeatures(i);
-            if (groupedFeature != null && groupedFeature.getFeaturesCount() != 0) {
-                for (int j = 0; j < groupedFeature.getFeaturesCount(); j++) {
-                    BaseFeature baseFeature = groupedFeature.getFeatures(j);
-                    if (baseFeature != null) {
-                        featureList.add(String.valueOf(baseFeature.getIdentifier()) + ":1" );
-                    }
-                }
-            }
-        }
-        return label + "\t" + String.join("\t", featureList);
-    }
-
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.google.common.collect.ListMultimap;
+//import com.tzld.piaoquan.ad.engine.commons.base.*;
+//import com.tzld.piaoquan.ad.engine.commons.score.feature.VlogAdCtrLRFeatureExtractor;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.BaseFeature;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.FeatureGroup;
+//import examples.dataloader.AdSampleConstructor;
+//import examples.dataloader.RecommendSampleConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//
+//import java.util.ArrayList;
+//import java.util.Map;
+//
+//
+//public class SparkAdCTRSampleLoader {
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String table = "alg_ad_view_sample";
+//        String hdfsPath = "/dw/recommend/model/ad_ctr_samples/" + partition;
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+//        readData.saveAsTextFile(hdfsPath);
+//    }
+//
+//
+//    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
+//        @Override
+//        public String call(Record record, TableSchema schema) throws Exception {
+//            String labelName = "adclick_ornot";
+//            String ret = singleParse(record, labelName);
+//            return ret;
+//        }
+//    }
+//
+//
+//    // Processing logic for a single log record
+//    public static String singleParse(Record record, String labelName) {
+//        // Parse the data
+//        String label = record.getString(labelName);
+//        if (label == null || label.equals("0")) {
+//            label = "0";
+//        } else {
+//            label = "1";
+//        }
+//
+//        // Initialize the feature objects from the SQL record
+//        AdRequestContext requestContext = AdSampleConstructor.constructRequestContext(record);
+//        UserAdFeature userFeature = AdSampleConstructor.constructUserFeature(record);
+//        AdItemFeature itemFeature = AdSampleConstructor.constructItemFeature(record);
+//
+//        // Convert to bytes features
+//        AdRequestContextBytesFeature adRequestContextBytesFeature = new AdRequestContextBytesFeature(requestContext);
+//        UserAdBytesFeature userBytesFeature = new UserAdBytesFeature(userFeature);
+//        AdItemBytesFeature adItemBytesFeature = new AdItemBytesFeature(itemFeature);
+//
+//        // Feature extraction
+//        VlogAdCtrLRFeatureExtractor bytesFeatureExtractor;
+//        bytesFeatureExtractor = new VlogAdCtrLRFeatureExtractor();
+//
+//        bytesFeatureExtractor.getUserFeatures(userBytesFeature);
+//        bytesFeatureExtractor.getItemFeature(adItemBytesFeature);
+//        bytesFeatureExtractor.getContextFeatures(adRequestContextBytesFeature);
+//        bytesFeatureExtractor.getCrossFeature(adItemBytesFeature, adRequestContextBytesFeature, userBytesFeature);
+//
+//        ListMultimap<FeatureGroup, BaseFeature> featureMap = bytesFeatureExtractor.getFeatures();
+//        return parseSamplesToString(label, featureMap);
+//    }
+//
+//    // Build the sample string
+//    public static String parseSamplesToString(String label, ListMultimap<FeatureGroup, BaseFeature> featureMap) {
+//        ArrayList<String> featureList = new ArrayList<String>();
+//        for (Map.Entry<FeatureGroup, BaseFeature> entry : featureMap.entries()) {
+//            FeatureGroup groupedFeature = entry.getKey();
+//            BaseFeature baseFeature = entry.getValue();
+//            Long featureIdentifier = baseFeature.getIdentifier();
+//            featureList.add(String.valueOf(featureIdentifier) + ":1");
+//        }
+//        return label + "\t" + String.join("\t", featureList);
+//    }
+//
+//}

+ 125 - 124
src/main/java/examples/sparksql/SparkAdFeaToRedisLoader.java

@@ -1,124 +1,125 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-
-
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.*;
-import examples.dataloader.AdRedisFeatureConstructor;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
-import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
-import org.springframework.data.redis.core.RedisTemplate;
-import org.springframework.data.redis.serializer.StringRedisSerializer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-
-public class SparkAdFeaToRedisLoader {
-
-    private static final String userKeyFormat = "user:ad:%s";
-
-    private static final String adKeyFormat = "ad:%s";
-
-
-    public static RedisTemplate<String, String> buildRedisTemplate() {
-        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
-        rsc.setPort(6379);
-        rsc.setPassword("Wqsd@2019");
-        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
-        RedisTemplate<String, String> template = new RedisTemplate<>();
-        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
-        fac.afterPropertiesSet();
-        template.setDefaultSerializer(new StringRedisSerializer());
-        template.setConnectionFactory(fac);
-        template.afterPropertiesSet();
-        return template;
-    }
-
-
-    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
-        Map<String, String> redisFormat = new HashMap<String, String>();
-        String key = line.get(0);
-        String value = line.get(1);
-        redisFormat.put(key, value);
-        redisTemplate.opsForValue().multiSet(redisFormat);
-    }
-
-
-    static class RecordsToAdRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            AdItemFeature adItemFeature = AdRedisFeatureConstructor.constructItemFeature(record);
-            // The ad feature key is built from the creativeId
-            String key = String.format(adKeyFormat, adItemFeature.getCreativeId());
-            String value = adItemFeature.getValue();
-            List<String> kv = new ArrayList<String>();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            UserAdFeature userFeature = AdRedisFeatureConstructor.constructUserFeature(record);
-            List<String> kv = new ArrayList<String>();
-            String key = String.format(userKeyFormat, userFeature.getKey());
-            String value = userFeature.getValue();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String tableAdInfo = "alg_ad_item_info";
-        String tableUserInfo = "alg_ad_user_info";
-
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-
-        // load Ad features
-        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableAdInfo, partition, new RecordsToAdRedisKV(), Integer.valueOf(10));
-        readAdData.foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-
-
-        // load user features
-        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
-        readUserData.repartition(50).foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-    }
-
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdItemFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.UserAdFeature;
+//import examples.dataloader.AdRedisFeatureConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+//import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+//import org.springframework.data.redis.core.RedisTemplate;
+//import org.springframework.data.redis.serializer.StringRedisSerializer;
+//
+//import java.io.IOException;
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//
+//public class SparkAdFeaToRedisLoader {
+//
+//    private static final String userKeyFormat = "user:ad:%s";
+//
+//    private static final String adKeyFormat = "ad:%s";
+//
+//
+//    public static RedisTemplate<String, String> buildRedisTemplate() {
+//        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
+//        rsc.setPort(6379);
+//        rsc.setPassword("Wqsd@2019");
+//        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
+//        RedisTemplate<String, String> template = new RedisTemplate<>();
+//        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
+//        fac.afterPropertiesSet();
+//        template.setDefaultSerializer(new StringRedisSerializer());
+//        template.setConnectionFactory(fac);
+//        template.afterPropertiesSet();
+//        return template;
+//    }
+//
+//
+//    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
+//        Map<String, String> redisFormat = new HashMap<String, String>();
+//        String key = line.get(0);
+//        String value = line.get(1);
+//        redisFormat.put(key, value);
+//        redisTemplate.opsForValue().multiSet(redisFormat);
+//    }
+//
+//
+//    static class RecordsToAdRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            AdItemFeature adItemFeature = AdRedisFeatureConstructor.constructItemFeature(record);
+//            // The ad feature key is built from the creativeId
+//            String key = String.format(adKeyFormat, adItemFeature.getCreativeId());
+//            String value = adItemFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            UserAdFeature userFeature = AdRedisFeatureConstructor.constructUserFeature(record);
+//            List<String> kv = new ArrayList<String>();
+//            String key = String.format(userKeyFormat, userFeature.getKey());
+//
+//            String value = userFeature.getValue();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String tableAdInfo = "alg_ad_item_info";
+//        String tableUserInfo = "alg_ad_user_info";
+//
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//
+//        // load Ad features
+//        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableAdInfo, partition, new RecordsToAdRedisKV(), Integer.valueOf(10));
+//        readAdData.foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//
+//
+//        // load user features
+//        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
+//        readUserData.repartition(50).foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//    }
+//
+//
+//}

+ 98 - 99
src/main/java/examples/sparksql/SparkShareRatioSampleLoader.java

@@ -1,99 +1,98 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-import com.google.common.collect.ListMultimap;
-import com.tzld.piaoquan.recommend.feature.domain.video.base.*;
-
-import examples.dataloader.RecommendSampleConstructor;
-import com.tzld.piaoquan.recommend.feature.domain.video.feature.VlogShareLRFeatureExtractor;
-import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
-import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-
-import java.util.ArrayList;
-import java.util.Map;
-
-
-public class SparkShareRatioSampleLoader {
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String table = "alg_recsys_view_sample";
-        String hdfsPath = "/dw/recommend/model/share_ratio_samples/" + partition;
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(50));
-        readData.saveAsTextFile(hdfsPath);
-    }
-
-
-    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
-        @Override
-        public String call(Record record, TableSchema schema) throws Exception {
-            String labelName = "share_ornot";
-            String ret = singleParse(record, labelName);
-            return ret;
-        }
-    }
-
-
-    // Processing logic for a single log record
-    public static String singleParse(Record record, String labelName) {
-        // Parse the data
-        String label = record.getString(labelName);
-        if (label == null || label.equals("1")) {
-            label = "0";
-        } else {
-            label = "1";
-        }
-
-        // Initialize the feature objects from the SQL record
-        RequestContext requestContext = RecommendSampleConstructor.constructRequestContext(record);
-        UserFeature userFeature = RecommendSampleConstructor.constructUserFeature(record);
-        ItemFeature itemFeature = RecommendSampleConstructor.constructItemFeature(record);
-
-        // Convert to bytes features
-        RequestContextBytesFeature requestContextBytesFeature = new RequestContextBytesFeature(requestContext);
-        UserBytesFeature userBytesFeature = new UserBytesFeature(userFeature);
-        VideoBytesFeature videoBytesFeature = new VideoBytesFeature(itemFeature);
-
-        // Feature extraction
-        VlogShareLRFeatureExtractor bytesFeatureExtractor;
-        bytesFeatureExtractor = new VlogShareLRFeatureExtractor();
-
-        bytesFeatureExtractor.getUserFeatures(userBytesFeature);
-        bytesFeatureExtractor.getItemFeature(videoBytesFeature);
-        bytesFeatureExtractor.getContextFeatures(requestContextBytesFeature);
-
-        ListMultimap<FeatureGroup, BaseFeature> featureMap = bytesFeatureExtractor.getFeatures();
-        return parseSamplesToString(label, featureMap);
-    }
-
-    // Build the sample string
-    public static String parseSamplesToString(String label, ListMultimap<FeatureGroup, BaseFeature> featureMap) {
-        ArrayList<String> featureList = new ArrayList<String>();
-        for (Map.Entry<FeatureGroup, BaseFeature> entry : featureMap.entries()) {
-            FeatureGroup groupedFeature = entry.getKey();
-            BaseFeature baseFeature = entry.getValue();
-            Long featureIdentifier = baseFeature.getIdentifier();
-            featureList.add(String.valueOf(featureIdentifier) + ":1");
-        }
-        return label + "\t" + String.join("\t", featureList);
-    }
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.google.common.collect.ListMultimap;
+//import com.tzld.piaoquan.data.base.*;
+//import examples.dataloader.RecommendSampleConstructor;
+//import com.tzld.piaoquan.data.score.feature.VlogShareLRFeatureExtractor;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.BaseFeature;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.FeatureGroup;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//
+//import java.util.ArrayList;
+//import java.util.Map;
+//
+//
+//public class SparkShareRatioSampleLoader {
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String table = "alg_recsys_view_sample";
+//        String hdfsPath = "/dw/recommend/model/share_ratio_samples/" + partition;
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(50));
+//        readData.saveAsTextFile(hdfsPath);
+//    }
+//
+//
+//    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
+//        @Override
+//        public String call(Record record, TableSchema schema) throws Exception {
+//            String labelName = "share_ornot";
+//            String ret = singleParse(record, labelName);
+//            return ret;
+//        }
+//    }
+//
+//
+//    // Processing logic for a single log record
+//    public static String singleParse(Record record, String labelName) {
+//        // Parse the data
+//        String label = record.getString(labelName);
+//        if (label == null || label.equals("1")) {
+//            label = "0";
+//        } else {
+//            label = "1";
+//        }
+//
+//        // Initialize the feature objects from the SQL record
+//        RequestContext requestContext = RecommendSampleConstructor.constructRequestContext(record);
+//        UserFeature userFeature = RecommendSampleConstructor.constructUserFeature(record);
+//        ItemFeature itemFeature = RecommendSampleConstructor.constructItemFeature(record);
+//
+//        // Convert to bytes features
+//        RequestContextBytesFeature requestContextBytesFeature = new RequestContextBytesFeature(requestContext);
+//        UserBytesFeature userBytesFeature = new UserBytesFeature(userFeature);
+//        VideoBytesFeature videoBytesFeature = new VideoBytesFeature(itemFeature);
+//
+//        // Feature extraction
+//        VlogShareLRFeatureExtractor bytesFeatureExtractor;
+//        bytesFeatureExtractor = new VlogShareLRFeatureExtractor();
+//
+//        bytesFeatureExtractor.getUserFeatures(userBytesFeature);
+//        bytesFeatureExtractor.getItemFeature(videoBytesFeature);
+//        bytesFeatureExtractor.getContextFeatures(requestContextBytesFeature);
+//
+//        ListMultimap<FeatureGroup, BaseFeature> featureMap = bytesFeatureExtractor.getFeatures();
+//        return parseSamplesToString(label, featureMap);
+//    }
+//
+//    // Build the sample string
+//    public static String parseSamplesToString(String label, ListMultimap<FeatureGroup, BaseFeature> featureMap) {
+//        ArrayList<String> featureList = new ArrayList<String>();
+//        for (Map.Entry<FeatureGroup, BaseFeature> entry : featureMap.entries()) {
+//            FeatureGroup groupedFeature = entry.getKey();
+//            BaseFeature baseFeature = entry.getValue();
+//            Long featureIdentifier = baseFeature.getIdentifier();
+//            featureList.add(String.valueOf(featureIdentifier) + ":1");
+//        }
+//        return label + "\t" + String.join("\t", featureList);
+//    }
+//
+//}

+ 123 - 124
src/main/java/examples/sparksql/SparkVideoFeaToRedisLoader.java

@@ -1,124 +1,123 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-
-import com.tzld.piaoquan.recommend.feature.domain.video.base.ItemFeature;
-import com.tzld.piaoquan.recommend.feature.domain.video.base.UserFeature;
-import examples.dataloader.AdRedisFeatureConstructor;
-import examples.dataloader.RecommRedisFeatureConstructor;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
-import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
-import org.springframework.data.redis.core.RedisTemplate;
-import org.springframework.data.redis.serializer.StringRedisSerializer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-
-public class SparkVideoFeaToRedisLoader {
-
-    private static final String userKeyFormat = "user:video:%s";
-
-    private static final String adKeyFormat = "video:%s";
-
-
-    public static RedisTemplate<String, String> buildRedisTemplate() {
-        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
-        rsc.setPort(6379);
-        rsc.setPassword("Wqsd@2019");
-        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
-        RedisTemplate<String, String> template = new RedisTemplate<>();
-        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
-        fac.afterPropertiesSet();
-        template.setDefaultSerializer(new StringRedisSerializer());
-        template.setConnectionFactory(fac);
-        template.afterPropertiesSet();
-        return template;
-    }
-
-
-    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
-        Map<String, String> redisFormat = new HashMap<String, String>();
-        String key = line.get(0);
-        String value = line.get(1);
-        redisFormat.put(key, value);
-        redisTemplate.opsForValue().multiSet(redisFormat);
-    }
-
-
-    static class RecordsToVideoRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            ItemFeature itemFeature = RecommRedisFeatureConstructor.constructItemFeature(record);
-            String key = String.format(adKeyFormat, itemFeature.getKey());
-            String value = itemFeature.getValue();
-            List<String> kv = new ArrayList<String>();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            UserFeature userFeature = RecommRedisFeatureConstructor.constructUserFeature(record);
-            String key = String.format(userKeyFormat, userFeature.getKey());
-            String value = userFeature.getValue();
-            List<String> kv = new ArrayList<String>();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String tableItemInfo = "alg_recsys_video_info";
-        String tableUserInfo = "alg_recsys_user_info";
-
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-
-        // load Ad features
-        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableItemInfo, partition, new RecordsToVideoRedisKV(), Integer.valueOf(10));
-        readAdData.sample(false, 0.0001).foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-
-
-        // load user features
-        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
-        readUserData.repartition(50).sample(false, 0.00001).foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-    }
-
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//
+//import com.tzld.piaoquan.recommend.feature.domain.video.base.ItemFeature;
+//import com.tzld.piaoquan.recommend.feature.domain.video.base.UserFeature;
+//import examples.dataloader.AdRedisFeatureConstructor;
+//import examples.dataloader.RecommRedisFeatureConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+//import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+//import org.springframework.data.redis.core.RedisTemplate;
+//import org.springframework.data.redis.serializer.StringRedisSerializer;
+//
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//
+//public class SparkVideoFeaToRedisLoader {
+//
+//    private static final String userKeyFormat = "user:video:%s";
+//
+//    private static final String adKeyFormat = "video:%s";
+//
+//
+//    public static RedisTemplate<String, String> buildRedisTemplate() {
+//        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
+//        rsc.setPort(6379);
+//        rsc.setPassword("Wqsd@2019");
+//        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
+//        RedisTemplate<String, String> template = new RedisTemplate<>();
+//        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
+//        fac.afterPropertiesSet();
+//        template.setDefaultSerializer(new StringRedisSerializer());
+//        template.setConnectionFactory(fac);
+//        template.afterPropertiesSet();
+//        return template;
+//    }
+//
+//
+//    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
+//        Map<String, String> redisFormat = new HashMap<String, String>();
+//        String key = line.get(0);
+//        String value = line.get(1);
+//        redisFormat.put(key, value);
+//        redisTemplate.opsForValue().multiSet(redisFormat);
+//    }
+//
+//
+//    static class RecordsToVideoRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            ItemFeature itemFeature = RecommRedisFeatureConstructor.constructItemFeature(record);
+//            String key = String.format(adKeyFormat, itemFeature.getKey());
+//            String value = itemFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            UserFeature userFeature = RecommRedisFeatureConstructor.constructUserFeature(record);
+//            String key = String.format(userKeyFormat, userFeature.getKey());
+//            String value = userFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String tableItemInfo = "alg_recsys_video_info";
+//        String tableUserInfo = "alg_recsys_user_info";
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//
+//        // load Ad features
+//        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableItemInfo, partition, new RecordsToVideoRedisKV(), Integer.valueOf(10));
+//        readAdData.sample(false, 0.0001).foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//
+//
+//        // load user features
+//        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
+//        readUserData.repartition(50).sample(false, 0.00001).foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//    }
+//
+//
+//}

+ 274 - 0
src/main/resources/20240608_feature_name.txt

@@ -0,0 +1,274 @@
+b123_1h_STR
+b123_1h_log(share)
+b123_1h_ROV
+b123_1h_log(return)
+b123_1h_ROV*log(return)
+b123_2h_STR
+b123_2h_log(share)
+b123_2h_ROV
+b123_2h_log(return)
+b123_2h_ROV*log(return)
+b123_3h_STR
+b123_3h_log(share)
+b123_3h_ROV
+b123_3h_log(return)
+b123_3h_ROV*log(return)
+b123_4h_STR
+b123_4h_log(share)
+b123_4h_ROV
+b123_4h_log(return)
+b123_4h_ROV*log(return)
+b123_12h_STR
+b123_12h_log(share)
+b123_12h_ROV
+b123_12h_log(return)
+b123_12h_ROV*log(return)
+b123_1d_STR
+b123_1d_log(share)
+b123_1d_ROV
+b123_1d_log(return)
+b123_1d_ROV*log(return)
+b123_3d_STR
+b123_3d_log(share)
+b123_3d_ROV
+b123_3d_log(return)
+b123_3d_ROV*log(return)
+b123_7d_STR
+b123_7d_log(share)
+b123_7d_ROV
+b123_7d_log(return)
+b123_7d_ROV*log(return)
+b167_1h_STR
+b167_1h_log(share)
+b167_1h_ROV
+b167_1h_log(return)
+b167_1h_ROV*log(return)
+b167_2h_STR
+b167_2h_log(share)
+b167_2h_ROV
+b167_2h_log(return)
+b167_2h_ROV*log(return)
+b167_3h_STR
+b167_3h_log(share)
+b167_3h_ROV
+b167_3h_log(return)
+b167_3h_ROV*log(return)
+b167_4h_STR
+b167_4h_log(share)
+b167_4h_ROV
+b167_4h_log(return)
+b167_4h_ROV*log(return)
+b167_12h_STR
+b167_12h_log(share)
+b167_12h_ROV
+b167_12h_log(return)
+b167_12h_ROV*log(return)
+b167_1d_STR
+b167_1d_log(share)
+b167_1d_ROV
+b167_1d_log(return)
+b167_1d_ROV*log(return)
+b167_3d_STR
+b167_3d_log(share)
+b167_3d_ROV
+b167_3d_log(return)
+b167_3d_ROV*log(return)
+b167_7d_STR
+b167_7d_log(share)
+b167_7d_ROV
+b167_7d_log(return)
+b167_7d_ROV*log(return)
+b8910_1h_STR
+b8910_1h_log(share)
+b8910_1h_ROV
+b8910_1h_log(return)
+b8910_1h_ROV*log(return)
+b8910_2h_STR
+b8910_2h_log(share)
+b8910_2h_ROV
+b8910_2h_log(return)
+b8910_2h_ROV*log(return)
+b8910_3h_STR
+b8910_3h_log(share)
+b8910_3h_ROV
+b8910_3h_log(return)
+b8910_3h_ROV*log(return)
+b8910_4h_STR
+b8910_4h_log(share)
+b8910_4h_ROV
+b8910_4h_log(return)
+b8910_4h_ROV*log(return)
+b8910_12h_STR
+b8910_12h_log(share)
+b8910_12h_ROV
+b8910_12h_log(return)
+b8910_12h_ROV*log(return)
+b8910_1d_STR
+b8910_1d_log(share)
+b8910_1d_ROV
+b8910_1d_log(return)
+b8910_1d_ROV*log(return)
+b8910_3d_STR
+b8910_3d_log(share)
+b8910_3d_ROV
+b8910_3d_log(return)
+b8910_3d_ROV*log(return)
+b8910_7d_STR
+b8910_7d_log(share)
+b8910_7d_ROV
+b8910_7d_log(return)
+b8910_7d_ROV*log(return)
+b111213_1h_STR
+b111213_1h_log(share)
+b111213_1h_ROV
+b111213_1h_log(return)
+b111213_1h_ROV*log(return)
+b111213_2h_STR
+b111213_2h_log(share)
+b111213_2h_ROV
+b111213_2h_log(return)
+b111213_2h_ROV*log(return)
+b111213_3h_STR
+b111213_3h_log(share)
+b111213_3h_ROV
+b111213_3h_log(return)
+b111213_3h_ROV*log(return)
+b111213_4h_STR
+b111213_4h_log(share)
+b111213_4h_ROV
+b111213_4h_log(return)
+b111213_4h_ROV*log(return)
+b111213_12h_STR
+b111213_12h_log(share)
+b111213_12h_ROV
+b111213_12h_log(return)
+b111213_12h_ROV*log(return)
+b111213_1d_STR
+b111213_1d_log(share)
+b111213_1d_ROV
+b111213_1d_log(return)
+b111213_1d_ROV*log(return)
+b111213_3d_STR
+b111213_3d_log(share)
+b111213_3d_ROV
+b111213_3d_log(return)
+b111213_3d_ROV*log(return)
+b111213_7d_STR
+b111213_7d_log(share)
+b111213_7d_ROV
+b111213_7d_log(return)
+b111213_7d_ROV*log(return)
+b171819_1h_STR
+b171819_1h_log(share)
+b171819_1h_ROV
+b171819_1h_log(return)
+b171819_1h_ROV*log(return)
+b171819_2h_STR
+b171819_2h_log(share)
+b171819_2h_ROV
+b171819_2h_log(return)
+b171819_2h_ROV*log(return)
+b171819_3h_STR
+b171819_3h_log(share)
+b171819_3h_ROV
+b171819_3h_log(return)
+b171819_3h_ROV*log(return)
+b171819_4h_STR
+b171819_4h_log(share)
+b171819_4h_ROV
+b171819_4h_log(return)
+b171819_4h_ROV*log(return)
+b171819_12h_STR
+b171819_12h_log(share)
+b171819_12h_ROV
+b171819_12h_log(return)
+b171819_12h_ROV*log(return)
+b171819_1d_STR
+b171819_1d_log(share)
+b171819_1d_ROV
+b171819_1d_log(return)
+b171819_1d_ROV*log(return)
+b171819_3d_STR
+b171819_3d_log(share)
+b171819_3d_ROV
+b171819_3d_log(return)
+b171819_3d_ROV*log(return)
+b171819_7d_STR
+b171819_7d_log(share)
+b171819_7d_ROV
+b171819_7d_log(return)
+b171819_7d_ROV*log(return)
+total_time
+bit_rate
+playcnt_6h
+playcnt_1d
+playcnt_3d
+playcnt_7d
+share_pv_12h
+share_pv_1d
+share_pv_3d
+share_pv_7d
+return_uv_12h
+return_uv_1d
+return_uv_3d
+return_uv_7d
+c3_feature_tags_1d_matchnum
+c3_feature_tags_1d_maxscore
+c3_feature_tags_1d_avgscore
+c3_feature_tags_3d_matchnum
+c3_feature_tags_3d_maxscore
+c3_feature_tags_3d_avgscore
+c3_feature_tags_7d_matchnum
+c3_feature_tags_7d_maxscore
+c3_feature_tags_7d_avgscore
+c4_feature_tags_1d_matchnum
+c4_feature_tags_1d_maxscore
+c4_feature_tags_1d_avgscore
+c4_feature_tags_3d_matchnum
+c4_feature_tags_3d_maxscore
+c4_feature_tags_3d_avgscore
+c4_feature_tags_7d_matchnum
+c4_feature_tags_7d_maxscore
+c4_feature_tags_7d_avgscore
+c5_feature_tags_1d_matchnum
+c5_feature_tags_1d_maxscore
+c5_feature_tags_1d_avgscore
+c5_feature_tags_3d_matchnum
+c5_feature_tags_3d_maxscore
+c5_feature_tags_3d_avgscore
+c5_feature_tags_7d_matchnum
+c5_feature_tags_7d_maxscore
+c5_feature_tags_7d_avgscore
+c6_feature_tags_1d_matchnum
+c6_feature_tags_1d_maxscore
+c6_feature_tags_1d_avgscore
+c6_feature_tags_3d_matchnum
+c6_feature_tags_3d_maxscore
+c6_feature_tags_3d_avgscore
+c6_feature_tags_7d_matchnum
+c6_feature_tags_7d_maxscore
+c6_feature_tags_7d_avgscore
+c7_feature_tags_1d_matchnum
+c7_feature_tags_1d_maxscore
+c7_feature_tags_1d_avgscore
+c7_feature_tags_3d_matchnum
+c7_feature_tags_3d_maxscore
+c7_feature_tags_3d_avgscore
+c7_feature_tags_7d_matchnum
+c7_feature_tags_7d_maxscore
+c7_feature_tags_7d_avgscore
+c8_feature_share_score
+c8_feature_share_num
+c8_feature_share_rank
+c8_feature_return_score
+c8_feature_return_num
+c8_feature_return_rank
+c9_feature_share_score
+c9_feature_share_num
+c9_feature_share_rank
+c9_feature_return_score
+c9_feature_return_num
+c9_feature_return_rank
+d1_exp
+d1_return_n
+d1_rovn
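
The 274 names above define a fixed feature order (20240609_bucket_274.txt below appears to share the same count). A minimal sketch, assuming the file ships as a classpath resource with one feature name per non-empty line, of loading it into a name-to-index map (the helper itself is an assumption, not code from this commit):

    import scala.io.Source

    object FeatureNameLoader {
      // Each non-empty line of the resource is one feature name; its index is its position in the file.
      def load(resource: String = "/20240608_feature_name.txt"): Map[String, Int] = {
        val src = Source.fromInputStream(getClass.getResourceAsStream(resource), "UTF-8")
        try src.getLines().map(_.trim).filter(_.nonEmpty).zipWithIndex.toMap
        finally src.close()
      }
    }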

File diff suppressed because it is too large
+ 0 - 0
src/main/resources/20240609_bucket_274.txt


File diff suppressed because it is too large
+ 2 - 0
src/main/resources/20240609_bucket_274_old.txt


File diff suppressed because it is too large
+ 6 - 0
src/main/resources/20240622_ad_bucket_249.txt


+ 249 - 0
src/main/resources/20240622_ad_feature_name.txt

@@ -0,0 +1,249 @@
+cpa
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_ecpm
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_ecpm
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_ecpm
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_ecpm
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_ecpm
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_ecpm
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_ecpm
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_ecpm
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_ecpm
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_ecpm
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_ecpm
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_ecpm
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_ecpm
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_ecpm
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_ecpm
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_ecpm
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_ecpm
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_ecpm
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_ecpm
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_ecpm
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_ecpm
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_ecpm
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_ecpm
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_ecpm
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_ecpm
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_ecpm
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_ecpm
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_ecpm
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_ecpm
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_ecpm
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_ecpm
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_ecpm
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_ecpm
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_ecpm
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+ecpm_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_3h_ecpm
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_6h_ecpm
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_12h_ecpm
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_1d_ecpm
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_3d_ecpm
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+d1_feature_7d_ecpm
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+vid_rank_ecpm_1d
+vid_rank_ecpm_3d
+vid_rank_ecpm_7d
+vid_rank_ecpm_14d

+ 125 - 0
src/main/scala/com/aliyun/odps/spark/examples/ana/ana_01_cidvidpk.scala

@@ -0,0 +1,125 @@
+package com.aliyun.odps.spark.examples.ana
+
+
+import com.alibaba.fastjson.JSONObject
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.{HashMap, Map}
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+/*
+   Any feature that cannot be obtained falls back to the default value 0.
+ */
+
+object ana_01_cidvidpk {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "ad_engine_statistics_log_per5min")
+    val beginStr = param.getOrElse("beginStr", "2024060208")
+    val endStr = param.getOrElse("endStr", "2024060223")
+    val vidSelect = param.getOrElse("vidSelect", "")
+    val cidsSelect = param.getOrElse("cidsSelect", "").split(",").toSet
+    val apptypeSelect = param.getOrElse("apptype", "")
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    val partitions = new ArrayBuffer[String]()
+    for (dt_hh <- timeRange) {
+      for (mi <- List (
+        "0000", "0500", "1000", "1500", "2000", "2500",
+        "3000", "3500", "4000", "4500", "5000", "5500"
+      )){
+        val partition = dt_hh + mi
+        println("partition:" + partition)
+        partitions.add(partition)
+      }
+    }
+    val rdds = partitions.map(p => {
+      odpsOps.readTable(project = project,
+        table = table,
+        partition = partitionPrefix + p,
+        transfer = func,
+        numPartition = tablePart)
+    }).reduce((r1, r2) => r1.union(r2))
+
+    val data = rdds.map(record=>{
+      val vid = if (record.isNull("videoid")) "" else record.getString("videoid")
+      val recalls = if (record.isNull("creativelist")) "" else record.getString("creativelist")
+      val ranks = if (record.isNull("scoreresult")) "" else record.getString("scoreresult")
+      val apptype = if (record.isNull("apptype")) "" else record.getString("apptype")
+      val abcode = if (record.isNull("adabgroup")) "" else record.getString("adabgroup")
+      (apptype, abcode, vid, recalls, ranks)
+    }).filter(r => r._1.equals(apptypeSelect) && !r._3.equals("") && !r._4.equals("") && !r._5.equals(""))
+      .filter(r=> r._3.equals(vidSelect)) // keep only the selected vid
+      .map{
+        case (apptype, abcode, vid, recalls, ranks) =>
+          val recalls_json = JSON.parseArray(recalls).map(r=>{
+            val j = JSON.parseObject(r.toString)
+            j.getOrElse("creativeId", 0).toString
+          }).filter(!_.equals("0")).toSet
+          val ranks_json = JSON.parseArray(ranks).map(r => {
+            val j = JSON.parseObject(r.toString)
+            val adId = j.getOrElse("adId", 0).toString
+            val score = j.getOrElse("score", 0.0)
+            (adId, score.toString.toDouble)
+          })
+          var rankId = ranks_json.get(0)._1
+          var score = ranks_json.get(0)._2
+//          for (i <- 1 until ranks_json.size){
+//            val item = ranks_json.get(i)
+//            if (item._2 > score){
+//              rankId = item._1
+//              score = item._2
+//            }
+//          }
+          (apptype, abcode, vid, recalls_json, rankId)
+      }.flatMap({
+        case (apptype, abcode, vid, recalls_json, rankId) =>
+          recalls_json.map(recallId=> {
+            (apptype, abcode, vid, recallId, rankId, recalls_json)
+          })
+      }).filter(r=> cidsSelect.contains(r._4)) // keep only the selected cids
+      .map({
+        case (apptype, abcode, vid, recallId, rankId, recalls_json) =>
+          val x1 = 1
+          val x2 = if (recallId.equals(rankId)) 1 else 0
+          val x3 = if (cidsSelect.subsetOf(recalls_json)) 1 else 0
+          val x4 = if (cidsSelect.subsetOf(recalls_json) && cidsSelect.contains(rankId)) 1 else 0
+          val x5 = if (cidsSelect.subsetOf(recalls_json) && recallId.equals(rankId)) 1 else 0
+          ((apptype, abcode, vid, recallId), (x1, x2, x3, x4, x5))
+      }).aggregateByKey(
+        (0, 0, 0, 0, 0)
+      )(
+        seqOp = (runningSum, x) => (runningSum._1 + x._1, runningSum._2 + x._2, runningSum._3 + x._3, runningSum._4 + x._4, runningSum._5 + x._5),
+        combOp = (sum1, sum2) => (sum1._1 + sum2._1, sum1._2 + sum2._2, sum1._3 + sum2._3, sum1._4 + sum2._4, sum1._5 + sum2._5)
+      )
+
+    data.collect().foreach(r => println("结果\t" + r._1.productIterator.mkString("\t") + "\t" + r._2.productIterator.mkString("\t")))
+
+  }
+
+
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
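
For reference, the five counters aggregated at the end of ana_01_cidvidpk.scala read as: x1 counts every time a selected cid appears in the recall list, x2 adds 1 when that cid also won the rank, x3 when all selected cids were recalled together, x4 when they were recalled together and one of them won, and x5 when they were recalled together and this cid won. A self-contained sketch with made-up IDs (inferred from the code above, not part of the commit):

    object PkCounterExample {
      def main(args: Array[String]): Unit = {
        val cidsSelect = Set("101", "102")        // hypothetical cids being compared
        val recalls    = Set("101", "102", "103") // creatives recalled for one request
        val rankId     = "102"                    // creative that won the rank
        for (recallId <- recalls if cidsSelect.contains(recallId)) {
          val x1 = 1
          val x2 = if (recallId == rankId) 1 else 0
          val x3 = if (cidsSelect.subsetOf(recalls)) 1 else 0
          val x4 = if (cidsSelect.subsetOf(recalls) && cidsSelect.contains(rankId)) 1 else 0
          val x5 = if (cidsSelect.subsetOf(recalls) && recallId == rankId) 1 else 0
          println(s"$recallId -> ($x1,$x2,$x3,$x4,$x5)") // 101 -> (1,0,1,1,0), 102 -> (1,1,1,1,1)
        }
      }
    }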

+ 79 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_01_readtable2hdfs.scala

@@ -0,0 +1,79 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import org.apache.spark.sql.SparkSession
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.dataloader.RequestContextOffline
+import examples.dataloader.OfflineVlogShareLRFeatureExtractor
+import org.apache.hadoop.io.compress.GzipCodec
+
+import scala.collection.JavaConversions._
+
+object makedata_01_readtable2hdfs {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "")
+    // /dw/recommend/model/share_ratio_samples/
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val table = "alg_recsys_view_sample"
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        println("数据写入完成:" + hdfsPath)
+        println("数据量:" + odpsData.count())
+      }else{
+        println("路径不合法, 无法写入:" + hdfsPath)
+      }
+
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): String = {
+    singleParse(record)
+  }
+
+  def singleParse(record: Record): String = {
+    //1 Build the label
+    val label: String = record.getString("share_ornot")
+    val newLabel = if ("1".equals(label)) "0" else "1"
+    //2 Build the features
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    reqContext.putItemFeature(record)
+    reqContext.putSceneFeature(record)
+    val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractor()
+    bytesFeatureExtractor.makeFeature(reqContext.featureMap)
+    val featureMap = bytesFeatureExtractor.featureMap
+    newLabel + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+  }
+
+
+}
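
Each line that makedata_01_readtable2hdfs writes to HDFS is the label followed by tab-separated "identifier:1" terms. A minimal reader for spot-checking that output (the helper is hypothetical, not part of the commit):

    object SampleLineParser {
      // "1\tfeat_a:1\tfeat_b:1"  ->  ("1", Set("feat_a", "feat_b"))
      def parse(line: String): (String, Set[String]) = {
        val parts = line.split("\t")
        (parts.head, parts.tail.map(_.split(":")(0)).toSet)
      }
    }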

+ 249 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_02_writeredis.scala

@@ -0,0 +1,249 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import com.google.gson.GsonBuilder
+import examples.dataloader.RequestContextOffline
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.text.SimpleDateFormat
+import java.util.concurrent.TimeUnit
+import java.util
+import scala.collection.JavaConversions._
+
+
+object makedata_02_writeredis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "2").toInt
+    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
+    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val savePathVideo = param.getOrDefault("savePathVideo", "")
+    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
+    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
+//    val userSampleIDsPathFix = param.getOrDefault("userSampleIDsPathFix", "")
+    //  /dw/recommend/model/feature/
+
+
+    // 2 Read from ODPS
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableUser = "alg_recsys_user_info"
+    val tableItem = "alg_recsys_video_info"
+    val userRedisKeyPrefix = "user_info_4video_"
+    val videoRedisKeyPrefix = "video_info_"
+
+
+
+    // 3 User-side feature processing
+    if (ifUser){
+      println("user特征处理")
+      var userData = odpsOps.readTable(project = project, table = tableUser, partition = partition, transfer = handleUser, numPartition = tablePart)
+        .filter {
+          case (mid, fea, feaSize) =>
+            mid.nonEmpty && fea.nonEmpty && feaSize > 0
+        }
+      if (userSampleIDs.nonEmpty){
+        val IDs = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
+        userData = userData.filter(r => IDs.contains(r._1.hashCode % 10))
+      }
+      if (ifDebug){
+        println("user特征处理-debug开启-只保留5条数据-特征数量大于1")
+        val userDataTake = userData.take(5)
+        userDataTake.foreach(r=> println(r._1 + "\t" + r._2 + "\t" + r._3))
+        userData = sc.parallelize(userDataTake)
+      }
+      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+        var savePathPart = savePathUser + "/" + partition
+        if (userSampleIDs.nonEmpty) {
+          savePathPart = savePathPart + "_" + userSampleIDs
+        }
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("user.action.count=" + userData.count())
+    } else {
+      println("不处理user")
+    }
+
+    if (ifDeleteRedisUser){
+      println("user redis 删除")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      println("读取数据路径:" + savePathPart)
+      val userDataRead = sc.textFile(savePathPart)
+      val userDataRead2 = userDataRead.filter(_.split("\t").length >= 2).map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+      println("预计删除数据量:" + userDataRead2.count())
+      val userDataTakeRddRun = userDataRead2.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.delete(redisFormat.map(_._1))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.delete(redisFormat.map(_._1))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("delete redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis删除")
+    }
+
+    if (ifWriteRedisUser){
+      println("user redis 写入")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+        .sample(false, sampleRate)
+        .map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+      val userDataTakeRddRun = userDataRead.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.opsForValue.multiSet(redisFormat)
+              redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.opsForValue.multiSet(redisFormat)
+        redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("put in redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis写入")
+    }
+
+
+
+
+    // 4 Video-side feature processing
+    if (ifVideo){
+      println("video特征处理")
+      val handleItemFunction: (Record, TableSchema) => Tuple3[String, String, Int] = handleItem(_, _, date)
+      var itemData = odpsOps.readTable(project = project, table = tableItem, partition = partition, transfer = handleItemFunction, numPartition = tablePart)
+      if (ifDebug) {
+        println("video特征处理-debug开启-只保留5条数据-特征数量大于1")
+        val itemDataTake = itemData.filter(_._3 > 1).take(5)
+        itemDataTake.foreach(r => println(r._1 + "\t" + r._2 + "\t" + r._3))
+        itemData = sc.parallelize(itemDataTake)
+      }
+      val itemDataTakeRddRun = itemData.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        row.foreach {
+          case (key, value, _) =>
+            if (key.nonEmpty && value != null && value.nonEmpty) {
+              redisFormat.put(videoRedisKeyPrefix + key, value)
+              if (ifWriteRedis) {
+                redisTemplate.opsForValue.set(videoRedisKeyPrefix + key, value, 24 * expireDay, TimeUnit.HOURS)
+              }
+            }
+        }
+//        if (ifWriteRedis){
+//          redisTemplate.opsForValue.multiSet(redisFormat)
+//          redisFormat.keySet.foreach(key => redisTemplate.expire(key, 24 * expireDay, TimeUnit.HOURS))
+//        }
+        redisFormat.iterator
+      })
+      if (savePathVideo.nonEmpty && savePathVideo.startsWith("/dw/recommend/model/")){
+        val savePathPart = savePathVideo + "/" + partition
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        itemDataTakeRddRun.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("item.action.count=" + itemDataTakeRddRun.count())
+    }else{
+      println("不处理video")
+    }
+  }
+
+  def handleUser(record: Record, schema: TableSchema): Tuple3[String, String, Int] = {
+    val userKey = "mids"
+    val mid = record.getString(userKey)
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    // reqContext.featureMap.put("mid", mid)
+    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
+    val value = gson.toJson(reqContext.featureMap)
+    (mid, value, reqContext.featureMap.size())
+  }
+
+  def handleItem(record: Record, schema: TableSchema, date:String): Tuple3[String, String, Int] = {
+    val videoKey = "videoid"
+    val videoid = record.getBigint(videoKey).toString
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+
+    //--------- TODO: some features are missing from the table; temporary fix ---------
+//    val i_title_len =  if (record.getString("title") != null) record.getString("title").length.toString else ""
+//    val i_days_since_upload = if (record.getDatetime("gmt_create") != null){
+//      val format = new SimpleDateFormat("yyyyMMdd")
+//      val dateOld = format.format(record.getDatetime("gmt_create"))
+//      val dayDiff = MyDateUtils.calculateDateDifference(dateOld, date)
+//      dayDiff.toString
+//    }else{
+//      ""
+//    }
+//    if (i_title_len.nonEmpty){
+//      val d = reqContext.bucketRatioFeature(i_title_len.toDouble)
+//      reqContext.featureMap.put("i_title_len", d.toString)
+//    }
+//    if (i_days_since_upload.nonEmpty) {
+//      val d = reqContext.bucketRatioFeature(i_days_since_upload.toDouble)
+//      reqContext.featureMap.put("i_days_since_upload", d.toString)
+//    }
+    //------ temporary fix ends ---------
+
+    reqContext.putItemFeature(record)
+    reqContext.featureMap.put("videoid", videoid)
+
+    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
+    val value = gson.toJson(reqContext.featureMap)
+    (videoid, value, reqContext.featureMap.size())
+  }
+
+}
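
The user and video writers in makedata_02_writeredis share one pattern: buffer key/value pairs inside each partition and flush every 1,000 entries with multiSet, then set a per-key expiry. A generic sketch of that pattern against the same Spring Data Redis API (the helper name and signature are assumptions):

    import java.util
    import java.util.concurrent.TimeUnit
    import org.springframework.data.redis.core.RedisTemplate
    import scala.collection.JavaConversions._

    object RedisBatchWriter {
      // Flush in chunks so a partition never holds more than `batch` pending keys in memory.
      def writeAll(redis: RedisTemplate[String, String],
                   kvs: Iterator[(String, String)],
                   expireHours: Int,
                   batch: Int = 1000): Unit = {
        val buf = new util.HashMap[String, String]()
        def flush(): Unit = if (!buf.isEmpty) {
          redis.opsForValue.multiSet(buf)
          buf.keySet.foreach(k => redis.expire(k, expireHours, TimeUnit.HOURS))
          buf.clear()
        }
        kvs.foreach { case (k, v) => buf.put(k, v); if (buf.size() >= batch) flush() }
        flush()
      }
    }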

+ 74 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_03_deleteredis.scala

@@ -0,0 +1,74 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import examples.dataloader.RecommRedisFeatureConstructor
+import org.apache.spark.aliyun.odps.OdpsOps
+import org.apache.spark.sql.SparkSession
+import com.aliyun.odps.spark.examples.myUtils.{ParamUtils, env}
+
+import java.util
+import scala.collection.JavaConversions._
+
+
+object makedata_03_deleteredis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+
+    // Read from ODPS
+    val accessKeyId = "LTAIWYUujJAm7CbH"
+    val accessKeySecret = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
+    val odpsUrl = "http://service.odps.aliyun.com/api"
+    val tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com"
+
+    val project = "loghubods"
+    val tableItem = "alg_recsys_video_info"
+    val tableUser = "alg_recsys_user_info"
+    val partition = "dt=20231220"
+
+    val odpsOps = OdpsOps(sc, accessKeyId, accessKeySecret, odpsUrl, tunnelUrl)
+
+    //User-side feature processing
+    val userData = odpsOps.readTable(project = project, table = tableUser, partition = partition, transfer = handleUser, numPartition = 100)
+    val userDataTake = userData.take(10)
+    userDataTake.foreach(r=>{
+      println(r.get(0) + "\t" + r.get(1))
+    })
+
+    val userDataTakeRddRun = userData.mapPartitions(row=>{
+      val redisTemplate = env.getRedisTemplate()
+      val redisFormat = new util.HashMap[String, String]
+      row.foreach(r =>{
+        val key = r.get(0)
+        val value = r.get(1)
+        redisFormat.put(key, value)
+        if (redisTemplate.hasKey(key)){
+          redisTemplate.delete(key)
+        }
+      })
+      // redisTemplate.delete(redisFormat.keySet().toList)
+      redisFormat.iterator
+    })
+    println("delete.user.action.count="+userDataTakeRddRun.count())
+
+
+  }
+
+  def handleUser(record: Record, schema: TableSchema): util.ArrayList[String] = {
+    val feature = RecommRedisFeatureConstructor.constructUserFeature(record)
+    val key = String.format("user_info_%s", feature.getUid)
+    val value = feature.getValue
+    val kv = new util.ArrayList[String](2)
+    kv.add(key)
+    kv.add(value)
+    kv
+  }
+
+
+
+}

+ 85 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev1.scala

@@ -0,0 +1,85 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractor, RequestContextOffline}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+
+object makedata_04_rosHdfsFromTablev1 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/ros_sample/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .filter{
+          case record =>
+            val not_share: String = record.getString("share_ornot")
+            "0".equals(not_share)
+        }
+        .map{
+          case record =>
+            singleParse(record)
+      }
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        println("写入数据量:" + odpsData.count())
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def singleParse(record: Record): String = {
+    //1 Build the label
+    val label: String = record.getString("return_ornot")
+    val newLabel = if ("1".equals(label)) "0" else "1"
+    //2 Build the features
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    reqContext.putItemFeature(record)
+    reqContext.putSceneFeature(record)
+    val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractor()
+    bytesFeatureExtractor.makeFeature(reqContext.featureMap)
+    val featureMap = bytesFeatureExtractor.featureMap
+    newLabel + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+  }
+}

+ 106 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev2.scala

@@ -0,0 +1,106 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractor, RequestContextOffline}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+object makedata_04_rosHdfsFromTablev2 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/ros_sample_v2/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .filter{
+          case record =>
+            val not_share: String = record.getString("share_ornot")
+            "0".equals(not_share)
+        }
+        .flatMap(record =>{
+          val res = ArrayBuffer[(Record, String)]()
+          val hour = record.getString("ctx_hour").toInt
+          hour match {
+            case 23 => res
+            case _ =>
+              res.add((record, "0"))
+              val label_return = record.getString("return_ornot")
+              val expTs = record.getString("view_logtimestamp").toLong / 1000
+              if ("0".equals(label_return)) {
+                if (!record.isNull("machinecode_clienttimestamp")) {
+                  record.getString("machinecode_clienttimestamp").split(",")
+                    .map(r => r.split(":")(1).toLong / 1000)
+                    .foreach(ts=>{
+                      if (ts - expTs < 3600){
+                        res.add((record, "1"))
+                      }
+                    })
+                }
+              }
+              res
+          }
+        })
+        .map{
+          case (record, label) =>
+            singleParse(record, label)
+      }
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        println("写入数据量:" + odpsData.count())
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def singleParse(record: Record, label: String): String = {
+    //2 Build the features
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    reqContext.putItemFeature(record)
+    reqContext.putSceneFeature(record)
+    val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractor()
+    bytesFeatureExtractor.makeFeature(reqContext.featureMap)
+    val featureMap = bytesFeatureExtractor.featureMap
+    label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+  }
+}
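
The flatMap in makedata_04_rosHdfsFromTablev2 drops rows with ctx_hour 23, emits one negative ("0") per remaining view, and adds one positive ("1") for every return whose client timestamp lands within 3,600 seconds of the exposure timestamp. A small worked sketch of that labeling rule with made-up timestamps:

    object RosLabelExample {
      // One negative per exposure, plus one positive per return inside the hour after it.
      def labels(expTsSec: Long, returnTsSec: Seq[Long]): Seq[String] =
        "0" +: returnTsSec.filter(ts => ts - expTsSec < 3600).map(_ => "1")

      def main(args: Array[String]): Unit = {
        // exposure at t=1000s; returns at t=1500s (kept) and t=9000s (outside the hour, dropped)
        println(labels(1000L, Seq(1500L, 9000L))) // List(0, 1)
      }
    }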

+ 43 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_05_sampleStatic.scala

@@ -0,0 +1,43 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, ParamUtils, env}
+import examples.dataloader.RecommRedisFeatureConstructor
+import org.apache.spark.aliyun.odps.OdpsOps
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+
+
+object makedata_05_sampleStatic {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val path = param.getOrElse("path", "")
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      val hdfsPath = path + "/" + partition
+      println("数据路径:" + hdfsPath)
+      val data = sc.textFile(hdfsPath).map(r =>{
+        (r.split("\t")(0), 1)
+      }).reduceByKey{
+        case (a, b) => a + b
+      }
+      data.collect().foreach(r=> println(r._1 + "\t" + r._2))
+    }
+
+  }
+
+}

+ 257 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData.scala

@@ -0,0 +1,257 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorUserFeature
+import examples.extractor.RankExtractorItemFeature
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import java.util.{Arrays, HashMap, HashSet, Map}
+import com.alibaba.fastjson.JSONObject
+
+/*
+   Note: for every constructed feature, a raw value of 0.0 is treated as meaningless and is not kept; if 0.0 is produced by a change transform (e.g. cntFeatureChange), it is kept.
+ */
+
+object makedata_06_originData {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "32").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/00_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample_v2")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val originFeatureName = Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+
+          val itemRealtimeFeatureMap = getFeatureFromSet(Set(
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          ), record).map(r => {
+            val m = new java.util.HashMap[String, Double]()
+            r._2.split(",").foreach(r => {
+              m.put(r.split(":")(0), r.split(":")(1).toDouble)
+            })
+            (r._1, m)
+          })
+          val javaMap = new HashMap[String, Map[String, java.lang.Double]]()
+          itemRealtimeFeatureMap.foreach { case (key, value) =>
+            val javaValue = new HashMap[String, java.lang.Double]()
+            value.foreach { case (innerKey, innerValue) =>
+              javaValue.put(innerKey, innerValue.asInstanceOf[java.lang.Double])
+            }
+            javaMap.put(key, javaValue)
+          }
+
+          val f1 = getFeatureFromSet(Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "title", "tags"
+          ), record)
+          val f2 = RankExtractorUserFeature.getUserRateFeature(originFeatureMap)
+          val f3 = RankExtractorUserFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+              "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"))
+          )
+          val f4 = RankExtractorItemFeature.getItemRateFeature(originFeatureMap)
+          val f5 = RankExtractorItemFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+              "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt")))
+          val f6 = RankExtractorItemFeature.getItemRealtimeTrend(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", ""))
+          val f7 = RankExtractorItemFeature.getItemRealtimeCnt(javaMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )),
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val f8 = RankExtractorItemFeature.getItemRealtimeRate(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+
+          // 1: aggregate all feature maps into one map
+          val result = new util.HashMap[String, String]()
+          result ++= f1
+          result ++= f2
+          result ++= f3
+          result ++= f4
+          result ++= f5
+          result ++= f6
+          result ++= f7
+          result ++= f8
+          val names = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+
+
+          )
+          val resultNew = new JSONObject
+          names.foreach(r => {
+            if (result.containsKey(r)){
+              resultNew.put(r, result.get(r))
+            }
+          })
+          //2: aggregate the labels into a map
+          val labels = Set(
+            "is_share", "is_return", "playtime",
+            "is_play",
+            "share_ts", "share_ts_list", "return_mid_ts_list"
+          )
+          val labelNew = new JSONObject
+          val labelMap = getFeatureFromSet(labels, record)
+          labels.foreach(r => {
+            if (labelMap.containsKey(r)) {
+              labelNew.put(r, labelMap.get(r).get)
+            }
+          })
+          //3: build the unique key for this record
+          val mid = record.getString("mid")
+          val videoid = record.getString("videoid")
+          val logtimestamp = record.getString("logtimestamp")
+          val sessionid = record.getString("sessionid")
+
+          val logKey = (mid, videoid, logtimestamp, sessionid).productIterator.mkString(":")
+          val labelKey = labelNew.toString()
+          val featureKey = resultNew.toString()
+
+          logKey + "\t" + labelKey + "\t" + featureKey
+        })
+
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def getFeatureFromSet(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r =>{
+      if (!record.isNull(r)){
+        try{
+          result.put(r, record.getString(r))
+        }catch {
+          case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+}
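
Every line that makedata_06_originData writes has three tab-separated fields: a colon-joined key (mid:videoid:logtimestamp:sessionid), a JSON object of labels, and a JSON object of features. A minimal sketch, using the same fastjson dependency, of splitting a line back apart (the reader itself is hypothetical):

    import com.alibaba.fastjson.{JSON, JSONObject}

    object OriginSampleReader {
      // line = logKey + "\t" + labelJson + "\t" + featureJson
      def parse(line: String): (Array[String], JSONObject, JSONObject) = {
        val Array(logKey, labelJson, featureJson) = line.split("\t", 3)
        (logKey.split(":"), JSON.parseObject(labelJson), JSON.parseObject(featureJson))
      }
    }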

+ 260 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData_v3.scala

@@ -0,0 +1,260 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.{RankExtractorItemFeature, RankExtractorUserFeature}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.{HashMap, Map}
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+
+/*
+   Note: for every constructed feature, a raw value of 0.0 is treated as meaningless and is not kept; if 0.0 is produced by a change transform (e.g. cntFeatureChange), it is kept.
+   => Any feature that cannot be obtained defaults to 0.
+ */
+
+object makedata_06_originData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "32").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/00_sample_data_v3/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample_v3")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val originFeatureName = Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            "video_recommend"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+
+          val itemRealtimeFeatureMap = getFeatureFromSet(Set(
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          ), record).map(r => {
+            val m = new java.util.HashMap[String, Double]()
+            r._2.split(",").foreach(r => {
+              m.put(r.split(":")(0), r.split(":")(1).toDouble)
+            })
+            (r._1, m)
+          })
+          val javaMap = new HashMap[String, Map[String, java.lang.Double]]()
+          itemRealtimeFeatureMap.foreach { case (key, value) =>
+            val javaValue = new HashMap[String, java.lang.Double]()
+            value.foreach { case (innerKey, innerValue) =>
+              javaValue.put(innerKey, innerValue.asInstanceOf[java.lang.Double])
+            }
+            javaMap.put(key, javaValue)
+          }
+
+          val f1 = getFeatureFromSet(Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "title", "tags", "video_recommend"
+          ), record)
+          val f2 = RankExtractorUserFeature.getUserRateFeature(originFeatureMap)
+          val f3 = RankExtractorUserFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+              "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"))
+          )
+          val f4 = RankExtractorItemFeature.getItemRateFeature(originFeatureMap)
+          val f5 = RankExtractorItemFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+              "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt")))
+          val f6 = RankExtractorItemFeature.getItemRealtimeTrend(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", ""))
+          val f7 = RankExtractorItemFeature.getItemRealtimeCnt(javaMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )),
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val f8 = RankExtractorItemFeature.getItemRealtimeRate(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+
+          // 1: aggregate all extracted feature maps into one map
+          val result = new util.HashMap[String, String]()
+          result ++= f1
+          result ++= f2
+          result ++= f3
+          result ++= f4
+          result ++= f5
+          result ++= f6
+          result ++= f7
+          result ++= f8
+          val names = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total", "video_recommend",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+
+
+          )
+          val resultNew = new JSONObject
+          names.foreach(r => {
+            if (result.containsKey(r)){
+              resultNew.put(r, result.get(r))
+            }
+          })
+          // 2: aggregate the labels into a map
+          val labels = Set(
+            "pagesource", "recommend_page_type", "pagesource_change",
+            "abcode",
+            "is_play", "playtime",
+            "is_share", "share_cnt_pv", "share_ts_list",
+            "is_return", "return_cnt_pv", "return_cnt_uv", "return_mid_ts_list"
+          )
+          val labelNew = new JSONObject
+          val labelMap = getFeatureFromSet(labels, record)
+          labels.foreach(r => {
+            if (labelMap.containsKey(r)) {
+              labelNew.put(r, labelMap.get(r).get)
+            }
+          })
+          // 3: build the unique key for this record
+          val mid = record.getString("mid")
+          val videoid = record.getString("videoid")
+          val logtimestamp = record.getString("logtimestamp")
+          val apptype = record.getString("apptype")
+
+          val logKey = (mid, videoid, logtimestamp, apptype).productIterator.mkString(":")
+          val labelKey = labelNew.toString()
+          val featureKey = resultNew.toString()
+
+          logKey + "\t" + labelKey + "\t" + featureKey
+        })
+
+
+      // 4 save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def getFeatureFromSet(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r =>{
+      if (!record.isNull(r)){
+        try{
+          result.put(r, record.getString(r))
+        } catch {
+          case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+}
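
Each row this job writes is three tab-separated fields: a log key of the form mid:videoid:logtimestamp:apptype, a label JSON, and a feature JSON. A REPL-style sketch of splitting one such row back apart, the same way the makedata_07_* jobs below do (the sample values are invented; fastjson is assumed on the classpath, as in the jobs themselves):

    import com.alibaba.fastjson.JSON

    // One output line: logKey \t labelJson \t featureJson (values below are made up)
    val line = "mid123:vid456:1718000000000:3\t{\"is_share\":\"1\"}\t{\"ctx_hour\":\"12\"}"
    val Array(logKey, labelStr, feaStr) = line.split("\t")
    val logTs = logKey.split(":")(2)                    // exposure timestamp (ms) from the key
    val labelJson = JSON.parseObject(labelStr)          // labels such as is_share / is_return
    val feaJson = JSON.parseObject(feaStr)              // selected features, keyed by name
    println(s"ts=$logTs share=${labelJson.getString("is_share")} hour=${feaJson.getString("ctx_hour")}")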

+ 243 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_rosData.scala

@@ -0,0 +1,243 @@
+package com.aliyun.odps.spark.examples.makedata
+
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractorV1, OfflineVlogShareLRFeatureExtractorV2}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import com.alibaba.fastjson.JSON
+import com.alibaba.fastjson.JSONObject
+
+
+object makedata_07_rosData {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/00_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/04_ros_data/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+
+
+    // 3 loop over the date range and produce data
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      // 4 filter: keep only samples with a share
+      val dataFilter = sc.textFile(hdfsPath).map(r=>{
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val logTs = logKeyStr.split(":")(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val feaJson = JSON.parseObject(feaStr)
+        val is_share = if (labelJson.containsKey("is_share")) labelJson.getString("is_share") else "0"
+        (logTs, feaJson, labelJson, is_share)
+      }).filter(_._4.equals("1"))
+
+      // 5 label processing
+      val dataTrain = labelVersion match {
+        case "v2" => dataFilter.flatMap({
+          case (logTs, feaJson, labelJson, _) =>
+            val res = ArrayBuffer[(String, JSONObject)]()
+            val hour = feaJson.getString("ctx_hour").toInt
+            val expTs = logTs.toLong / 1000
+            hour match {
+              case 23 => res
+              case _ =>
+                res.add(("0", feaJson))
+                val is_return = if (labelJson.containsKey("is_return")) labelJson.getString("is_return") else "0"
+                if ("1".equals(is_return)) {
+                  if (labelJson.containsKey("return_mid_ts_list")){
+                    labelJson.getString("return_mid_ts_list").split(",")
+                      .map(r => r.split(":")(1).toLong / 1000)
+                      .foreach(ts => {
+                        if (ts - expTs < 3600) {
+                          res.add(("1", feaJson))
+                        }
+                      })
+                  }
+                }
+                res
+            }
+        })
+        case _ => dataFilter.map({
+          case (logTs, feaJson, labelJson, _) =>
+            val is_return = if (labelJson.containsKey("is_return")) labelJson.getString("is_return") else "0"
+            (is_return, feaJson)
+        })
+      }
+      // 6 feature selection
+      val data = dataTrain.map{
+        case (is_return, feaJson) =>
+          if ("v1".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV1()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+
+          } else if ("v2".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+              //            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+              //            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+              //            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+              //            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+              "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+              "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+          } else if ("v4".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+              "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+              "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+          } else if ("v5".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+          } else {
+            (is_return, "")
+          }
+      }.filter(_._2.nonEmpty).map(r=> r._1 + "\t" + r._2)
+
+      // 7 save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
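
With labelVersion v2 above, a share sample contributes one negative copy plus one positive copy for every entry in return_mid_ts_list whose return lands within one hour of the exposure timestamp, while hour-23 exposures are skipped. A REPL-style sketch of just that windowing rule, with invented timestamps:

    // return_mid_ts_list format: "mid:timestamp_ms,mid:timestamp_ms,..."
    val expTs = 1718000000L                                  // exposure time, seconds
    val returnMidTsList = "abc:1718001800000,def:1718009000000"
    val positives = returnMidTsList.split(",")
      .map(_.split(":")(1).toLong / 1000)                    // return time, seconds
      .count(ts => ts - expTs < 3600)                        // inside the 1-hour window
    println(s"positive copies emitted: $positives")          // 1 for these values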

+ 202 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_strData.scala

@@ -0,0 +1,202 @@
+package com.aliyun.odps.spark.examples.makedata
+
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import com.alibaba.fastjson.JSON
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV2
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV1
+
+
+object makedata_07_strData {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/00_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/01_str_data/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "100").toInt
+
+
+
+    // 3 loop over the date range and produce data
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+      val data = sc.textFile(hdfsPath).map(r=>{
+        val rList = r.split("\t")
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val label = if (labelJson.containsKey("is_share")) labelJson.getString("is_share") else "0"
+        val feaJson = JSON.parseObject(feaStr)
+
+
+        if ("v1".equals(featureVersion)){
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV1()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+        }else if ("v2".equals(featureVersion)){
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+//            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+//            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+//            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+//            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+        }else if ("v4".equals(featureVersion)){
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+        } else if ("v5".equals(featureVersion)) {
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+        } else {
+          // unknown featureVersion: emit an empty line and filter it out below
+          ""
+        }
+      }).filter(_.nonEmpty)
+      // 4 save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+
+
+}
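
Both makedata_07 jobs serialize a training sample as the label, a tab, then the extractor's one-hot feature identifiers written as id:1 and joined by tabs. A REPL-style sketch of reading such a line back (the identifiers below are made up, not real extractor hashes):

    // Training line layout: label \t featureId:1 \t featureId:1 ...
    val line = "1\t1001:1\t2047:1\t77:1"
    val parts = line.split("\t")
    val label = parts.head.toInt
    val featureIds = parts.tail.map(_.split(":")(0).toLong)  // every value is the constant indicator 1
    println(s"label=$label, active features=${featureIds.mkString(",")}")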

+ 140 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_08_item2redis.scala

@@ -0,0 +1,140 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.Date
+import java.util.concurrent.TimeUnit
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+
+
+object makedata_08_item2redis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "2").toInt
+    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
+    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val savePathVideo = param.getOrDefault("savePathVideo", "")
+    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
+    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
+
+
+    // 2 read from the ODPS tables
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableItem = "alg_recsys_video_info"
+    val videoRedisKeyPrefix = "video_info_"
+
+    // 4 video-side feature processing
+    if (ifVideo){
+      println("video特征处理")
+      val itemData = odpsOps.readTable(project = project, table = tableItem, partition = partition, transfer = func, numPartition = tablePart)
+
+      val itemDataTakeRddRun = itemData.map(record =>{
+        val originFeatureName = Set(
+          "gmt_create", "existence_days",
+          "title", "tags", "total_time", "play_count_total",
+          "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+          "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+          "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+          "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt"
+        )
+//        val myList: List[(String, String)] = List(("value1", "value2"), ("value3", "value4"))
+        val originFeatureMap = getFeatureFromRecord(originFeatureName, record)
+        val videoid = record.getBigint("videoid").toString
+        val resultNew = new JSONObject
+        originFeatureName.foreach(r => {
+          if (originFeatureMap.containsKey(r)) {
+            val v = originFeatureMap.get(r).get
+            resultNew.put(r, v)
+          }
+        })
+        (videoid, resultNew.toString())
+      }).mapPartitions(row => {
+          val redisFormat = new util.HashMap[String, String]
+          val redisFormatSave = new util.HashMap[String, String]
+          val redisTemplate = env.getRedisTemplate()
+          var i = 1
+          row.foreach {
+            case (key, value) =>
+              if (key.nonEmpty && value != null && value.nonEmpty) {
+                redisFormat.put(videoRedisKeyPrefix + key, value)
+                redisFormatSave.put(videoRedisKeyPrefix + key, value)
+              }
+              if (i % 1000 == 0 && ifWriteRedis) {
+                redisTemplate.opsForValue.multiSet(redisFormat)
+                redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+                redisFormat.clear()
+              }
+              i = i + 1
+          }
+          if (ifWriteRedis){
+            redisTemplate.opsForValue.multiSet(redisFormat)
+            redisFormat.keySet.foreach(key => redisTemplate.expire(key, 24 * expireDay, TimeUnit.HOURS))
+            redisFormat.clear()
+          }
+          redisFormatSave.iterator
+      })
+      if (savePathVideo.nonEmpty && savePathVideo.startsWith("/dw/recommend/model/")){
+        val savePathPart = savePathVideo + "/" + partition
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        itemDataTakeRddRun.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("item写入成功:item.action.count=" + itemDataTakeRddRun.count())
+    }else{
+      println("不处理video")
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def getFeatureFromRecord(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r => {
+      if (!record.isNull(r)) {
+        val obj = record.get(r)
+        if (obj.isInstanceOf[String]){
+          result.put(r, record.getString(r))
+        } else if (obj.isInstanceOf[java.lang.Long]) { // ODPS BIGINT values are java.lang.Long; scala BigInt never matched here
+          result.put(r, String.valueOf(record.getBigint(r)))
+        } else if (obj.isInstanceOf[Double]) {
+          result.put(r, String.valueOf(record.getDouble(r)))
+        } else if (obj.isInstanceOf[Date]) {
+          result.put(r, String.valueOf(record.getDatetime(r)))
+        } else {
+          try {
+            result.put(r, record.getString(r))
+          } catch {
+            case _ => result.put(r, String.valueOf(record.getBigint(r)))
+            case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+
+}
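
The Redis write here, and in the user jobs below, repeats one pattern inside mapPartitions: buffer key/value pairs, flush them with multiSet every 1000 rows, give each flushed key a TTL of 24 * expireDay hours, and flush the remainder once the partition is exhausted. A minimal sketch of that flush step; flushBatch is a hypothetical helper, and typing the template as a Spring RedisTemplate[String, String] is an assumption (only the multiSet/expire call names are taken from the code above):

    import java.util
    import java.util.concurrent.TimeUnit
    import org.springframework.data.redis.core.RedisTemplate
    import scala.collection.JavaConversions._

    // Flush one buffered batch to Redis and attach the expiry, as the mapPartitions blocks above do.
    def flushBatch(redisTemplate: RedisTemplate[String, String],
                   batch: util.HashMap[String, String],
                   expireDay: Int): Unit = {
      if (!batch.isEmpty) {
        redisTemplate.opsForValue.multiSet(batch)
        batch.keySet.foreach(k => redisTemplate.expire(k, 24 * expireDay, TimeUnit.HOURS))
        batch.clear()
      }
    }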

+ 220 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis.scala

@@ -0,0 +1,220 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import com.google.gson.GsonBuilder
+import examples.dataloader.RequestContextOffline
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import com.aliyun.odps.spark.examples.makedata.makedata_06_originData.getFeatureFromSet
+import com.alibaba.fastjson.JSONObject
+
+import java.util
+import java.util.concurrent.TimeUnit
+import scala.collection.JavaConversions._
+
+
+object makedata_09_user2redis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "2").toInt
+    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
+    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val savePathVideo = param.getOrDefault("savePathVideo", "")
+    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
+    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
+    val midDays = param.getOrDefault("midDays", "3").toInt
+
+
+    // 2 read from the ODPS tables
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableUser = "alg_recsys_user_info"
+    val userRedisKeyPrefix = "user_info_4video_"
+
+
+
+    // 3-1 user-side feature processing
+    if (ifUser){
+      println("user特征处理")
+
+
+      var userData = odpsOps.readTable(project = project, table = tableUser, partition = partition,
+        transfer = func, numPartition = tablePart)
+        .map(record =>{
+          val userKey = "mids"
+          val mid = record.getString(userKey)
+          val originFeatureName = Set(
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+//            "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+          val resultNew = new JSONObject
+          originFeatureName.foreach(r => {
+            if (originFeatureMap.containsKey(r)) {
+              val v = originFeatureMap.get(r).get
+              resultNew.put(r, v)
+            }
+          })
+          (mid, resultNew.toString())
+        })
+//      userData = userData.join(midRdd.map(r=> (r, 1))).map(r=> (r._1, r._2._1))
+
+      if (userSampleIDs.nonEmpty){
+        val IDs = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
+        userData = userData.filter(r => IDs.contains(Math.floorMod(r._1.hashCode, 10))) // floorMod keeps the bucket non-negative even for negative hashCodes
+      }
+      if (ifDebug){
+        println("user特征处理-debug开启-只保留5条数据-特征数量大于1")
+        val userDataTake = userData.take(5)
+        userDataTake.foreach(r=> println(r._1 + "\t" + r._2))
+        userData = sc.parallelize(userDataTake)
+      }
+      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+        var savePathPart = savePathUser + "/" + partition
+        if (userSampleIDs.nonEmpty) {
+          savePathPart = savePathPart + "_" + userSampleIDs
+        }
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("user.action.count=" + userData.count())
+    } else {
+      println("不处理user")
+    }
+
+    if (ifDeleteRedisUser){
+      println("user redis 删除")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      println("读取数据路径:" + savePathPart)
+      val userDataRead = sc.textFile(savePathPart)
+      val userDataRead2 = userDataRead.filter(_.split("\t").length >= 2).map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+      println("预计删除数据量:" + userDataRead2.count())
+      val userDataTakeRddRun = userDataRead2.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.delete(redisFormat.map(_._1))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.delete(redisFormat.map(_._1))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("delete redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis删除")
+    }
+
+    if (ifWriteRedisUser){
+
+      // 3-2 collect the mids with play behavior in the last midDays days
+      var midRdd = sc.emptyRDD[String]
+      MyDateUtils.getDateRange(MyDateUtils.getNumDaysBefore(date, midDays), date).foreach(d => {
+        println("-----------读取播放信息:" + d)
+        val partitionMid = "dt=" + d
+        val data = odpsOps.readTable(project = "loghubods", table = "play_action_log",
+            partition = partitionMid, transfer = func, numPartition = tablePart)
+          .map(r => {
+            if (r.isNull("machinecode")) "" else r.getString("machinecode")
+          }).filter(_.nonEmpty)
+        midRdd = midRdd.union(data).distinct()
+      })
+      println("------------mid处理完毕:" + midRdd.count() + "------------------")
+
+
+      println("user redis 写入")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+        .sample(false, sampleRate)
+        .map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+        .join(midRdd.map(r=> (r, 1))).map(r=> (r._1, r._2._1))
+
+      val userDataTakeRddRun = userDataRead.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.opsForValue.multiSet(redisFormat)
+              redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.opsForValue.multiSet(redisFormat)
+        redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("user写入成功:put in redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis写入")
+    }
+
+
+
+  }
+
+  def handleUser(record: Record, schema: TableSchema): Tuple3[String, String, Int] = {
+    val userKey = "mids"
+    val mid = record.getString(userKey)
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    // reqContext.featureMap.put("mid", mid)
+    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
+    val value = gson.toJson(reqContext.featureMap)
+    (mid, value, reqContext.featureMap.size())
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
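
userSampleIDs above samples users by hash bucket: a mid is kept when its (non-negative) hashCode modulo 10 appears in the comma-separated list. A REPL-style sketch with invented mids:

    // userSampleIDs such as "0,3,7" keeps only the listed buckets.
    val userSampleIDs = "0,3,7"
    val ids = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
    val mids = Seq("mid_a", "mid_b", "mid_c")                 // hypothetical mids
    val kept = mids.filter(m => ids.contains(Math.floorMod(m.hashCode, 10)))
    println(kept.mkString(","))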

+ 167 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis_freq.scala

@@ -0,0 +1,167 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.makedata.makedata_06_originData.getFeatureFromSet
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import com.google.gson.GsonBuilder
+import examples.dataloader.RequestContextOffline
+import org.apache.commons.lang.time.DateUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.concurrent.TimeUnit
+import scala.collection.JavaConversions._
+
+
+object makedata_09_user2redis_freq {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+    MyHdfsUtils.delete_hdfs_path("/dw/recommend/model/99_zhangbo_checkpoint/")
+    sc.setCheckpointDir("/dw/recommend/model/99_zhangbo_checkpoint/")
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "3").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val midDays = param.getOrDefault("midDays", "7").toInt
+    val redisLimit = param.getOrDefault("redisLimit", "100000000").toLong
+
+    // 2 read from the ODPS tables
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableUser = "alg_recsys_user_info"
+    val userRedisKeyPrefix = "user_info_4video_"
+
+
+
+    if (ifUser){
+      // 3 feature processing
+      println("user特征处理")
+      val userData = odpsOps.readTable(project = project, table = tableUser, partition = partition,
+          transfer = func, numPartition = tablePart)
+        .map(record => {
+          val mid = record.getString("mids")
+          val originFeatureName = Set(
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+            //"gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+          val resultNew = new JSONObject
+          originFeatureName.foreach(r => {
+            if (originFeatureMap.containsKey(r)) {
+              val v = originFeatureMap(r)
+              resultNew.put(r, v)
+            }
+          })
+          (mid, resultNew.toString())
+        })
+      // 3 save the raw feature file
+      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+        val savePathPart = savePathUser + "/all/" + partition
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+    }
+
+
+    // 4 recent-user statistics
+    val dateEarly = MyDateUtils.getNumDaysBefore(date, 0)
+    val midRdd = odpsOps.readTable(project = "loghubods", table = "mid_uid",
+        partition = "dt=" + dateEarly, transfer = func, numPartition = tablePart)
+      .map(r => {
+        val mid = if (r.isNull("mid")) "" else r.getString("mid")
+        val actionTs = if (r.isNull("user_last_action_time")) "" else r.getString("user_last_action_time")
+        (mid, actionTs)
+      }).filter(r => r._1.nonEmpty && r._2.nonEmpty)
+      .reduceByKey((a, b) => Math.max(a.toLong, b.toLong).toString)
+      .filter(r => DateUtils.parseDate(date, Array[String]("yyyyMMdd")).getTime / 1000 - r._2.toLong / 1000 < 3600 * 24 * midDays)
+    println("------------mid处理完毕,近期保留的用户有:" + midRdd.count() + "------------------")
+    // 5 split users by recent activity
+    val savePathPart = savePathUser + "/all/" + partition
+    val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      }).join(midRdd).map(r => (r._1, r._2._1))
+    userDataRead.checkpoint()
+//      .leftOuterJoin(midRdd).map {
+//        case (mid, (fea, Some(_))) =>
+//          (mid, fea, true)
+//        case (mid, (fea, None)) =>
+//          (mid, fea, false)
+//      }
+    val userDataReadTrue = userDataRead.map(r => r._1 + "\t" + r._2)
+    // val userDataReadFalse = userDataRead.filter(!_._3).map(r => r._1 + "\t" + r._2)
+    if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+      val p1 = savePathUser + "/true/" + partition
+      MyHdfsUtils.delete_hdfs_path(p1)
+      userDataReadTrue.saveAsTextFile(p1, classOf[GzipCodec])
+      //val p2 = savePathUser + "/false/" + partition
+      //MyHdfsUtils.delete_hdfs_path(p2)
+      //userDataReadFalse.saveAsTextFile(p2, classOf[GzipCodec])
+    }
+
+    //6 redis
+    if (ifWriteRedisUser) {
+      println("开始处理redis写入")
+      val p1 = savePathUser + "/true/" + partition
+      val userDataRead = sc.textFile(p1).filter(_.split("\t").length >= 2)
+        .map(r => {
+          val rList = r.split("\t")
+          (rList(0), rList(1))
+        })
+      val count = userDataRead.count()
+      println("待写入数据有:" + count)
+      if (count > redisLimit) {
+        println(s"数据量超过${redisLimit},不执行写入。")
+      } else {
+        val userDataTakeRddRun = userDataRead.mapPartitions(row => {
+          val redisFormat = new util.HashMap[String, String]
+          val redisTemplate = env.getRedisTemplate()
+          var i = 1
+          row.foreach {
+            case (key, value) =>
+              if (key.nonEmpty) {
+                redisFormat.put(userRedisKeyPrefix + key, value)
+              }
+              if (i % 1000 == 0) {
+                redisTemplate.opsForValue.multiSet(redisFormat)
+                redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+                redisFormat.clear()
+              }
+              i = i + 1
+          }
+          redisTemplate.opsForValue.multiSet(redisFormat)
+          redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+          redisFormat.clear()
+          redisFormat.iterator
+        })
+        println("user写入成功:put in redis.count=" + userDataTakeRddRun.count())
+      }
+    }
+  }
+
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
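
The freshness filter above keeps a mid only when its last action time is within midDays of the partition date. A REPL-style check of that comparison with invented values (DateUtils is the commons-lang class the job already imports):

    import org.apache.commons.lang.time.DateUtils

    val date = "20240601"                                     // partition date, yyyyMMdd
    val lastActionMs = 1717000000000L                         // hypothetical user_last_action_time (ms)
    val midDays = 7
    val dateSec = DateUtils.parseDate(date, Array[String]("yyyyMMdd")).getTime / 1000
    val keep = dateSec - lastActionMs / 1000 < 3600L * 24 * midDays
    println(keep)                                             // true: the last action is only days old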

+ 244 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_10_originData_v3.scala

@@ -0,0 +1,244 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.{RankExtractorItemFeatureV2, RankExtractorUserFeatureV2}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.{HashMap, Map}
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+
+/*
+   Any feature that cannot be obtained falls back to a default value of 0.
+ */
+
+object makedata_10_originData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "32").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/10_sample_data_v3/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample_v3")
+
+
+    // 2 read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 loop over the date range and produce data
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          // 1 fetch all raw features into a hashmap; if a value is null in the table, the key is absent from the map.
+          val originFeatureName = Set(
+            "apptype","mid","uid","videoid","logtimestamp","ctx_day","ctx_week","ctx_hour","clientip","ctx_region",
+            "ctx_city","pagesource","recommend_page_type","pagesource_change","abcode",
+            // ----------
+            "playtime","is_play","share_cnt_pv","is_share","share_ts_list","return_cnt_pv","return_cnt_uv","return_mid_ts_list","is_return",
+            // ----------
+
+            // ----------
+            "gender","machineinfo_brand","machineinfo_model","machineinfo_platform","machineinfo_sdkversion","machineinfo_system","machineinfo_wechatversion","gmt_create_user",
+            "u_1day_exp_cnt","u_1day_click_cnt","u_1day_share_cnt","u_1day_return_cnt",
+            "u_3day_exp_cnt","u_3day_click_cnt","u_3day_share_cnt","u_3day_return_cnt",
+            "u_7day_exp_cnt","u_7day_click_cnt","u_7day_share_cnt","u_7day_return_cnt",
+            "u_3month_exp_cnt","u_3month_click_cnt","u_3month_share_cnt","u_3month_return_cnt",
+            // ----------
+            "title","distrubute_title","gmt_create_video","tags","existence_days","total_time","play_count","play_count_total","video_recommend",
+            "i_1day_exp_cnt","i_1day_click_cnt","i_1day_share_cnt","i_1day_return_cnt",
+            "i_3day_exp_cnt","i_3day_click_cnt","i_3day_share_cnt","i_3day_return_cnt",
+            "i_7day_exp_cnt","i_7day_click_cnt","i_7day_share_cnt","i_7day_return_cnt",
+            "i_3month_exp_cnt","i_3month_click_cnt","i_3month_share_cnt","i_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+          // 2 compute the day-level rate features.
+          val f2 = RankExtractorUserFeatureV2.getUserRateFeature(originFeatureMap)
+          val f4 = RankExtractorItemFeatureV2.getItemRateFeature(originFeatureMap)
+          // 3 compute the item realtime features: parse the serialized lists first, then compute.
+          val itemRealtimeFeatureMap = getFeatureFromSet(Set(
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day", "share_pv_list_1day",
+            "share_uv_list_1day", "return_uv_list_1day", "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          ), record).map(r => {
+            val m = new java.util.HashMap[String, Double]()
+            r._2.split(",").foreach(r => {
+              m.put(r.split(":")(0), r.split(":")(1).toDouble)
+            })
+            (r._1, m)
+          })
+          val javaMap = new HashMap[String, Map[String, java.lang.Double]]()
+          itemRealtimeFeatureMap.foreach { case (key, value) =>
+            val javaValue = new HashMap[String, java.lang.Double]()
+            value.foreach { case (innerKey, innerValue) =>
+              javaValue.put(innerKey, innerValue.asInstanceOf[java.lang.Double])
+            }
+            javaMap.put(key, javaValue)
+          }
+          val f6 = RankExtractorItemFeatureV2.getItemRealtimeTrend(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", ""))
+          val f7 = RankExtractorItemFeatureV2.getItemRealtimeCnt(javaMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+              // ----------
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )),
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val f8 = RankExtractorItemFeatureV2.getItemRealtimeRate(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val result = new util.HashMap[String, String]()
+          result ++= originFeatureMap
+          result ++= f2
+          result ++= f4
+          result ++= f6
+          result ++= f7
+          result ++= f8
+          val names = Set(
+            "apptype", "mid", "uid", "videoid", "logtimestamp", "ctx_day", "ctx_week", "ctx_hour", "clientip", "ctx_region",
+            "ctx_city", "pagesource", "recommend_page_type", "pagesource_change", "abcode",
+            // ----------
+            "playtime", "is_play", "share_cnt_pv", "is_share", "share_ts_list", "return_cnt_pv", "return_cnt_uv", "return_mid_ts_list", "is_return",
+            // ----------
+
+            // ----------
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion", "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "title", "distrubute_title", "gmt_create_video", "tags", "existence_days", "total_time", "play_count", "play_count_total", "video_recommend",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ---------- rate
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ---------- rate
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+            // ---------- rate
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val resultNew = new JSONObject
+          names.foreach(r => {
+            if (result.containsKey(r)) {
+              resultNew.put(r, result.get(r))
+            }
+          })
+
+          //4 Assemble the label info.
+          val labels = Set(
+            "pagesource", "recommend_page_type", "pagesource_change",
+            "abcode",
+            "is_play", "playtime",
+            "is_share", "share_cnt_pv", "share_ts_list",
+            "is_return", "return_cnt_pv", "return_cnt_uv", "return_mid_ts_list"
+          )
+          val labelNew = new JSONObject
+          val labelMap = getFeatureFromSet(labels, record)
+          labels.foreach(r => {
+            if (labelMap.containsKey(r)) {
+              labelNew.put(r, labelMap(r))
+            }
+          })
+          //5 Build the log key header.
+          val mid = record.getString("mid")
+          val videoid = record.getString("videoid")
+          val logtimestamp = record.getString("logtimestamp")
+          val apptype = record.getString("apptype")
+          val pagesource_change = record.getString("pagesource_change")
+          val abcode = record.getString("abcode")
+          val video_recommend = if (!record.isNull("video_recommend")) record.getString("video_recommend") else "111"
+
+          val logKey = (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend).productIterator.mkString(":")
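+          // Colon-joined 7-field log key: mid:videoid:logtimestamp:apptype:pagesource_change:abcode:video_recommend.
+          // The makedata_11/12_* jobs below destructure this field via ParamUtils.parseLogKey, so the order must stay in sync.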
+          val labelKey = labelNew.toString()
+          val featureKey = resultNew.toString()
+          //6 Concatenate the parts and save.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
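+  // Reads the requested columns from the ODPS Record as strings, falling back to getBigint for non-string
+  // columns; columns that are NULL are omitted from the returned map.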
+  def getFeatureFromSet(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r =>{
+      if (!record.isNull(r)){
+        try{
+          result.put(r, record.getString(r))
+        }catch {
+          case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+}

+ 187 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_11_strData_v3.scala

@@ -0,0 +1,187 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV2
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+object makedata_11_strData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/10_sample_data_v3/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/11_str_data_v3/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+    // 3 Produce the data partition by partition
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      //1 Sample filtering (share samples, the 012345 pages, recommendable videos, different apptypes)
+      val data1 = sc.textFile(hdfsPath).map(r => {
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend) = ParamUtils.parseLogKey(logKeyStr)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val is_share = labelJson.getString("is_share")
+        (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
+      }).filter({
+        case (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val pages = Set("2")
+          val video_status = Set("-6")
+          val apps = Set("0", "4", "5", "21", "3", "6")
+          pages.contains(pagesource_change) && video_status.contains(video_recommend) && apps.contains(apptype)
+      })
+
+      //2 Sampling (the STR model does no downsampling; all exposure samples are kept)
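+      // The STR label comes straight from is_share: every exposure is kept and labelled 1 if it produced a share, else 0.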
+      val data2 = data1.map({
+        case (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val feaJson = JSON.parseObject(feaStr)
+          val is_share = labelJson.getString("is_share")
+          if ("0".equals(is_share)){
+            ("0", feaJson)
+          }else{
+            ("1", feaJson)
+          }
+      })
+
+      //3 Print the label ratio of the raw samples as an intermediate check
+      println("Sample ratio")
+      data2.map(r=> (r._1, 1)).reduceByKey(_+_).map(r=> r._1 + "\t" + r._2).collect().foreach(println)
+
+      //4 Discretize absolute feature values, e.g. 0.456 becomes 19
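+      // ceilLogRate/bucketCnt (ExtractorUtils) map continuous rates and counts to integer bucket ids so they can be
+      // one-hot encoded in the libsvm step below; the exact bucketing scheme is defined in ExtractorUtils, and the
+      // 0.456 -> 19 above is only an illustrative example.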
+      val data3 = data2.map({
+        case (label, feaJson) =>
+          Set(
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ----------
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          ).foreach(key =>{
+            if (feaJson.containsKey(key)){
+              val value = ExtractorUtils.ceilLogRate(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          Set(
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "total_time", "play_count", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+
+          ).foreach(key => {
+            if (feaJson.containsKey(key)) {
+              val value = ExtractorUtils.bucketCnt(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          (label, feaJson)
+      })
+      //5 Convert to libsvm format
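+      // Output line format: "label \t id:1 \t id:1 ...", where each id is the identifier that the
+      // OfflineVlogShareLRFeatureExtractorV2 feature map assigns to a name/value pair; every present feature gets weight 1.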
+      val data4 = data3.map({
+        case (label, feaJson) =>
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+      })
+
+      // 7 Save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data4.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data4.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}

+ 215 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3.scala

@@ -0,0 +1,215 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractorV1, OfflineVlogShareLRFeatureExtractorV2}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import examples.extractor.ExtractorUtils
+
+object makedata_12_rosData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/10_sample_data_v3/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/12_ros_data_v3/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+    // 3 Produce the data partition by partition
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      //1 Sample filtering (share samples, the 012345 pages, recommendable videos, different apptypes)
+      val data1 = sc.textFile(hdfsPath).map(r => {
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend) = ParamUtils.parseLogKey(logKeyStr)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val is_share = labelJson.getString("is_share")
+        (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
+      }).filter({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val pages = Set("2")
+          val video_status = Set("-6")
+          val apps = Set("0", "4", "5", "21", "3", "6")
+          "1".equals(is_share) && pages.contains(pagesource_change) && video_status.contains(video_recommend) && apps.contains(apptype)
+      })
+
+      //2 Sampling (duplicate the sample once per qualifying return, equivalent to weighting by return volume)
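+      // Each distinct non-self return mid arriving within one hour of the exposure adds one positive copy of the sample,
+      // so a share with N qualifying returns is duplicated N times, which has the same effect as weighting it by N.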
+      val data2 = data1.flatMap({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val res = ArrayBuffer[(String, JSONObject)]()
+          val feaJson = JSON.parseObject(feaStr)
+          val is_return = labelJson.getString("is_return")
+          if ("0".equals(is_return)){
+            res.add(("0", feaJson))
+          }else{
+            val return_mid_ts_list = labelJson.getString("return_mid_ts_list").split(",").map(r => {
+              val midReturn = r.split(":")(0)
+              val ts = r.split(":")(1).toLong
+              (midReturn, ts)
+            }).filter(!_._1.equals(mid)).sortBy(_._2)
+            // Necessary filter: returns coming from the sharer's own mid are dropped.
+
+            if (return_mid_ts_list.nonEmpty){
+              var flag = true
+              val midSet = scala.collection.mutable.HashSet[String]()
+              for ((midReturn, tsReturn) <- return_mid_ts_list) {
+                if (!midSet.contains(midReturn)) {
+                  midSet.add(midReturn)
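+                  // Count the return only if it happened after the exposure and within 3600s
+                  // (timestamps are assumed to be in milliseconds, hence the /1000).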
+                  if ((tsReturn / 1000 - logtimestamp / 1000) <= 3600 && tsReturn - logtimestamp > 0) {
+                    res.add(("1", feaJson))
+                    flag = false
+                  }
+                }
+              }
+              if (flag) {
+                // If no positive sample was added above, add one negative sample: no return arrived within the hour.
+                res.add(("0", feaJson))
+              }
+            }else {
+              // If nothing is left after removing self-returns, this is a negative sample.
+              res.add(("0", feaJson))
+            }
+          }
+          res.iterator
+      })
+
+      //3 Print the label ratio of the raw samples as an intermediate check
+      println("Sample ratio")
+      data2.map(r=> (r._1, 1)).reduceByKey(_+_).map(r=> r._1 + "\t" + r._2).collect().foreach(println)
+
+      //4 Discretize absolute feature values, e.g. 0.456 becomes 19
+      val data3 = data2.map({
+        case (label, feaJson) =>
+          Set(
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ----------
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          ).foreach(key =>{
+            if (feaJson.containsKey(key)){
+              val value = ExtractorUtils.ceilLogRate(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          Set(
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "total_time", "play_count", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+
+          ).foreach(key => {
+            if (feaJson.containsKey(key)) {
+              val value = ExtractorUtils.bucketCnt(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          (label, feaJson)
+      })
+      //5 Convert to libsvm format
+      val data4 = data3.map({
+        case (label, feaJson) =>
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+      })
+
+      // 7 Save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data4.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data4.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}

+ 216 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3_noweight.scala

@@ -0,0 +1,216 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV2
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+object makedata_12_rosData_v3_noweight {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/10_sample_data_v3/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/12_ros_data_v3_noweight/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+    // 3 Produce the data partition by partition
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      //1 Sample filtering (share samples, the 012345 pages, recommendable videos, different apptypes)
+      val data1 = sc.textFile(hdfsPath).map(r => {
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend) = ParamUtils.parseLogKey(logKeyStr)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val is_share = labelJson.getString("is_share")
+        (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
+      }).filter({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val pages = Set("2")
+          val video_status = Set("-6")
+          val apps = Set("0", "4", "5", "21", "3", "6")
+          "1".equals(is_share) && pages.contains(pagesource_change) && video_status.contains(video_recommend) && apps.contains(apptype)
+      })
+
+      //2 Sampling (at most one positive sample per share, i.e. no return-count weighting)
+      val data2 = data1.flatMap({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val res = ArrayBuffer[(String, JSONObject)]()
+          val feaJson = JSON.parseObject(feaStr)
+          val is_return = labelJson.getString("is_return")
+          if ("0".equals(is_return)) {
+            res.add(("0", feaJson))
+          } else {
+            val return_mid_ts_list = labelJson.getString("return_mid_ts_list").split(",").map(r => {
+              val midReturn = r.split(":")(0)
+              val ts = r.split(":")(1).toLong
+              (midReturn, ts)
+            }).filter(!_._1.equals(mid)).sortBy(_._2)
+            // Necessary filter: returns coming from the sharer's own mid are dropped.
+
+            if (return_mid_ts_list.nonEmpty) {
+              var flag = true
+              val midSet = scala.collection.mutable.HashSet[String]()
+              for ((midReturn, tsReturn) <- return_mid_ts_list) {
+                if (flag && !midSet.contains(midReturn)) {
+                  // The flag ensures at most one positive sample is added, i.e. no weighting.
+                  midSet.add(midReturn)
+                  if ((tsReturn / 1000 - logtimestamp / 1000) <= 3600 && tsReturn - logtimestamp > 0) {
+                    res.add(("1", feaJson))
+                    flag = false
+                  }
+                }
+              }
+              if (flag) {
+                // If no positive sample was added above, add one negative sample: no return arrived within the hour.
+                res.add(("0", feaJson))
+              }
+            } else {
+              // If nothing is left after removing self-returns, this is a negative sample.
+              res.add(("0", feaJson))
+            }
+          }
+          res.iterator
+      })
+
+      //3 Print the label ratio of the raw samples as an intermediate check
+      println("Sample ratio")
+      data2.map(r=> (r._1, 1)).reduceByKey(_+_).map(r=> r._1 + "\t" + r._2).collect().foreach(println)
+
+      //4 Discretize absolute feature values, e.g. 0.456 becomes 19
+      val data3 = data2.map({
+        case (label, feaJson) =>
+          Set(
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ----------
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          ).foreach(key =>{
+            if (feaJson.containsKey(key)){
+              val value = ExtractorUtils.ceilLogRate(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          Set(
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "total_time", "play_count", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+
+          ).foreach(key => {
+            if (feaJson.containsKey(key)) {
+              val value = ExtractorUtils.bucketCnt(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          (label, feaJson)
+      })
+      //5 Convert to libsvm format
+      val data4 = data3.map({
+        case (label, feaJson) =>
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+      })
+
+      // 7 Save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data4.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data4.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}

+ 278 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529.scala

@@ -0,0 +1,278 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import scala.collection.JavaConversions._
+import examples.extractor.RankExtractorFeature_20240530
+import org.xm.Similarity
+import scala.collection.mutable.ArrayBuffer
+/*
+   20240608 Feature extraction
+ */
+
+object makedata_13_originData_20240529 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2023010100")
+    val endStr = param.getOrElse("endStr", "2023010123")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/13_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "XXXX")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 Read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce the data partition by partition
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val featureMap = new JSONObject()
+
+          // a Video features
+          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b1_feature"))
+          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b2_feature"))
+          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b3_feature"))
+          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b6_feature"))
+          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b7_feature"))
+
+          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b8_feature"))
+          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b9_feature"))
+          val b10: JSONObject = if (record.isNull("b10_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b10_feature"))
+          val b11: JSONObject = if (record.isNull("b11_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b11_feature"))
+          val b12: JSONObject = if (record.isNull("b12_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b12_feature"))
+          val b13: JSONObject = if (record.isNull("b13_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b13_feature"))
+          val b17: JSONObject = if (record.isNull("b17_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b17_feature"))
+          val b18: JSONObject = if (record.isNull("b18_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b18_feature"))
+          val b19: JSONObject = if (record.isNull("b19_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b19_feature"))
+
+
+          val origin_data = List(
+            (b1, b2, b3, "b123"), (b1, b6, b7, "b167"),
+            (b8, b9, b10, "b8910"), (b11, b12, b13, "b111213"),
+            (b17, b18, b19, "b171819")
+          )
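+          // For every (exposure, share, return) source triple and time window the same five statistics are derived:
+          // STR = share/exp, log(share), ROV = return/exp, log(return) and ROV*log(return); calDiv/calLog are assumed
+          // to be the zero-safe division and log helpers in RankExtractorFeature_20240530.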
+          for ((b_1, b_2, b_3, prefix1) <- origin_data){
+            for (prefix2 <- List(
+              "1h", "2h", "3h", "4h", "12h", "1d", "3d", "7d"
+            )){
+              val exp = if (b_1.isEmpty) 0D else b_1.getIntValue("exp_pv_" + prefix2).toDouble
+              val share = if (b_2.isEmpty) 0D else b_2.getIntValue("share_pv_" + prefix2).toDouble
+              val returns = if (b_3.isEmpty) 0D else b_3.getIntValue("return_uv_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(share, exp)
+              val f2 = RankExtractorFeature_20240530.calLog(share)
+              val f3 = RankExtractorFeature_20240530.calDiv(returns, exp)
+              val f4 = RankExtractorFeature_20240530.calLog(returns)
+              val f5 = f3 * f4
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "STR", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(share)", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(return)", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV*log(return)", f5)
+            }
+          }
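+          // 5 scopes x 8 windows x 5 statistics = 200 b*-prefixed features, matching the design note further down.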
+
+          val video_info: JSONObject = if (record.isNull("t_v_info_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("t_v_info_feature"))
+          featureMap.put("total_time", if (video_info.containsKey("total_time")) video_info.getIntValue("total_time").toDouble else 0D)
+          featureMap.put("bit_rate", if (video_info.containsKey("bit_rate")) video_info.getIntValue("bit_rate").toDouble else 0D)
+
+          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c1_feature"))
+          if (c1.nonEmpty){
+            featureMap.put("playcnt_6h", if (c1.containsKey("playcnt_6h")) c1.getIntValue("playcnt_6h").toDouble else 0D)
+            featureMap.put("playcnt_1d", if (c1.containsKey("playcnt_1d")) c1.getIntValue("playcnt_1d").toDouble else 0D)
+            featureMap.put("playcnt_3d", if (c1.containsKey("playcnt_3d")) c1.getIntValue("playcnt_3d").toDouble else 0D)
+            featureMap.put("playcnt_7d", if (c1.containsKey("playcnt_7d")) c1.getIntValue("playcnt_7d").toDouble else 0D)
+          }
+          val c2: JSONObject = if (record.isNull("c2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c2_feature"))
+          if (c2.nonEmpty){
+            featureMap.put("share_pv_12h", if (c2.containsKey("share_pv_12h")) c2.getIntValue("share_pv_12h").toDouble else 0D)
+            featureMap.put("share_pv_1d", if (c2.containsKey("share_pv_1d")) c2.getIntValue("share_pv_1d").toDouble else 0D)
+            featureMap.put("share_pv_3d", if (c2.containsKey("share_pv_3d")) c2.getIntValue("share_pv_3d").toDouble else 0D)
+            featureMap.put("share_pv_7d", if (c2.containsKey("share_pv_7d")) c2.getIntValue("share_pv_7d").toDouble else 0D)
+            featureMap.put("return_uv_12h", if (c2.containsKey("return_uv_12h")) c2.getIntValue("return_uv_12h").toDouble else 0D)
+            featureMap.put("return_uv_1d", if (c2.containsKey("return_uv_1d")) c2.getIntValue("return_uv_1d").toDouble else 0D)
+            featureMap.put("return_uv_3d", if (c2.containsKey("return_uv_3d")) c2.getIntValue("return_uv_3d").toDouble else 0D)
+            featureMap.put("return_uv_7d", if (c2.containsKey("return_uv_7d")) c2.getIntValue("return_uv_7d").toDouble else 0D)
+          }
+
+          val title = if (video_info.containsKey("title")) video_info.getString("title") else ""
+          if (!title.equals("")){
+            for (key_feature <- List("c3_feature", "c4_feature", "c5_feature", "c6_feature", "c7_feature")){
+              val c34567: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_time <- List("tags_1d", "tags_3d", "tags_7d")) {
+                val tags = if (c34567.containsKey(key_time)) c34567.getString(key_time) else ""
+                if (!tags.equals("")){
+                  val (f1, f2, f3, f4) = funcC34567ForTags(tags, title)
+                  featureMap.put(key_feature + "_" + key_time + "_matchnum", f1)
+                  featureMap.put(key_feature + "_" + key_time + "_maxscore", f3)
+                  featureMap.put(key_feature + "_" + key_time + "_avgscore", f4)
+                }
+              }
+            }
+          }
+
+          val vid = if (record.isNull("vid")) "" else record.getString("vid")
+          if (!vid.equals("")){
+            for (key_feature <- List("c8_feature", "c9_feature")){
+              val c89: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_action <- List("share", "return")){
+                  val cfListStr = if (c89.containsKey(key_action)) c89.getString(key_action) else ""
+                  if (!cfListStr.equals("")){
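+                    // Each entry is expected to look like "vid:score:num:rank"; the rank is stored as its reciprocal below.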
+                    val cfMap = cfListStr.split(",").map(r =>{
+                      val rList = r.split(":")
+                      (rList(0), (rList(1), rList(2), rList(3)))
+                    }).toMap
+                    if (cfMap.contains(vid)){
+                      val (score, num, rank) = cfMap(vid)
+                      featureMap.put(key_feature + "_" + key_action + "_score", score.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_num", num.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_rank", 1.0 / rank.toDouble)
+                    }
+                  }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d1_feature"))
+          if (d1.nonEmpty){
+            featureMap.put("d1_exp", if (d1.containsKey("exp")) d1.getString("exp").toDouble else 0D)
+            featureMap.put("d1_return_n", if (d1.containsKey("return_n")) d1.getString("return_n").toDouble else 0D)
+            featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
+          }
+
+
+          /*
+
+
+          Video:
+          exposure uses pv, share uses pv, return uses uv --> 1h 2h 3h 4h 12h 1d 3d 7d
+          STR log(share) ROV log(return) ROV*log(return)
+          40 feature combinations per scope
+          scopes: overall, overall exposure-matched, recommendation non-cold-start root, recommendation cold-start root, per-province root
+          200 feature values in total
+
+          Video:
+          duration, bit rate
+
+          User:
+          play count --> 6h 1d 3d 7d --> 4 features
+          share pv / return uv brought back --> 12h 1d 3d 7d --> 8 features
+          User + vid-title:
+          play / return / share / cumulative share / cumulative return tags --> 1d 3d 7d --> match count, max semantic similarity, avg semantic similarity --> 45 features
+          User + vid-cf:
+          based on share behaviour / return behaviour --> "share cf" + "return-click cf": similarity score, similar count, reciprocal of similarity rank --> 12 features
+
+          Head videos:
+          exposure, return, ROVn --> 3 features
+
+          Context:
+          hour, weekday, apptype, city, province, pagesource, device model
+           */
+
+
+
+          //4 Assemble the label info.
+          val labels = new JSONObject
+          for (labelKey <- List(
+            "is_play", "is_share", "is_return", "noself_is_return", "return_uv", "noself_return_uv", "total_return_uv",
+            "share_pv", "total_share_uv"
+          )){
+            if (!record.isNull(labelKey)){
+              labels.put(labelKey, record.getString(labelKey))
+            }
+          }
+          //5 Build the log key header.
+          val apptype = record.getString("apptype")
+          val pagesource = record.getString("pagesource")
+          val mid = record.getString("mid")
+          // vid was already extracted above
+          val ts = record.getString("ts")
+          val abcode = record.getString("abcode")
+          val level = if (record.isNull("level")) "0" else record.getString("level")
+          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Concatenate the parts and save.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+      // 4 Save the data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched terms, max semantic similarity score, average semantic similarity score
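+    // Note: d2 (the matched tag list) is returned but the caller above only consumes the count and the two similarity scores.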
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList){
+      if (title.contains(tag)){
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}

+ 256 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529_check.scala

@@ -0,0 +1,256 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   20240608 Feature extraction
+ */
+
+object makedata_13_originData_20240529_check {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2023010100")
+    val endStr = param.getOrElse("endStr", "2023010123")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/13_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "XXXX")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 Read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce the data partition by partition
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record_ => {
+
+
+          val record = if (record_.isNull("metafeaturemap")) new JSONObject() else
+            JSON.parseObject(record_.getString("metafeaturemap"))
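+          // This _check variant reads the same feature groups from a single pre-assembled "metafeaturemap" JSON column
+          // (keys like alg_vid_feature_all_exp) instead of separate columns, presumably to cross-check the two sources.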
+
+          val featureMap = new JSONObject()
+
+          // a Video features
+          val b1: JSONObject = if (!record.containsKey("alg_vid_feature_all_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_all_exp"))
+          val b2: JSONObject = if (!record.containsKey("alg_vid_feature_all_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_all_share"))
+          val b3: JSONObject = if (!record.containsKey("alg_vid_feature_all_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_all_return"))
+          val b6: JSONObject = if (!record.containsKey("alg_vid_feature_exp2share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_exp2share"))
+          val b7: JSONObject = if (!record.containsKey("alg_vid_feature_share2return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_share2return"))
+
+          val b8: JSONObject = if (!record.containsKey("alg_vid_feature_feed_noflow_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_noflow_exp"))
+          val b9: JSONObject = if (!record.containsKey("alg_vid_feature_feed_noflow_root_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_noflow_root_share"))
+          val b10: JSONObject = if (!record.containsKey("alg_vid_feature_feed_noflow_root_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_noflow_root_return"))
+          val b11: JSONObject = if (!record.containsKey("alg_vid_feature_feed_flow_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_flow_exp"))
+          val b12: JSONObject = if (!record.containsKey("alg_vid_feature_feed_flow_root_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_flow_root_share"))
+          val b13: JSONObject = if (!record.containsKey("alg_vid_feature_feed_flow_root_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_flow_root_return"))
+          val b17: JSONObject = if (!record.containsKey("alg_vid_feature_feed_province_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_province_exp"))
+          val b18: JSONObject = if (!record.containsKey("alg_vid_feature_feed_province_root_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_province_root_share"))
+          val b19: JSONObject = if (!record.containsKey("alg_vid_feature_feed_province_root_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_province_root_return"))
+
+
+          val origin_data = List(
+            (b1, b2, b3, "b123"), (b1, b6, b7, "b167"),
+            (b8, b9, b10, "b8910"), (b11, b12, b13, "b111213"),
+            (b17, b18, b19, "b171819")
+          )
+          for ((b_1, b_2, b_3, prefix1) <- origin_data) {
+            for (prefix2 <- List(
+              "1h", "2h", "3h", "4h", "12h", "1d", "3d", "7d"
+            )) {
+              val exp = if (b_1.isEmpty) 0D else b_1.getIntValue("exp_pv_" + prefix2).toDouble
+              val share = if (b_2.isEmpty) 0D else b_2.getIntValue("share_pv_" + prefix2).toDouble
+              val returns = if (b_3.isEmpty) 0D else b_3.getIntValue("return_uv_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(share, exp)
+              val f2 = RankExtractorFeature_20240530.calLog(share)
+              val f3 = RankExtractorFeature_20240530.calDiv(returns, exp)
+              val f4 = RankExtractorFeature_20240530.calLog(returns)
+              val f5 = f3 * f4
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "STR", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(share)", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(return)", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV*log(return)", f5)
+            }
+          }
+
+          val video_info: JSONObject = if (!record.containsKey("alg_vid_feature_basic_info")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_basic_info"))
+          featureMap.put("total_time", if (video_info.containsKey("total_time")) video_info.getIntValue("total_time").toDouble else 0D)
+          featureMap.put("bit_rate", if (video_info.containsKey("bit_rate")) video_info.getIntValue("bit_rate").toDouble else 0D)
+
+          val c1: JSONObject = if (!record.containsKey("alg_mid_feature_play")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_mid_feature_play"))
+          if (c1.nonEmpty) {
+            featureMap.put("playcnt_6h", if (c1.containsKey("playcnt_6h")) c1.getIntValue("playcnt_6h").toDouble else 0D)
+            featureMap.put("playcnt_1d", if (c1.containsKey("playcnt_1d")) c1.getIntValue("playcnt_1d").toDouble else 0D)
+            featureMap.put("playcnt_3d", if (c1.containsKey("playcnt_3d")) c1.getIntValue("playcnt_3d").toDouble else 0D)
+            featureMap.put("playcnt_7d", if (c1.containsKey("playcnt_7d")) c1.getIntValue("playcnt_7d").toDouble else 0D)
+          }
+          val c2: JSONObject = if (!record.containsKey("alg_mid_feature_share_and_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_mid_feature_share_and_return"))
+          if (c2.nonEmpty) {
+            featureMap.put("share_pv_12h", if (c2.containsKey("share_pv_12h")) c2.getIntValue("share_pv_12h").toDouble else 0D)
+            featureMap.put("share_pv_1d", if (c2.containsKey("share_pv_1d")) c2.getIntValue("share_pv_1d").toDouble else 0D)
+            featureMap.put("share_pv_3d", if (c2.containsKey("share_pv_3d")) c2.getIntValue("share_pv_3d").toDouble else 0D)
+            featureMap.put("share_pv_7d", if (c2.containsKey("share_pv_7d")) c2.getIntValue("share_pv_7d").toDouble else 0D)
+            featureMap.put("return_uv_12h", if (c2.containsKey("return_uv_12h")) c2.getIntValue("return_uv_12h").toDouble else 0D)
+            featureMap.put("return_uv_1d", if (c2.containsKey("return_uv_1d")) c2.getIntValue("return_uv_1d").toDouble else 0D)
+            featureMap.put("return_uv_3d", if (c2.containsKey("return_uv_3d")) c2.getIntValue("return_uv_3d").toDouble else 0D)
+            featureMap.put("return_uv_7d", if (c2.containsKey("return_uv_7d")) c2.getIntValue("return_uv_7d").toDouble else 0D)
+          }
+
+          val title = if (video_info.containsKey("title")) video_info.getString("title") else ""
+          if (!title.equals("")) {
+            for (key_feature <- List(("c3_feature", "alg_mid_feature_play_tags"),
+              ("c4_feature", "alg_mid_feature_play_tags"),
+              ("c5_feature", "alg_mid_feature_play_tags"),
+              ("c6_feature", "alg_mid_feature_play_tags"),
+              ("c7_feature", "alg_mid_feature_play_tags"))) {
+              val c34567: JSONObject = if (!record.containsKey(key_feature._2)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature._2))
+              for (key_time <- List("tags_1d", "tags_3d", "tags_7d")) {
+                val tags = if (c34567.containsKey(key_time)) c34567.getString(key_time) else ""
+                if (!tags.equals("")) {
+                  val (f1, f2, f3, f4) = funcC34567ForTags(tags, title)
+                  featureMap.put(key_feature._1 + "_" + key_time + "_matchnum", f1)
+                  featureMap.put(key_feature._1 + "_" + key_time + "_maxscore", f3)
+                  featureMap.put(key_feature._1 + "_" + key_time + "_avgscore", f4)
+                }
+              }
+            }
+          }
+
+          val vid = if (record_.isNull("vid")) "" else record_.getString("vid")
+          if (!vid.equals("")) {
+            for (key_feature <- List(("c8_feature", "alg_mid_feature_sharecf"), ("c9_feature", "alg_mid_feature_returncf"))) {
+              val c89: JSONObject = if (!record.containsKey(key_feature._2)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature._2))
+              for (key_action <- List("share", "return")) {
+                val cfListStr = if (c89.containsKey(key_action)) c89.getString(key_action) else ""
+                if (!cfListStr.equals("")) {
+                  val cfMap = cfListStr.split(",").map(r => {
+                    val rList = r.split(":")
+                    (rList(0), (rList(1), rList(2), rList(3)))
+                  }).toMap
+                  if (cfMap.contains(vid)) {
+                    val (score, num, rank) = cfMap(vid)
+                    featureMap.put(key_feature._1 + "_" + key_action + "_score", score.toDouble)
+                    featureMap.put(key_feature._1 + "_" + key_action + "_num", num.toDouble)
+                    featureMap.put(key_feature._1 + "_" + key_action + "_rank", 1.0 / rank.toDouble)
+                  }
+                }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (!record.containsKey("alg_recsys_feature_cf_i2i_new")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_recsys_feature_cf_i2i_new"))
+          if (d1.nonEmpty) {
+            featureMap.put("d1_exp", if (d1.containsKey("exp")) d1.getString("exp").toDouble else 0D)
+            featureMap.put("d1_return_n", if (d1.containsKey("return_n")) d1.getString("return_n").toDouble else 0D)
+            featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
+          }
+
+
+          //4 Build the label info.
+          val labels = new JSONObject
+          for (labelKey <- List(
+            "is_play", "is_share", "is_return", "noself_is_return", "return_uv", "noself_return_uv", "total_return_uv",
+            "share_pv", "total_share_uv"
+          )){
+            if (!record_.isNull(labelKey)){
+              labels.put(labelKey, record_.getString(labelKey))
+            }
+          }
+          //5 Build the log-key header.
+          val apptype = record_.getString("apptype")
+          val pagesource = record_.getString("pagesource")
+          val mid = record_.getString("mid")
+          // vid was already extracted above
+          val ts = record_.getString("ts")
+          val abcode = record_.getString("abcode")
+          val level = if (record_.isNull("level")) "0" else record_.getString("level")
+          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Concatenate the three parts and emit one line.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+      // 4 Save the data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, mean semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList){
+      if (title.contains(tag)){
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
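For reference, a minimal self-contained sketch of the ratio features built in the window loop above. calDiv and calLog live in RankExtractorFeature_20240530, which is not part of this diff, so the two local helpers below are assumed stand-ins (zero-safe division and log(x + 1)):

    // Sketch only: localCalDiv / localCalLog are assumed stand-ins for
    // RankExtractorFeature_20240530.calDiv / calLog, which are not shown in this diff.
    object RatioFeatureSketch {
      def localCalDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b
      def localCalLog(a: Double): Double = math.log(a + 1.0)

      def main(args: Array[String]): Unit = {
        val featureMap = scala.collection.mutable.Map[String, Double]()
        val (exp, share, returns) = (1000.0, 40.0, 12.0) // hypothetical counts for one window
        val prefix = "b123_1h"
        featureMap.put(prefix + "_STR", localCalDiv(share, exp))          // share-through rate
        featureMap.put(prefix + "_log(share)", localCalLog(share))
        featureMap.put(prefix + "_ROV", localCalDiv(returns, exp))        // return-over-view
        featureMap.put(prefix + "_log(return)", localCalLog(returns))
        featureMap.put(prefix + "_ROV*log(return)", localCalDiv(returns, exp) * localCalLog(returns))
        featureMap.foreach(println)
      }
    }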

+ 92 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_14_valueData_20240608.scala

@@ -0,0 +1,92 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_14: flatten each sample's feature JSON into a dense, comma-separated value vector ordered by 20240608_feature_name.txt.
+ */
+
+object makedata_14_valueData_20240608 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_bc = sc.broadcast(contentList)
+
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/13_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/14_feature_data/")
+    val repartition = param.getOrElse("repartition", "200").toInt
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val data = sc.textFile(readPath + "/" + date + "*")
+      val data1 = data.map(r => {
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val featureKey = rList(2)
+        (logKey, labelKey, featureKey)
+      }).filter(r =>
+        r._1.split(",")(6).equals("0")
+      ).mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_bc.value
+        row.foreach {
+          case (logKey, labelKey, featureKey) =>
+            val featureJson = JSON.parseObject(featureKey)
+
+            val featureValues = contentList.map(key => {
+              if (featureJson.containsKey(key)) {
+                featureJson.getDouble(key)
+              } else {
+                0.0
+              }
+            })
+            result.add(logKey + "\t" + labelKey + "\t" + featureValues.mkString(","))
+        }
+        result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data1.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+  }
+}
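A minimal sketch of what this job does per row: look each name from 20240608_feature_name.txt up in the row's feature JSON and emit a fixed-order dense vector, defaulting missing features to 0.0 (a plain Scala Map stands in for fastjson here):

    // Sketch: fixed-order dense vector from a sparse feature map.
    object DenseVectorSketch {
      def toDense(featureNames: List[String], features: Map[String, Double]): String =
        featureNames.map(name => features.getOrElse(name, 0.0)).mkString(",")

      def main(args: Array[String]): Unit = {
        val names = List("b123_1h_STR", "total_time", "playcnt_1d") // hypothetical slice of the feature-name file
        val row = Map("total_time" -> 58.0, "playcnt_1d" -> 3.0)
        println(toDense(names, row)) // 0.0,58.0,3.0
      }
    }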

+ 92 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_15_bucket_20240608.scala

@@ -0,0 +1,92 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_15: compute quantile-style bucket boundaries for each feature in 20240608_feature_name.txt from a sample of the dense value vectors.
+ */
+
+object makedata_15_bucket_20240608 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/20240607")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/15_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240607_200")
+    val sampleRate = param.getOrElse("sampleRate", "0.1").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "200").toInt
+
+    val data = sc.textFile(readPath)
+    val data1 = data.map(r => {
+      val rList = r.split("\t")
+      val doubles = rList(2).split(",").map(_.toDouble)
+      doubles
+    }).sample(false, sampleRate ).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    for (i <- contentList.indices){
+      println("特征:" + contentList(i))
+      val data2 = data1.map(r => r(i)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // make sure every bucket gets at least one element
+      val buffers = new ArrayBuffer[Double]()
+
+      var lastBucketValue = data2(0) // previous bucket boundary
+      for (j <- 0 until len by oneBucketNum) {
+        val d = data2(j)
+        if (j > 0 && d != lastBucketValue) {
+          // keep the current value as a boundary only if it differs from the previous one
+          buffers += d
+        }
+        lastBucketValue = d // update the previous boundary
+      }
+
+      // the last bucket should end at the final element of the array
+      if (!buffers.contains(data2.last)) {
+        buffers += data2.last
+      }
+      result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save the data to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + hdfsPath)
+    }
+  }
+}
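The boundary selection above can be read in isolation: sort the non-zero sampled values, walk them with a fixed stride, and keep each distinct value as a bucket edge, always ending at the maximum. A self-contained sketch without the Spark/HDFS plumbing:

    // Sketch of the quantile-style bucket boundaries computed above.
    object BucketBoundarySketch {
      def boundaries(sortedValues: Array[Double], bucketNum: Int): Array[Double] = {
        val len = sortedValues.length
        val stride = (len - 1) / (bucketNum - 1) + 1 // at least one element per bucket
        val buffer = scala.collection.mutable.ArrayBuffer[Double]()
        var last = sortedValues(0)
        for (j <- 0 until len by stride) {
          val d = sortedValues(j)
          if (j > 0 && d != last) buffer += d
          last = d
        }
        if (!buffer.contains(sortedValues.last)) buffer += sortedValues.last
        buffer.toArray
      }

      def main(args: Array[String]): Unit = {
        val values = (1 to 100).map(_.toDouble).toArray    // already sorted, non-zero
        println(boundaries(values, 10).mkString(","))      // roughly every 12th value plus the maximum
      }
    }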

+ 127 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609.scala

@@ -0,0 +1,127 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import com.alibaba.fastjson.JSON
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_16: map each dense feature vector to normalized bucket values using 20240609_bucket_274.txt and emit "label \t name:value ..." training rows.
+ */
+
+object makedata_16_bucketData_20240609 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_br = sc.broadcast(contentList)
+
+    val resourceUrlBucket = loader.getResource("20240609_bucket_274.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240606")
+    val endStr = param.getOrElse("endStr", "20240607")
+    val repartition = param.getOrElse("repartition", "200").toInt
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + date).map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val features = rList(2).split(",").map(_.toDouble)
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            Set("0", "4", "5", "21", "3", "6").contains(apptype) && pagesource.endsWith("recommend")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_br.value
+        val bucketsMap = bucketsMap_br.value
+        row.foreach{
+          case (label, features) =>
+            val featuresBucket = contentList.indices.map(i =>{
+              val featureName = contentList(i)
+              val score = features(i)
+              if (score > 1E-8){
+                val (bucketNum, buckets) = bucketsMap(featureName)
+                val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                featureName + ":" + scoreNew.toString
+              }else{
+                ""
+              }
+            }).filter(_.nonEmpty)
+            result.add(label + "\t" + featuresBucket.mkString("\t"))
+        }
+        result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
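A sketch of the per-feature bucketisation above. ExtractorUtils.findInsertPosition is not shown in this diff; findPos below assumes it behaves like a binary-search insert position over the sorted boundaries, which may differ from the real implementation in tie handling:

    // Sketch of the score -> normalized bucket mapping above; findPos is an assumed
    // stand-in for ExtractorUtils.findInsertPosition.
    object BucketizeSketch {
      def findPos(buckets: Array[Double], score: Double): Int = {
        var lo = 0
        var hi = buckets.length
        while (lo < hi) {
          val mid = (lo + hi) / 2
          if (buckets(mid) < score) lo = mid + 1 else hi = mid
        }
        lo // number of boundaries strictly below score
      }

      def main(args: Array[String]): Unit = {
        val buckets = Array(0.1, 0.3, 0.7)   // hypothetical boundaries for one feature
        val bucketNum = 4.0
        val score = 0.5
        val scoreNew = 1.0 / bucketNum * (findPos(buckets, score) + 1.0)
        println(f"featureX:$scoreNew%.2f")   // featureX:0.75
      }
    }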

+ 132 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609_check.scala

@@ -0,0 +1,132 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_16_check: same bucketing as makedata_16, with configurable apptype/abcode/level filters for spot checks.
+ */
+
+object makedata_16_bucketData_20240609_check {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_br = sc.broadcast(contentList)
+
+    val resourceUrlBucket = loader.getResource("20240609_bucket_274.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240606")
+    val endStr = param.getOrElse("endStr", "20240607")
+    val repartition = param.getOrElse("repartition", "200").toInt
+    val APPSETS = param.getOrElse("APPSETS", "3").split(",").filter(_.nonEmpty).toSet
+    val ABSETS = param.getOrElse("ABSETS", "ab0,ab1,ab2,ab3").split(",").filter(_.startsWith("ab")).toSet
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + date).map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val features = rList(2).split(",").map(_.toDouble)
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            val abcode = logKeyList(5)
+            val level = logKeyList(6)
+            APPSETS.contains(apptype) && pagesource.endsWith("recommend") &&
+              ABSETS.contains(abcode) && level.equals("0")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_br.value
+        val bucketsMap = bucketsMap_br.value
+        row.foreach{
+          case (label, features) =>
+            val featuresBucket = contentList.indices.map(i =>{
+              val featureName = contentList(i)
+              val score = features(i)
+              if (score > 1E-8){
+                val (bucketNum, buckets) = bucketsMap(featureName)
+                val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                featureName + ":" + scoreNew.toString
+              }else{
+                ""
+              }
+            }).filter(_.nonEmpty)
+            result.add(label + "\t" + featuresBucket.mkString("\t"))
+        }
+        result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
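The _check variant differs from makedata_16 mainly in its logKey filter; a minimal sketch of that filter over the comma-separated logKey (apptype, pagesource, mid, vid, ts, abcode, level):

    // Sketch of the logKey filter used above.
    object LogKeyFilterSketch {
      def keep(logKey: String, appSet: Set[String], abSet: Set[String]): Boolean = {
        val parts = logKey.split(",")
        val (apptype, pagesource, abcode, level) = (parts(0), parts(1), parts(5), parts(6))
        appSet.contains(apptype) && pagesource.endsWith("recommend") &&
          abSet.contains(abcode) && level.equals("0")
      }

      def main(args: Array[String]): Unit = {
        val logKey = "3,vlog-recommend,mid123,vid456,1718600000,ab1,0" // hypothetical row
        println(keep(logKey, Set("3"), Set("ab0", "ab1")))             // true
      }
    }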

File diff suppressed because it is too large
+ 300 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_17_bucketDataPrint_20240617.scala


+ 43 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_18_mergehour2day_20240617.scala

@@ -0,0 +1,43 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   makedata_18: merge one day's hourly print-data outputs into a single daily path (20240617).
+ */
+
+object makedata_18_mergehour2day_20240617 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/16_train_data_print_online/20240615*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data_print_online_merge/20240615/")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    val data = sc.textFile(readPath)
+
+    // 4 Save the data to HDFS
+    val hdfsPath = savePath
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+      println("删除路径并开始数据写入:" + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    }else{
+      println("路径不合法,无法写入:" + hdfsPath)
+    }
+  }
+}

+ 388 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_31_originData_20240620.scala

@@ -0,0 +1,388 @@
+package com.aliyun.odps.spark.examples.makedata_ad
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   makedata_ad_31: extract ad sample features from alg_recsys_ad_sample_all (20240620).
+ */
+
+object makedata_ad_31_originData_20240620 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2024062008")
+    val endStr = param.getOrElse("endStr", "2024062023")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/31_ad_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_ad_sample_all")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 读取odps+表信息
+    val odpsOps = env.getODPS(sc)
+
+    // 3 循环执行数据生产
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+
+          val ts = record.getString("ts").toInt
+          val cid = record.getString("cid")
+
+
+          val featureMap = new JSONObject()
+
+          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b1_feature"))
+          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b2_feature"))
+          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b3_feature"))
+          val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b4_feature"))
+          val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b5_feature"))
+          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b6_feature"))
+          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b7_feature"))
+          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b8_feature"))
+          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b9_feature"))
+
+
+          featureMap.put("cid_" + cid, 1.0)
+          if (b1.containsKey("adid") && b1.getString("adid").nonEmpty) {
+            featureMap.put("adid_" + b1.getString("adid"), 1.0)
+          }
+          if (b1.containsKey("adverid") && b1.getString("adverid").nonEmpty) {
+            featureMap.put("adverid_" + b1.getString("adverid"), 1.0)
+          }
+          if (b1.containsKey("targeting_conversion") && b1.getString("targeting_conversion").nonEmpty) {
+            featureMap.put("targeting_conversion_" + b1.getString("targeting_conversion"), 1.0)
+          }
+
+
+          if (b1.containsKey("cpa")) {
+            featureMap.put("cpa", b1.getString("cpa").toDouble)
+          }
+
+          for ((bn, prefix1) <- List(
+            (b2, "b2"), (b3, "b3"),(b4, "b4"),(b5, "b5"),(b8, "b8")
+          )){
+            for (prefix2 <- List(
+              "3h", "6h", "12h", "1d", "3d", "7d"
+            )){
+              val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+              val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+              val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+              val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+              val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+              val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+              val f4 = conver
+              val f5 = RankExtractorFeature_20240530.calDiv(income*1000, view)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+            }
+          }
+
+          for ((bn, prefix1) <- List(
+            (b6, "b6"), (b7, "b7")
+          )) {
+            for (prefix2 <- List(
+              "7d", "14d"
+            )) {
+              val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+              val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+              val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+              val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+              val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+              val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+              val f4 = conver
+              val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+            }
+          }
+
+          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c1_feature"))
+
+          val midActionList = if (c1.containsKey("action") && c1.getString("action").nonEmpty){
+            c1.getString("action").split(",").map(r=>{
+              val rList = r.split(":")
+              (rList(0), (rList(1).toInt, rList(2).toInt, rList(3).toInt, rList(4).toInt, rList(5)))
+            }).sortBy(-_._2._1).toList
+          }else {
+            new ArrayBuffer[(String, (Int, Int, Int, Int, String))]().toList
+          }
+          // user-level (u) features
+          val viewAll = midActionList.size.toDouble
+          val clickAll = midActionList.map(_._2._2).sum.toDouble
+          val converAll = midActionList.map(_._2._3).sum.toDouble
+          val incomeAll = midActionList.map(_._2._4).sum.toDouble
+          featureMap.put("viewAll", viewAll)
+          featureMap.put("clickAll", clickAll)
+          featureMap.put("converAll", converAll)
+          featureMap.put("incomeAll", incomeAll)
+          featureMap.put("ctr_all", RankExtractorFeature_20240530.calDiv(clickAll, viewAll))
+          featureMap.put("ctcvr_all", RankExtractorFeature_20240530.calDiv(converAll, viewAll))
+          featureMap.put("cvr_all", RankExtractorFeature_20240530.calDiv(clickAll, converAll))
+          featureMap.put("ecpm_all", RankExtractorFeature_20240530.calDiv(incomeAll * 1000, viewAll))
+
+          // user-to-cid (ui) features
+          val midTimeDiff = scala.collection.mutable.Map[String, Double]()
+          midActionList.foreach{
+            case (cid, (ts_history, click, conver, income, title)) =>
+              if (!midTimeDiff.contains("timediff_view_" + cid)){
+                midTimeDiff.put("timediff_view_" + cid, 1.0 / ((ts - ts_history).toDouble/3600.0/24.0))
+              }
+              if (!midTimeDiff.contains("timediff_click_" + cid) && click > 0) {
+                midTimeDiff.put("timediff_click_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+              }
+              if (!midTimeDiff.contains("timediff_conver_" + cid) && conver > 0) {
+                midTimeDiff.put("timediff_conver_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+              }
+          }
+
+          val midActionStatic = scala.collection.mutable.Map[String, Double]()
+          midActionList.foreach {
+            case (cid, (ts_history, click, conver, income, title)) =>
+              midActionStatic.put("actionstatic_view_" + cid, 1.0 + midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+              midActionStatic.put("actionstatic_click_" + cid, click + midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+              midActionStatic.put("actionstatic_conver_" + cid, conver + midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+              midActionStatic.put("actionstatic_income_" + cid, income + midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+          }
+
+          if (midTimeDiff.contains("timediff_view_" + cid)){
+            featureMap.put("timediff_view", midTimeDiff.getOrDefault("timediff_view_" + cid, 0.0))
+          }
+          if (midTimeDiff.contains("timediff_click_" + cid)) {
+            featureMap.put("timediff_click", midTimeDiff.getOrDefault("timediff_click_" + cid, 0.0))
+          }
+          if (midTimeDiff.contains("timediff_conver_" + cid)) {
+            featureMap.put("timediff_conver", midTimeDiff.getOrDefault("timediff_conver_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_view_" + cid)) {
+            featureMap.put("actionstatic_view", midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_click_" + cid)) {
+            featureMap.put("actionstatic_click", midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_conver_" + cid)) {
+            featureMap.put("actionstatic_conver", midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_income_" + cid)) {
+            featureMap.put("actionstatic_income", midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+            featureMap.put("actionstatic_ctr", RankExtractorFeature_20240530.calDiv(
+              midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+              midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+            ))
+          }
+          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("timediff_conver_" + cid)) {
+            featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
+              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0),
+              midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+            ))
+          }
+          if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+            featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
+              midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0)
+            ))
+          }
+
+          val e1: JSONObject = if (record.isNull("e1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("e1_feature"))
+          val e2: JSONObject = if (record.isNull("e2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("e2_feature"))
+          val title = b1.getOrDefault("cidtitle", "").toString
+          if (title.nonEmpty){
+            for ((en, prefix1) <- List((e1, "e1"), (e2, "e2"))){
+              for (prefix2 <- List("tags_3d", "tags_7d", "tags_14d")){
+                if (en.nonEmpty && en.containsKey(prefix2) && en.getString(prefix2).nonEmpty) {
+                  val (f1, f2, f3, f4) = funcC34567ForTags(en.getString(prefix2), title)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_matchnum", f1)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_maxscore", f3)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_avgscore", f4)
+
+                }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d1_feature"))
+          val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d2_feature"))
+
+          if (d1.nonEmpty){
+            for (prefix <- List("3h", "6h", "12h", "1d", "3d", "7d")) {
+              val view = if (!d1.containsKey("ad_view_" + prefix)) 0D else d1.getIntValue("ad_view_" + prefix).toDouble
+              val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
+              val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
+              val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+              val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+              val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+              val f4 = conver
+              val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "conver", f4)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "ecpm", f5)
+            }
+          }
+
+          val vidRankMaps = scala.collection.mutable.Map[String, scala.collection.immutable.Map[String, Double]]()
+          if (d2.nonEmpty){
+            d2.foreach(r => {
+              val key = r._1
+              val value = d2.getString(key).split(",").map(r=> {
+                val rList = r.split(":")
+                (rList(0), rList(2).toDouble)
+              }).toMap
+              vidRankMaps.put(key, value)
+            })
+          }
+          for (prefix1 <- List("ctr", "ctcvr", "ecpm")) {
+            for (prefix2 <- List("1d", "3d", "7d", "14d")) {
+              if (vidRankMaps.contains(prefix1 + "_" + prefix2)){
+                val rank = vidRankMaps(prefix1 + "_" + prefix2).getOrDefault(cid, 0.0)
+                if (rank >= 1.0){
+                  featureMap.put("vid_rank_" + prefix1 + "_" + prefix2, 1.0 / rank)
+                }
+              }
+            }
+          }
+
+
+          /*
+          Ad
+            sparse: cid adid adverid targeting_conversion
+
+            cpa --> 1 feature
+            per adverid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr conver ecpm --> 30 features
+            per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            region // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            app // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            phone brand // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            os: no data
+            week // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+            hour // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+
+          User
+            user's historical clicked/converted title tags; 3d 7d 14d; matched against the cid title; count / max score / mean score --> 18 features
+            user's 14d views / clicks / conversions / income; ctr cvr ctcvr ecpm --> 8 features
+
+            user-to-cid (ui) features --> 10 features
+              1 / time since the user last viewed this cid
+              1 / time since the user last clicked this cid
+              1 / time since the user last converted on this cid
+              how many times the user has viewed this cid
+              how many times the user has clicked this cid
+              how many times the user has converted on this cid
+              how much the user has spent on this cid
+              the user's ctr ctcvr cvr on this cid
+
+          Video
+            sim-score-1/-2 between the title and the cid: no data
+            vid // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            vid // per cid, 1d 3d 7d 14d x reciprocal rank of ctr ctcvr ecpm --> 12 features
+
+           */
+
+
+
+          //4 Build the label info.
+          val labels = new JSONObject
+          for (labelKey <- List("ad_is_click", "ad_is_conversion")){
+            if (!record.isNull(labelKey)){
+              labels.put(labelKey, record.getString(labelKey))
+            }
+          }
+          //5 Build the log-key header.
+          val apptype = record.getString("apptype")
+          val mid = record.getString("mid")
+          val headvideoid = record.getString("headvideoid")
+          val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Concatenate the three parts and emit one line.
+          logKey + "\t" + labelKey + "\t" + featureKey
+        })
+
+      // 4 Save the data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, mean semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList) {
+      if (title.contains(tag)) {
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
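A condensed sketch of the user-to-cid (ui) features derived above from the c1 action history: the reciprocal of the days since the last view of this cid, plus per-cid counts and a ctr. calDiv is again an assumed zero-safe stand-in for RankExtractorFeature_20240530.calDiv, and the history values are made up:

    // Sketch of the ui features; history entries are (cid, (ts, click, conver, income, title)).
    object AdUiFeatureSketch {
      def calDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b // assumed stand-in

      def main(args: Array[String]): Unit = {
        val ts = 1718900000
        val cid = "c42"
        val history = List(
          ("c42", (1718700000, 1, 0, 12, "title-a")), // viewed and clicked about 2.3 days ago
          ("c42", (1718500000, 0, 1, 30, "title-b")), // converted earlier
          ("c99", (1718800000, 0, 0, 0, "title-c"))
        )
        val mine = history.filter(_._1 == cid)
        val lastView = mine.map(_._2._1).max
        val timediffView = 1.0 / ((ts - lastView).toDouble / 3600.0 / 24.0) // reciprocal of days since last view
        val views = mine.size.toDouble
        val clicks = mine.map(_._2._2).sum.toDouble
        val convers = mine.map(_._2._3).sum.toDouble
        println(f"timediff_view=$timediffView%.3f views=$views clicks=$clicks convers=$convers ctr=${calDiv(clicks, views)}")
      }
    }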

+ 103 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_32_bucket_20240622.scala

@@ -0,0 +1,103 @@
+package com.aliyun.odps.spark.examples.makedata_ad
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_ad_32: compute quantile-style bucket boundaries for the ad features listed in 20240622_ad_feature_name.txt.
+ */
+
+object makedata_ad_32_bucket_20240622 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240622_ad_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/20240620*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/32_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240620_100")
+    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "100").toInt
+
+    val data = sc.textFile(readPath)
+    println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
+    val data1 = data.map(r => {
+      val rList = r.split("\t")
+      val jsons = JSON.parseObject(rList(2))
+      val doubles = scala.collection.mutable.Map[String, Double]()
+      jsons.foreach(r =>{
+        doubles.put(r._1, jsons.getDoubleValue(r._1))
+      })
+      doubles
+    }).sample(false, sampleRate ).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    for (i <- contentList.indices){
+      println("特征:" + contentList(i))
+      val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      if (len == 0){
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
+      }else{
+        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // make sure every bucket gets at least one element
+        val buffers = new ArrayBuffer[Double]()
+
+        var lastBucketValue = data2(0) // previous bucket boundary
+        for (j <- 0 until len by oneBucketNum) {
+          val d = data2(j)
+          if (j > 0 && d != lastBucketValue) {
+            // keep the current value as a boundary only if it differs from the previous one
+            buffers += d
+          }
+          lastBucketValue = d // update the previous boundary
+        }
+
+        // the last bucket should end at the final element of the array
+        if (!buffers.contains(data2.last)) {
+          buffers += data2.last
+        }
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+      }
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save the data to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + hdfsPath)
+    }
+  }
+}
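Compared with makedata_15, this job reads sparse feature JSON rather than dense vectors, so a feature column can be entirely absent and is then written out with a single "0" boundary. A small sketch of that per-column handling, with plain maps standing in for fastjson:

    // Sketch: per-feature column extraction from sparse rows, with the empty-column guard above.
    object AdBucketColumnSketch {
      def columnValues(rows: Seq[Map[String, Double]], feature: String): Array[Double] =
        rows.map(_.getOrElse(feature, 0.0)).filter(_ > 1e-8).toArray.sorted

      def main(args: Array[String]): Unit = {
        val rows = Seq(Map("cpa" -> 2.5, "viewAll" -> 10.0), Map("viewAll" -> 3.0))
        val cpa = columnValues(rows, "cpa")
        val missing = columnValues(rows, "ecpm_all")
        println(if (missing.isEmpty) "ecpm_all\t100\t0" else missing.mkString(",")) // empty column -> "0" line
        println(cpa.mkString(","))                                                  // 2.5
      }
    }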

+ 118 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala

@@ -0,0 +1,118 @@
+package com.aliyun.odps.spark.examples.makedata_ad
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_ad_33: bucketize ad sample features with 20240622_ad_bucket_249.txt and emit "label \t name:value ..." training rows.
+ */
+
+object makedata_ad_33_bucketData_20240622 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240622_ad_bucket_249.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "200").toInt
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12").contains(apptype)
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("ad_is_conversion", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  if (score > 1E-8) {
+                    if (bucketsMap.contains(name)){
+                      val (_, buckets) = bucketsMap(name)
+                      val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                      name + ":" + scoreNew.toString
+                    }else{
+                      name + ":" + score.toString
+                    }
+                  } else {
+                    ""
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
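The encoding above differs from makedata_16 in two details: a feature with no learned boundaries keeps its raw score, and the normaliser is buckets.length + 1 rather than the nominal bucket count. A sketch of that branch; findPos is again an assumed stand-in for ExtractorUtils.findInsertPosition:

    // Sketch of the bucket-or-raw encoding above.
    object AdBucketEncodeSketch {
      def findPos(buckets: Array[Double], score: Double): Int =
        buckets.count(_ < score) // assumed equivalent for sorted boundaries

      def encode(name: String, score: Double, bucketsMap: Map[String, Array[Double]]): String =
        if (score <= 1e-8) ""
        else bucketsMap.get(name) match {
          case Some(buckets) =>
            val scoreNew = 1.0 / (buckets.length + 1) * (findPos(buckets, score) + 1.0)
            s"$name:$scoreNew"
          case None => s"$name:$score" // no boundaries learned for this feature -> keep the raw value
        }

      def main(args: Array[String]): Unit = {
        val bucketsMap = Map("cpa" -> Array(1.0, 2.0, 4.0))
        println(encode("cpa", 2.5, bucketsMap))      // cpa:0.75
        println(encode("cid_123", 1.0, bucketsMap))  // cid_123:1.0
      }
    }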

+ 246 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyDateUtils.scala

@@ -0,0 +1,246 @@
+package com.aliyun.odps.spark.examples.myUtils
+import java.text.SimpleDateFormat
+import java.util.{Calendar, Date}
+
+import org.apache.commons.lang.time.DateUtils
+import org.apache.commons.lang3.time.DateUtils.addDays
+
+import scala.collection.mutable.ArrayBuffer
+
+object MyDateUtils {
+
+  val date_sdf = getYesterday()
+  val date_sdf_ = getYesterday_()
+  val date_sdf_full = ""
+
+
+
+
+  // today's date (yyyyMMdd)
+  def getNowDate(): String = {
+    var now: Date = new Date()
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
+    var hehe = dateFormat.format(now)
+    hehe
+  }
+  def getNowDate_(): String = {
+    var now: Date = new Date()
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    var hehe = dateFormat.format(now)
+    hehe
+  }
+
+  // yesterday's date (yyyyMMdd)
+  def getYesterday(): String = {
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
+    var cal: Calendar = Calendar.getInstance()
+    cal.add(Calendar.DATE, -1)
+    var yesterday = dateFormat.format(cal.getTime())
+    yesterday
+  }
+  def getYesterday_(): String = {
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    var cal: Calendar = Calendar.getInstance()
+    cal.add(Calendar.DATE, -1)
+    var yesterday = dateFormat.format(cal.getTime())
+    yesterday
+  }
+
+  // first day (Monday) of the current week
+  def getNowWeekStart(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance()
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    cal.set(Calendar.DAY_OF_WEEK, Calendar.MONDAY)
+    // date of this week's Monday
+    period = df.format(cal.getTime())
+    period
+  }
+
+  // last day (Sunday) of the current week
+  def getNowWeekEnd(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance();
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    cal.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY); // this gives last week's Sunday, since Sunday is treated as the first day of the week
+    cal.add(Calendar.WEEK_OF_YEAR, 1) // add one week to get this week's Sunday
+    period = df.format(cal.getTime())
+    period
+  }
+
+  // first day of the current month
+  def getNowMonthStart(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance();
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    cal.set(Calendar.DATE, 1)
+    period = df.format(cal.getTime()) // first day of the month
+    period
+  }
+
+  // last day of the current month
+  def getNowMonthEnd(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance();
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    cal.set(Calendar.DATE, 1)
+    cal.roll(Calendar.DATE, -1)
+    period = df.format(cal.getTime()) // last day of the month
+    period
+  }
+
+  // "秒"时间戳 转 日期
+  def DateFormat(time:String):String={
+    var sdf:SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    var date:String = sdf.format(new Date((time.toLong*1000l)))
+    date
+  }
+
+  // "秒"时间戳 转 日期
+  def DateFormat_yyyyMMdd(time:String):String={
+    var sdf:SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
+    var date:String = sdf.format(new Date((time.toLong*1000l)))
+    date
+  }
+
+  // "秒"时间戳 转 当天时间
+  def timeFormat(time:String):String={
+    var sdf:SimpleDateFormat = new SimpleDateFormat("HH:mm:ss")
+    var date:String = sdf.format(new Date((time.toLong*1000l)))
+    date
+  }
+
+  // "yyyy-MM-dd HH:mm:ss" string -> epoch seconds
+  def tranTimeToLong(tm:String) :Long={
+    val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+    val dt = fm.parse(tm)
+    val aa = fm.format(dt)
+    val tim: Long = dt.getTime()
+    tim / 1000
+  }
+
+  // "yyyyMMdd" date string -> epoch seconds
+  def tranTimeString_yyyyMMdd_ToLong(tm:String) :Long={
+    val fm = new SimpleDateFormat("yyyyMMdd")
+    val dt = fm.parse(tm)
+    val aa = fm.format(dt)
+    val tim: Long = dt.getTime()
+    tim / 1000
+  }
+
+  // epoch millis -> Date
+  def formatDateMillToMut(mill:Long)= {
+    val date = new Date(mill)
+    date
+  }
+
+  // date shifting
+  def getNumDaysBefore(dt:String,num:Int, pattern:String = "yyyyMMdd"): String ={
+    val sdf = new SimpleDateFormat(pattern)
+    val enddate= sdf.parse(dt)
+    val rightNow = Calendar.getInstance()
+    rightNow.setTime(enddate)
+    rightNow.add(Calendar.DAY_OF_YEAR,-num);// go back num days
+    val begindate =rightNow.getTime()
+    val time_begin = sdf.format(begindate)
+    time_begin
+  }
+
+  def getNumDaysAfter(dt:String,num:Int, pattern:String = "yyyyMMdd"): String ={
+    val sdf = new SimpleDateFormat(pattern)
+    val enddate= sdf.parse(dt)
+    val rightNow = Calendar.getInstance()
+    rightNow.setTime(enddate)
+    rightNow.add(Calendar.DAY_OF_YEAR, num) // go forward num days
+    val begindate =rightNow.getTime()
+    val time_begin = sdf.format(begindate)
+    time_begin
+  }
+
+  // "20190101" -> "2019-01-01"
+  def dt2Dt(dt:String) : String={
+    dt.substring(0, 4) + "-" + dt.substring(4, 6) +"-" +dt.substring(6, 8)
+  }
+
+  // Date-range generation 1: every "yyyyMMdd" day from beginStr to endStr, inclusive
+  def fromBeginDate2EndDate(beginStr:String, endStr:String): Array[String] ={
+    val date_format = new SimpleDateFormat("yyyyMMdd")
+    var from = DateUtils.parseDate(beginStr, Array[String]("yyyyMMdd"))
+    val to = DateUtils.parseDate(endStr, Array[String]("yyyyMMdd"))
+    var result = new ArrayBuffer[String]()
+    while (from.compareTo(to) <= 0) {
+      val dateStr = date_format.format(from)
+      result.append(dateStr)
+      from = DateUtils.addDays(from, 1)
+    }
+    result.toArray
+  }
+  // Date-range generation 2: same idea, with a configurable pattern
+  def getDateRange(beginStr: String, endStr: String, format: String = "yyyyMMdd"): ArrayBuffer[String] = {
+    val ranges = ArrayBuffer[String]()
+    val sdf = new SimpleDateFormat(format)
+    var dateBegin = sdf.parse(beginStr)
+    var dateEnd = sdf.parse(endStr)
+    while (dateBegin.compareTo(dateEnd) <= 0) {
+      ranges += sdf.format(dateBegin)
+      dateBegin = addDays(dateBegin, 1)
+    }
+    ranges
+  }
+
+  // Date+hour range generation, stepping one hour at a time
+  def getDateHourRange(beginStr: String, endStr: String, format: String = "yyyyMMddHH"): ArrayBuffer[String] = {
+    val ranges = ArrayBuffer[String]()
+    val sdf = new SimpleDateFormat(format)
+    var dateBegin = sdf.parse(beginStr)
+    val dateEnd = sdf.parse(endStr)
+
+    while (dateBegin.compareTo(dateEnd) <= 0) {
+      ranges += sdf.format(dateBegin)
+      // advance the cursor by one hour
+      dateBegin = addHours(dateBegin, 1)
+    }
+    ranges
+  }
+
+  import java.util.Date
+
+  // Helper: add a number of hours to a given Date
+  def addHours(date: Date, hours: Int): Date = {
+    val cal = Calendar.getInstance()
+    cal.setTime(date)
+    cal.add(java.util.Calendar.HOUR_OF_DAY, hours)
+    cal.getTime
+  }
+
+  import java.time.LocalDate
+  import java.time.temporal.ChronoUnit
+  def calculateDateDifference(startDate: String, endDate: String): Long = {
+    val start = LocalDate.parse(startDate, java.time.format.DateTimeFormatter.BASIC_ISO_DATE)
+    val end = LocalDate.parse(endDate, java.time.format.DateTimeFormatter.BASIC_ISO_DATE)
+    val days = ChronoUnit.DAYS.between(start, end)
+    days
+  }
+
+
+  def main(args: Array[String]): Unit = {
+//    var from = DateUtils.parseDate("2019-09-01", Array[String]("yyyy-MM-dd"))
+//    var to = DateUtils.parseDate("2019-09-10", Array[String]("yyyy-MM-dd"))
+//
+//    val a = from.getTime / 3600
+//    val b = to.getTime / 3600
+//    println(b-a)
+
+    var from = getDateHourRange("2024050123", "2024050203")
+    from.foreach(println)
+
+    val partitionPrefix = "dt=%s,hh=%s"
+    println(partitionPrefix.format("XX", "YY"))
+
+    val stdxx = "2024050116"
+    val dt = stdxx.substring(0, 8)
+    val hh = stdxx.substring(8, 10)
+    println(dt)
+    println(hh)
+  }
+}
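A minimal usage sketch (illustrative only, not part of the diff above), assuming MyDateUtils sits in the com.aliyun.odps.spark.examples.myUtils package like the other helpers in this commit: it enumerates hourly values with getDateHourRange and turns each "yyyyMMddHH" string into the dt/hh partition spec that main() experiments with.

    import com.aliyun.odps.spark.examples.myUtils.MyDateUtils

    object PartitionSpecSketch {
      def main(args: Array[String]): Unit = {
        // one entry per hour: "2024050123", "2024050200", ..., "2024050203"
        val hours = MyDateUtils.getDateHourRange("2024050123", "2024050203")
        // split each value into its date and hour parts and build a partition spec
        val specs = hours.map(s => "dt=%s,hh=%s".format(s.substring(0, 8), s.substring(8, 10)))
        specs.foreach(println) // dt=20240501,hh=23 ... dt=20240502,hh=03
      }
    }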

+ 148 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyHdfsUtils.scala

@@ -0,0 +1,148 @@
+package com.aliyun.odps.spark.examples.myUtils
+
+/**
+ * Author: zhangbo58
+ * Description:
+ *
+ */
+import org.apache.commons.lang.time.DateUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.permission.FsPermission
+import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
+
+import scala.collection.mutable.ArrayBuffer
+
+object MyHdfsUtils {
+  def main(args: Array[String]): Unit = {
+    val path = "zhangbo58/"
+    // exercise the helpers on a sample path
+    println("first-level files and directories under the path:")
+    getFilesAndDirs(path).foreach(println)
+    println("first-level files under the path:")
+    getFiles(path).foreach(println)
+    println("first-level directories under the path:")
+    getDirs(path).foreach(println)
+    println("all files under the path, recursively:")
+    getAllFiles(path).foreach(println)
+  }
+
+  def getHdfs(path: String): FileSystem = {
+    val conf = new Configuration()
+    //    FileSystem.get(URI.create(path), conf)
+    val fs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+    fs
+  }
+
+  // first-level files and directories under a path
+  def getFilesAndDirs(path: String): Array[Path] = {
+    val fs = getHdfs(path).listStatus(new Path(path))
+    FileUtil.stat2Paths(fs)
+  }
+  // first-level files under a path
+  def getFiles(path: String): Array[String] = {
+    getFilesAndDirs(path).filter(getHdfs(path).getFileStatus(_).isFile())
+      .map(_.toString)
+  }
+  // first-level directories under a path
+  def getDirs(path: String): Array[String] = {
+    getFilesAndDirs(path).filter(getHdfs(path).getFileStatus(_).isDirectory)
+      .map(_.toString)
+  }
+  // all files under a path, recursively
+  def getAllFiles(path: String): ArrayBuffer[String] = {
+    val arr = ArrayBuffer[String]()
+    val hdfs = getHdfs(path)
+    val getPath = getFilesAndDirs(path)
+    getPath.foreach(patha => {
+      if (hdfs.getFileStatus(patha).isFile())
+        arr += patha.toString
+      else {
+        arr ++= getAllFiles(patha.toString())
+      }
+    })
+    arr
+  }
+  def ifHDFSHasData(path: String): Boolean = {
+    val hdfs_path = new org.apache.hadoop.fs.Path(path.toString)
+    val hdfs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+
+    var rst = false
+    if (hdfs.exists(hdfs_path)) {
+      // the path exists; check that it actually holds non-empty data files
+      val statusList = hdfs.listStatus(hdfs_path)
+      for (status <- statusList if !rst && (status.getPath.toString.contains("part-") || status.getPath.toString.contains("_SUCCESS"))) {
+        if (status.getLen > 0) {
+          rst = true
+        }
+      }
+    }
+    rst
+  }
+
+  def delete_hdfs_path(path: String): Unit = {
+    val hdfs_path = new org.apache.hadoop.fs.Path(path.toString)
+    val hdfs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+
+    if (hdfs.exists(hdfs_path)) {
+      hdfs.delete(hdfs_path, true)
+    }
+  }
+
+  def hdfs_exits(path:String): Boolean = {
+    val hdfs_path = new org.apache.hadoop.fs.Path(path.toString)
+    val hdfs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+
+    hdfs.exists(hdfs_path)
+  }
+
+
+  /* Delete all data under a path whose date falls outside the kept window.
+   * Example: keepDays=2, dateStr=20191015 => keep the 20191015 and 20191014 partitions and delete the rest.
+   */
+  def hdfs_delete_not_keep_days(
+                                 path:String,
+                                 keepDays:Int,
+                                 dateStr:String,
+                                 pattern:String = "yyyyMMdd"
+                               ): Unit ={
+    val file_list = this.getFiles(path)
+    println("hdfs_delete_not_keep_days-file_list")
+    file_list.foreach(println)
+
+    for (file <- file_list){
+      var flag = true
+      val date_early = MyDateUtils.getNumDaysBefore(dateStr, keepDays, pattern)
+      try{
+        val file_split_strs = file.split("/")
+        val len = file_split_strs.length
+        var file_date = file_split_strs(len-1)
+        if (file_date.equals("")){
+          file_date = file_split_strs(len-2)
+        }
+        var date1 = DateUtils.parseDate(file_date, Array[String](pattern)) // date parsed from the file path
+        var date2 = DateUtils.parseDate(date_early, Array[String](pattern)) // cutoff: dateStr minus keepDays
+        if (date1.compareTo(date2) >= 0){ // inside the kept window: do not delete
+          flag = false
+        }
+      }catch {
+        case e:Exception =>
+          flag = false
+      }
+
+      if (flag){
+        MyHdfsUtils.delete_hdfs_path(file.toString)
+      }
+    }
+  }
+
+  /**
+   * @Author: zhangbo
+   * @Description: grant open (777) permissions on an HDFS path
+   *
+   */
+
+  def give_hdfs_permission(path:String): Unit ={
+    getHdfs(path).setPermission(new Path(path), new FsPermission("777"))
+  }
+}
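A small illustrative sketch (not part of the diff) tying these helpers to the daily jobs recorded later in this commit: check that the input partition has data, clear the output path so the write is idempotent, and trim partitions older than the retention window. The concrete paths and the seven-day retention are assumptions of the example.

    import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils}

    object HdfsHousekeepingSketch {
      def main(args: Array[String]): Unit = {
        val dt = "20240615" // in the real jobs this comes in via key:value arguments
        val readPath = "/dw/recommend/model/13_sample_data/" + dt
        val savePath = "/dw/recommend/model/14_feature_data/" + dt
        if (MyHdfsUtils.ifHDFSHasData(readPath)) {
          MyHdfsUtils.delete_hdfs_path(savePath) // remove any partial output from a previous run
          // ... produce and write the new partition here ...
        }
        // keep only the most recent 7 daily partitions under the output root
        MyHdfsUtils.hdfs_delete_not_keep_days("/dw/recommend/model/14_feature_data/", 7, dt)
      }
    }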
+

+ 40 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala

@@ -0,0 +1,40 @@
+package com.aliyun.odps.spark.examples.myUtils
+
+import scala.collection.mutable
+object ParamUtils {
+  def parseArgs(args: Array[String]): mutable.HashMap[String, String] = {
+    println("args size:" + args.size)
+
+    val rst = new mutable.HashMap[String, String]() {
+      override def default(key: String) = "无参数传入"
+    }
+    for (a <- args) {
+      val key_val = a.split(":")
+      if (key_val.length >= 2) {
+        // if the same key appears more than once (e.g. glob-expanded hdfs paths), merge the values with a comma
+        if (rst.contains(key_val(0))) {
+          val value = rst.get(key_val(0)).get
+          val newValue = value + "," + key_val.splitAt(1)._2.mkString(":")
+          rst += (key_val(0) -> newValue)
+          println(key_val(0) + ":" + newValue)
+        } else {
+          rst += (key_val(0) -> key_val.splitAt(1)._2.mkString(":"))
+          println(key_val(0) + ":" + key_val.splitAt(1)._2.mkString(":"))
+        }
+      }
+    }
+    rst
+  }
+
+  def parseLogKey(logKey: String): Tuple7[String, String, String, String, String, String, String] = {
+    val l = logKey.split(":")
+    val mid = l(0)
+    val videoid = l(1)
+    val logtimestamp = l(2)
+    val apptype = l(3)
+    val pagesource_change = l(4)
+    val abcode = l(5)
+    val video_recommend = l(6)
+    (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend)
+  }
+}
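An illustrative sketch (not part of the diff) of how parseArgs consumes the "key:value" tokens that the spark-submit commands later in this commit pass on the command line; values that themselves contain ':' stay intact because everything after the first token is re-joined.

    import com.aliyun.odps.spark.examples.myUtils.ParamUtils

    object ParamUtilsSketch {
      def main(argv: Array[String]): Unit = {
        val args = Array("beginStr:20240615", "endStr:20240615", "savePath:/dw/recommend/model/16_train_data/")
        val param = ParamUtils.parseArgs(args)
        println(param.getOrElse("beginStr", "20240615")) // 20240615
        println(param.getOrElse("savePath", ""))         // /dw/recommend/model/16_train_data/
        println(param("missingKey"))                     // falls back to the map's default value

        // logKey layout: mid:videoid:logtimestamp:apptype:pagesource_change:abcode:video_recommend
        val (mid, vid, ts, _, _, _, _) =
          ParamUtils.parseLogKey("someMid:12345:1718000000:0:recommend:ab0:1")
        println(mid + " " + vid + " " + ts)
      }
    }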

+ 39 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/env.scala

@@ -0,0 +1,39 @@
+package com.aliyun.odps.spark.examples.myUtils
+
+import org.apache.spark.SparkContext
+import org.apache.spark.aliyun.odps.OdpsOps
+import org.springframework.data.redis.connection.RedisStandaloneConfiguration
+import org.springframework.data.redis.connection.jedis.JedisConnectionFactory
+import org.springframework.data.redis.core.RedisTemplate
+import org.springframework.data.redis.serializer.StringRedisSerializer
+import examples.dataloader.redisBuilderMyself
+
+object env {
+  def getODPS(sparkContext: SparkContext): OdpsOps = {
+    val accessKeyId = "LTAIWYUujJAm7CbH"
+    val accessKeySecret = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
+    val odpsUrl = "http://service.odps.aliyun.com/api"
+    val tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com"
+
+    OdpsOps(sparkContext, accessKeyId, accessKeySecret, odpsUrl, tunnelUrl)
+  }
+
+  def getRedisTemplate(): RedisTemplate[String, String] = {
+    // shared Redis template
+    val redisSC = new RedisStandaloneConfiguration
+    redisSC.setPort(6379)
+    redisSC.setPassword("Wqsd@2019")
+    redisSC.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com")
+    val jedisCF = new JedisConnectionFactory(redisSC)
+    jedisCF.afterPropertiesSet()
+    val redisTemplate = new RedisTemplate[String, String]
+    redisTemplate.setDefaultSerializer(new StringRedisSerializer)
+    redisTemplate.setConnectionFactory(jedisCF)
+    redisTemplate.afterPropertiesSet()
+    redisTemplate
+  }
+
+  def getRedisTemplatev2(): RedisTemplate[String, String] = {
+    redisBuilderMyself.redisTemplate(redisBuilderMyself.redisConnectionFactory())
+  }
+}
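A short illustrative sketch (not part of the diff) of how a job obtains these handles; the SparkSession bootstrap and the sample key are assumptions of the example.

    import org.apache.spark.sql.SparkSession
    import com.aliyun.odps.spark.examples.myUtils.env

    object EnvSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("EnvSketch").getOrCreate()
        val odpsOps = env.getODPS(spark.sparkContext) // handle for reading/writing MaxCompute tables
        val redisTemplate = env.getRedisTemplate()    // shared String/String template from above
        redisTemplate.opsForValue().set("demo:key", "demo:value") // illustrative write only
        spark.stop()
      }
    }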

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala

@@ -55,7 +55,7 @@ object SparkSQL {
 
     // 写 普通表
     df.write.insertInto(tableName) // insertInto语义
-    df.writeTo(tableName).overwritePartitions() // insertOverwrite use datasourceV2
+//    df.writeTo(tableName).overwritePartitions() // insertOverwrite use datasourceV2
 
     // 写 分区表
     // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区

+ 161 - 0
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本

@@ -0,0 +1,161 @@
+
+[New upstream samples]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_10_originData_v3 \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 64 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 savePath:/dw/recommend/model/10_sample_data_v3/ beginStr:20240227 endStr:20240227 > p10_.log 2>&1 &
+
+[ROS sample generation]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_12_rosData_v3 \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/12_ros_data_v3/ beginStr:20240228 endStr:20240228 ifRepart:10 \
+> p12_1.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_12_rosData_v3_noweight \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/12_ros_data_v3_noweight/ beginStr:20240222 endStr:20240226 ifRepart:10 \
+> p12_2.log 2>&1 &
+
+[STR sample generation]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_11_strData_v3 \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 64 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/11_str_data_v3/ beginStr:20240227 endStr:20240227 ifRepart:100 \
+> p11.log 2>&1 &
+
+
+[Write user features to Redis]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_09_user2redis_freq \
+--name makedata_09_user2redis_freq \
+--master yarn --driver-memory 1G --executor-memory 4G --executor-cores 1 --num-executors 32 \
+--conf spark.yarn.executor.memoryoverhead=1024 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+date:20240302 tablePart:96 expireDay:3 ifWriteRedisUser:True ifUser:True midDays:14 redisLimit:80000000 \
+savePathUser:/dw/recommend/model/09_feature/user/ > p09.log 2>&1 &
+
+
+
+--------------
+[Old STR upstream samples]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_06_originData \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 32 \
+--conf spark.yarn.executor.memoryoverhead=1024 \
+--conf spark.shuffle.service.enabled=true \
+--conf spark.shuffle.service.port=7337 \
+--conf spark.shuffle.consolidateFiles=true \
+--conf spark.shuffle.manager=sort \
+--conf spark.storage.memoryFraction=0.4 \
+--conf spark.shuffle.memoryFraction=0.5 \
+--conf spark.default.parallelism=200 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 savePath:/dw/recommend/model/00_sample_data/ beginStr:20240311 endStr:20240312 > p6.log 2>&1 &
+[Old STR training data]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_07_strData \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/04_str_data/ beginStr:20240311 endStr:20240312 featureVersion:v4 ifRepart:100 \
+> p7.log 2>&1 &
+
+---
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_13_originData_20240529 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024061600 endStr:2024061623 \
+savePath:/dw/recommend/model/13_sample_data/ \
+table:alg_recsys_sample_all \
+> p13_2024061600.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_14_valueData_20240608 \
+--master yarn --driver-memory 1G --executor-memory 3G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/13_sample_data/ \
+savePath:/dw/recommend/model/14_feature_data/ \
+beginStr:20240615 endStr:20240615 repartition:1000 \
+> p14_data_check.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_15_bucket_20240608 \
+--master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+--conf spark.driver.maxResultSize=16G \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/14_feature_data/20240606/ fileName:20240606_200_v3 \
+bucketNum:200 sampleRate:0.1 \
+> p15_data2.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_16_bucketData_20240609 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:20240615 endStr:20240615 repartition:1000 \
+> p16_data.log 2>&1 &
+
+
+/dw/recommend/model/13_sample_data/
+/dw/recommend/model/14_feature_data/
+/dw/recommend/model/16_train_data/
+
+-----
+Run only this one when using the features printed online:
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_13_originData_20240529_check \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024061500 endStr:2024061523 \
+savePath:/dw/recommend/model/13_sample_data_check_print/ \
+table:alg_recsys_sample_all_new \
+> p13_2024061500_check.log 2>&1 &
+
+Run both of these to filter out unwanted samples:
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_16_bucketData_20240609_check \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/14_feature_data_check_print/ \
+savePath:/dw/recommend/model/16_train_data_check_print/ \
+beginStr:20240615 endStr:20240615 repartition:1000 \
+> p16_data_check.log 2>&1 &
+
+/dw/recommend/model/13_sample_data_check/
+/dw/recommend/model/13_sample_data_check_print/
+/dw/recommend/model/14_feature_data_check/
+/dw/recommend/model/14_feature_data_check_print/
+/dw/recommend/model/16_train_data_check/
+/dw/recommend/model/16_train_data_check_print/
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_17_bucketDataPrint_20240617 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+> p17_data_check.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_18_mergehour2day_20240617 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+> p18_data_check.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_17_bucketDataPrint_20240617 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:2024061800 endStr:2024061814 \
+readDate:20240618 \
+> p17_data_check.log 2>&1 &

+ 34 - 0
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -0,0 +1,34 @@
+
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_31_originData_20240620 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024062009 endStr:2024062023 \
+savePath:/dw/recommend/model/31_ad_sample_data/ \
+table:alg_recsys_ad_sample_all \
+> p31_2024062008.log 2>&1 &
+
+
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_32_bucket_20240622 \
+--master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+--conf spark.driver.maxResultSize=16G \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+> p32_data.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_33_bucketData_20240622 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:20240620 endStr:20240620 repartition:400 \
+> p33_data.log 2>&1 &
+
+
+/dw/recommend/model/31_ad_sample_data/
+/dw/recommend/model/33_ad_train_data/

+ 8 - 0
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本【分析】

@@ -0,0 +1,8 @@
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.ana.ana_01_cidvidpk \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:5 \
+beginStr:2024060211 endStr:2024060211 \
+vidSelect:21006075 cidsSelect:1155,1902 apptype:0 \
+> p01_ana.log 2>&1 &

+ 29 - 0
zhangbo/00_copy.sh

@@ -0,0 +1,29 @@
+#!/bin/sh
+
+#MVN_PACKAGE="mvn clean install  -T 2C -Dmaven.test.skip=true -Dmaven.compile.fork=true"
+JAVA_PATH="/usr/bin/java"
+PYTHON_PATH="/usr/bin/python"
+UPLOAD_PY_PATH="/root/algo/upload.py"
+JAR_PATH="/root/algo/recommend-server/recommend-server-service/target/recommend-server-service.jar"
+FM_PATH="/root/algo/alphaFM/bin"
+MODEL_PATH="/root/algo/LR_MODEL/"
+YESTERDAY="$(date -d '2 days ago' +%Y%m%d)"
+LAST30DAY="$(date -d '2 days ago' +%Y%m%d)"
+MAIN_CLASS="com.tzld.piaoquan.recommend.server.dataloader.OfflineShareSamplesLoader"
+TABLE_NAME="loghubods.alg_recsys_view_sample"
+LABEL="share_ornot"
+#OSSPATH=""
+
+
+# Train
+#mkdir -p ${MODEL_PATH}/${YESTERDAY}
+#${JAVA_PATH} -jar ${JAR_PATH} ${TABLE_NAME} ${LAST30DAY} ${YESTERDAY} ${LABEL} | ${FM_PATH}/fm_train -m ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt -dim 0,1,0 -core 8
+
+#cat ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt | awk -F " " '{print $1,"\t",$2}' > ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt
+
+# Upload
+#${UPLOAD_PY_PATH} ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt ${OSSPATH}
+
+# Predict
+java -jar ${JAR_PATH} $TABLE_NAME 20231211 20231211 ${LABEL}| ${FM_PATH}/fm_predict -m ${MODEL_PATH}/20231210/model_20231210.txt  -dim 0 -core 8 -out ${MODEL_PATH}/predict_1211.txt
+

+ 16 - 0
zhangbo/01_train.sh

@@ -0,0 +1,16 @@
+#!/bin/sh
+set -e
+set -x
+
+day=$1
+train_path=$2
+model_name=$3
+bias=$4 # 0,1,0 1,1,0
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+$HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_train -m model/${model_name}_${day}.txt -dim ${bias} -core 8
+# -v_l1 ${v_l1} -v_l2 ${v_l2}
+
+# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka8 1,1,8 >p1_model_aka8.log 2>&1 &
+# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka0 1,1,0 >p1_model_aka0.log 2>&1 &
+# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka4 1,1,4 >p1_model_aka4.log 2>&1 &

+ 25 - 0
zhangbo/02_train_go.sh

@@ -0,0 +1,25 @@
+#!/bin/sh
+set -ex
+
+start_date=$1
+end_date=$2
+model_name=$3
+MODEL_PATH="./model/"
+SAMPLE_PATH=$4
+bias=$5
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+FM_TRAIN="/root/sunmingze/alphaFM/bin/fm_train"
+
+current_date="$start_date"
+
+while [[ "$current_date" != "$end_date" ]]; do
+    echo -------"$current_date"----------
+
+    yesterday=$(date -d "$current_date - 1 day" +%Y%m%d)
+    echo model-day-$yesterday
+    echo data-day-$current_date
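+    # stream the day's samples from HDFS into alphaFM, warm-starting from yesterday's model via -im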
+    $HADOOP fs -text ${SAMPLE_PATH}/$current_date/* | ${FM_TRAIN} -m $MODEL_PATH/${model_name}_$current_date.txt -dim ${bias} -core 8 -im $MODEL_PATH/${model_name}_$yesterday.txt
+    current_date=$(date -d "$current_date + 1 day" +%Y%m%d)
+done
+
+# nohup sh 02_train_go.sh 20240615 20240616 model_aka8 /dw/recommend/model/16_train_data/ 1,1,8 >p2_model_aka8.log 2>&1 &

+ 33 - 0
zhangbo/03_predict.sh

@@ -0,0 +1,33 @@
+#!/bin/sh
+set -e
+set -x
+
+day=$1
+train_path=$2
+model_name=$3
+output_file=$4
+bias=$5
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+$HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_predict -m model/$model_name -dim ${bias} -core 8 -out predict/${output_file}_$day.txt
+cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
+
+
+# nohup sh 03_predict.sh 20240611 /dw/recommend/model/16_train_data/ model_aka0_20240610.txt model_aka0_20240610 0 >p3_model_aka0.log 2>&1 &
+# nohup sh 03_predict.sh 20240611 /dw/recommend/model/16_train_data/ model_aka4_20240610.txt model_aka4_20240610 4 >p3_model_aka4.log 2>&1 &
+# nohup sh 03_predict.sh 20240613 /dw/recommend/model/16_train_data/ model_aka8_20240612.txt model_aka8_20240612 8 >p3_model_aka8_12.log 2>&1 &
+
+
+# nohup sh 03_predict.sh 20240615 /dw/recommend/model/16_train_data_print_online_merge/ model_aka8_20240608.txt model_aka8_20240608 8 >p3_model_aka8_on.log 2>&1 &
+
+
+
+
+# cat tmpfile | /root/sunmingze/alphaFM/bin/fm_predict -m model/model_aka8_20240608.txt -dim 8 -core 1 -out tmpfile_out.txt
+
+
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v1/ model_aka8_20240608.txt v1 8 >v1.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v2/ model_aka8_20240608.txt v2 8 >v2.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v3/ model_aka8_20240608.txt v3 8 >v3.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v4/ model_aka8_20240608.txt v4 8 >v4.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v5/ model_aka8_20240608.txt v4 8 >v5.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v6/ model_aka8_20240608.txt v4 8 >v6.log 2>&1 &

+ 25 - 0
zhangbo/04_upload.sh

@@ -0,0 +1,25 @@
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240313.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240313_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240313_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/video_str_model/model_str_mid.txt
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka0_20240608.txt | awk -F " " '{print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka0_20240608_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka0_20240608_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_aka0.txt
+
+
+
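+# keep at most the first 10 space-separated fields of each alphaFM model line (feature name, weight, leading factor values), tab-separated, before uploading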
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608.txt |
+awk -F " " '{
+    if (NR == 1) {
+        print $1"\t"$2
+    } else {
+        split($0, fields, " ");
+        OFS="\t";
+        line=""
+        for (i = 1; i <= 10 && i <= length(fields); i++) {
+            line = (line ? line "\t" : "") fields[i];
+        }
+        print line
+    }
+}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt
+
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_aka8.txt

+ 151 - 0
zhangbo/05_update_everyday_2model.sh

@@ -0,0 +1,151 @@
+#!/bin/sh
+set -ex
+# 0 Global variables / parameters
+samplePath=/dw/recommend/model/10_sample_data_v3/
+savePath=/dw/recommend/model/12_ros_data_v3/
+model_name=model_jerry
+today="$(date +%Y%m%d)"
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+yesterday="$(date -d '1 days ago' +%Y%m%d)"
+
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+FM_TRAIN="/root/sunmingze/alphaFM/bin/fm_train"
+MODEL_PATH="/root/zhangbo/recommend-emr-dataprocess/zhangbo/model/"
+OSS_PATH="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/"
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# 0 Wait for the upstream table to finish, at most until 11:00
+source /root/anaconda3/bin/activate py37
+max_hour=11
+max_minute=00
+while true; do
+  python_return_code=$(python utils.py --excute_program check_hive --partition ${today_early_1} --project loghubods --table alg_recsys_view_sample_v3)
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+#conda deactivate
+
+# 1 Generate data
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_10_originData_v3 \
+--name every_day_origindata_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:32 savePath:${samplePath} beginStr:${today_early_1} endStr:${today_early_1}
+if [ $? -eq 1 ]; then
+    echo "Spark原始样本生产任务执行失败"
+    exit 1
+else
+    echo "spark原始样本生产执行成功"
+fi
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_12_rosData_v3 \
+--name makedata_12_rosData_v3_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:${samplePath} savePath:${savePath} beginStr:${today_early_1} endStr:${today_early_1} ifRepart:10
+if [ $? -eq 1 ]; then
+    echo "Spark训练样本-生产任务执行失败-ros"
+    exit 1
+else
+    echo "spark训练样本-生产执行成功-ros"
+fi
+
+# 2 Load the previous model, train on this round's data, save this round's model
+end_date=${today}
+loop_date=${yesterday}
+while [[ "$loop_date" != "$end_date" ]]; do
+    echo -------train ${loop_date}----------
+    loop_date_model=$(date -d "$loop_date - 1 day" +%Y%m%d)
+    $HADOOP fs -text ${savePath}/dt=${loop_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${loop_date}.txt \
+-dim 0,1,0 -core 8 -im ${MODEL_PATH}/${model_name}_${loop_date_model}.txt
+    if [ $? -eq 1 ]; then
+        echo "训练失败"
+        exit 1
+    fi
+    echo -------save ${MODEL_PATH}/${model_name}_${loop_date}.txt----------
+    loop_date=$(date -d "$loop_date + 1 day" +%Y%m%d)
+done
+
+# 3 Convert this round's model format
+cat ${MODEL_PATH}/${model_name}_${today_early_1}.txt \
+| sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' \
+> ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt
+
+# 4 Upload the converted model to OSS
+online_model_path=${OSS_PATH}/${model_name}.txt
+$HADOOP fs -test -e ${online_model_path}
+if [ $? -eq 0 ]; then
+    echo "数据存在, 先删除。"
+    $HADOOP fs -rm -r ${online_model_path}
+else
+    echo "数据不存在"
+fi
+$HADOOP fs -put ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt ${online_model_path}
+
+
+# 5 Generate STR data
+savePath=/dw/recommend/model/11_str_data_v3/
+model_name=model_tom
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_11_strData_v3 \
+--name makedata_11_strData_v3_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 64 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:${samplePath} savePath:${savePath} beginStr:${today_early_1} endStr:${today_early_1} ifRepart:100
+if [ $? -eq 1 ]; then
+    echo "Spark训练样本-生产任务执行失败-str"
+    exit 1
+else
+    echo "spark训练样本-生产执行成功-str"
+fi
+# 6 Load the previous model, train on this round's data, save this round's model
+end_date=${today}
+loop_date=${yesterday}
+while [[ "$loop_date" != "$end_date" ]]; do
+    echo -------train ${loop_date}----------
+    loop_date_model=$(date -d "$loop_date - 1 day" +%Y%m%d)
+    $HADOOP fs -text ${savePath}/dt=${loop_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${loop_date}.txt \
+-dim 0,1,0 -core 8 -im ${MODEL_PATH}/${model_name}_${loop_date_model}.txt
+    if [ $? -eq 1 ]; then
+        echo "训练失败"
+        exit 1
+    fi
+    echo -------save ${MODEL_PATH}/${model_name}_${loop_date}.txt----------
+    loop_date=$(date -d "$loop_date + 1 day" +%Y%m%d)
+done
+
+# 7 Convert this round's model format
+cat ${MODEL_PATH}/${model_name}_${today_early_1}.txt \
+| sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' \
+> ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt
+
+# 8 Upload the converted model to OSS
+online_model_path=${OSS_PATH}/${model_name}.txt
+$HADOOP fs -test -e ${online_model_path}
+if [ $? -eq 0 ]; then
+    echo "数据存在, 先删除。"
+    $HADOOP fs -rm -r ${online_model_path}
+else
+    echo "数据不存在"
+fi
+$HADOOP fs -put ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt ${online_model_path}
+
+
+# nohup sh 05_update_everyday_2model.sh > p5.log 2>&1 &

+ 107 - 0
zhangbo/05_update_everyday_str.sh

@@ -0,0 +1,107 @@
+#!/bin/sh
+set -ex
+# 0 Global variables / parameters
+samplePath=/dw/recommend/model/00_sample_data/
+savePath=/dw/recommend/model/04_str_data/
+model_name=model_str_mid
+today="$(date +%Y%m%d)"
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+yesterday="$(date -d '1 days ago' +%Y%m%d)"
+
+#today=20240129
+#today_early_1=20240128
+#yesterday=20240128
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+FM_TRAIN="/root/sunmingze/alphaFM/bin/fm_train"
+MODEL_PATH="/root/zhangbo/recommend-emr-dataprocess/zhangbo/model/"
+OSS_PATH="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/video_str_model/"
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# 0 Wait for the upstream table to finish, at most until 11:00 (max_hour below)
+source /root/anaconda3/bin/activate py37
+max_hour=11
+max_minute=00
+while true; do
+  python_return_code=$(python utils.py --excute_program check_origin_hive --partition ${today_early_1})
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+#conda deactivate
+
+# 1 Generate data
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_06_originData \
+--name every_day_origindata_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:32 savePath:${samplePath} beginStr:${today_early_1} endStr:${today_early_1}
+if [ $? -eq 1 ]; then
+    echo "Spark原始样本生产任务执行失败"
+    exit 1
+else
+    echo "spark原始样本生产执行成功"
+fi
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_07_strData \
+--name every_day_strdata_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:${samplePath} savePath:${savePath} beginStr:${today_early_1} endStr:${today_early_1} featureVersion:v4 ifRepart:100
+if [ $? -eq 1 ]; then
+    echo "Spark训练样本生产任务执行失败"
+    exit 1
+else
+    echo "spark训练样本生产执行成功"
+fi
+
+
+
+# 2 Load the previous model, train on this round's data, save this round's model
+end_date=${today}
+loop_date=${yesterday}
+while [[ "$loop_date" != "$end_date" ]]; do
+    echo -------train ${loop_date}----------
+    loop_date_model=$(date -d "$loop_date - 1 day" +%Y%m%d)
+    $HADOOP fs -text ${savePath}/dt=${loop_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${loop_date}.txt \
+-dim 1,1,0 -core 8 -im ${MODEL_PATH}/${model_name}_${loop_date_model}.txt
+    if [ $? -eq 1 ]; then
+        echo "训练失败"
+        exit 1
+    fi
+    echo -------save ${MODEL_PATH}/${model_name}_${loop_date}.txt----------
+    loop_date=$(date -d "$loop_date + 1 day" +%Y%m%d)
+done
+
+# 3 Convert this round's model format
+cat ${MODEL_PATH}/${model_name}_${today_early_1}.txt \
+| sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' \
+> ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt
+
+# 4 Upload the converted model to OSS
+online_model_path=${OSS_PATH}/${model_name}.txt
+$HADOOP fs -test -e ${online_model_path}
+if [ $? -eq 0 ]; then
+    echo "数据存在, 先删除。"
+    $HADOOP fs -rm -r ${online_model_path}
+else
+    echo "数据不存在"
+fi
+$HADOOP fs -put ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt ${online_model_path}
+
+
+#nohup sh 05_update_everyday_str.sh > p.log 2>&1 &

+ 124 - 0
zhangbo/06_update_everyday_feature.sh

@@ -0,0 +1,124 @@
+#!/bin/sh
+set -ex
+# 0 Global variables / parameters
+today="$(date +%Y%m%d)"
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+max_hour=11
+max_minute=00
+
+# 0 Wait for the upstream table to finish, at most until 11:00 (max_hour above)
+source /root/anaconda3/bin/activate py37
+while true; do
+  python_return_code=$(python utils.py --excute_program check_item_hive --partition ${today_early_1})
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+conda deactivate
+# 1 Generate item data
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_08_item2redis \
+--name makedata_08_item2redis_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+date:${today_early_1} tablePart:32 expireDay:4 ifDebug:False \
+ifVideo:True ifWriteRedis:True savePathVideo:/dw/recommend/model/09_feature/video
+
+if [ $? -eq 1 ]; then
+    echo "---------item写入redis执行失败---------"
+    exit 1
+else
+    echo "---------item写入redis执行成功---------"
+fi
+
+# 2 Check the user upstream table
+source /root/anaconda3/bin/activate py37
+while true; do
+  python_return_code=$(python utils.py --excute_program check_user_hive --partition ${today_early_1})
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+
+# 3 Check the mid_uid upstream table
+while true; do
+  python_return_code=$(python utils.py --excute_program check_hive --partition ${today_early_1} --project loghubods --table mid_uid)
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。上游表loghubods.mid_uid=${today_early_1} 已生产完毕"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。上游表loghubods.mid_uid=${today_early_1} 未完成"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+
+conda deactivate
+# 4 Generate user data
+#/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+#--class com.aliyun.odps.spark.examples.makedata.makedata_09_user2redis \
+#--name makedata_09_user2redis_${today} \
+#--master yarn --driver-memory 1G --executor-memory 4G --executor-cores 1 --num-executors 32 \
+#--conf spark.yarn.executor.memoryoverhead=1024 \
+#/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+#date:${today_early_1} tablePart:32 expireDay:3 ifDebug:False \
+#ifUser:True ifDeleteRedisUser:False ifWriteRedisUser:True sampleRate:1.0 midDays:7 \
+#savePathUser:/dw/recommend/model/feature/user/
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_09_user2redis_freq \
+--name makedata_09_user2redis_freq \
+--master yarn --driver-memory 1G --executor-memory 4G --executor-cores 1 --num-executors 32 \
+--conf spark.yarn.executor.memoryoverhead=2024 \
+--conf spark.shuffle.service.enabled=true \
+--conf spark.shuffle.service.port=7337 \
+--conf spark.shuffle.consolidateFiles=true \
+--conf spark.shuffle.manager=sort \
+--conf spark.storage.memoryFraction=0.4 \
+--conf spark.shuffle.memoryFraction=0.5 \
+--conf spark.default.parallelism=400	\
+--conf spark.speculation=true \
+--conf spark.speculation.multiplier=10 \
+--conf spark.speculation.quantile=0.75	\
+--conf spark.network.timeout=120 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+date:${today_early_1} tablePart:64 expireDay:3 ifWriteRedisUser:True ifUser:True midDays:14 redisLimit:100000000 \
+savePathUser:/dw/recommend/model/09_feature/user/
+
+if [ $? -eq 1 ]; then
+    echo "---------user写入redis执行失败---------"
+    exit 1
+else
+    echo "---------user写入redis执行成功---------"
+fi
+
+#nohup sh 06_update_everyday_feature.sh > p6.log 2>&1 &

+ 67 - 0
zhangbo/50_delete_hdfs.sh

@@ -0,0 +1,67 @@
+#!/bin/sh
+
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+DATE="$(date -d '9 days ago' +%Y%m%d)"
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+
+path="/dw/recommend/model/feature/user/dt=${DATE}"
+$HADOOP fs -test -e ${path}
+if [ $? -eq 0 ]; then
+    echo "${path} 数据存在, 删除。"
+    $HADOOP fs -rm -r -skipTrash ${path}
+else
+    echo "${path} 数据不存在"
+fi
+
+path="/dw/recommend/model/feature/video/dt=${DATE}"
+$HADOOP fs -test -e ${path}
+if [ $? -eq 0 ]; then
+    echo "${path} 数据存在, 删除。"
+    $HADOOP fs -rm -r -skipTrash ${path}
+else
+    echo "${path} 数据不存在"
+fi
+
+path="/dw/recommend/model/00_sample_data/dt=${DATE}"
+$HADOOP fs -test -e ${path}
+if [ $? -eq 0 ]; then
+    echo "${path} 数据存在, 删除。"
+    $HADOOP fs -rm -r -skipTrash ${path}
+else
+    echo "${path} 数据不存在"
+fi
+
+
+
+function delete_path() {
+    if [ "$#" -ne 2 ]; then
+        echo "Usage: delete_path <early> <path>"
+        return 1
+    fi
+    early=$1
+    path=$2
+    date="$(date -d "${early} days ago" +%Y%m%d)"
+    path_delete=${path}${date}
+    $HADOOP fs -test -e ${path_delete}
+    if [ $? -eq 0 ]; then
+        echo "${path_delete} 数据存在, 删除。"
+        if $HADOOP fs -rm -r -skipTrash "${path_delete}"; then
+            echo "删除成功。"
+        else
+            echo "删除失败。"
+        fi
+    else
+        echo "${path_delete} 数据不存在"
+    fi
+}
+
+delete_path 7 /dw/recommend/model/11_str_data_v3/dt=
+delete_path 7 /dw/recommend/model/12_ros_data_v3/dt=
+delete_path 7 /dw/recommend/model/10_sample_data_v3/dt=
+delete_path 3 /dw/recommend/model/09_feature/user/all/dt=
+delete_path 3 /dw/recommend/model/09_feature/user/true/dt=
+delete_path 3 /dw/recommend/model/09_feature/video/dt=

+ 28 - 0
zhangbo/train.sh

@@ -0,0 +1,28 @@
+#!/bin/sh
+
+#MVN_PACKAGE="mvn clean install  -T 2C -Dmaven.test.skip=true -Dmaven.compile.fork=true"
+JAVA_PATH="/usr/bin/java"
+PYTHON_PATH="/usr/bin/python"
+UPLOAD_PY_PATH="/root/algo/upload.py"
+JAR_PATH="/root/algo/recommend-server/recommend-server-service/target/recommend-server-service.jar"
+FM_PATH="/root/algo/alphaFM/bin"
+MODEL_PATH="/root/algo/LR_MODEL/"
+YESTERDAY="$(date -d '2 days ago' +%Y%m%d)"
+LAST30DAY="$(date -d '2 days ago' +%Y%m%d)"
+MAIN_CLASS="com.tzld.piaoquan.recommend.server.dataloader.OfflineShareSamplesLoader"
+TABLE_NAME="loghubods.alg_recsys_view_sample"
+LABEL="share_ornot"
+#OSSPATH=""
+
+
+# Train
+#mkdir -p ${MODEL_PATH}/${YESTERDAY}
+#${JAVA_PATH} -jar ${JAR_PATH} ${TABLE_NAME} ${LAST30DAY} ${YESTERDAY} ${LABEL} | ${FM_PATH}/fm_train -m ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt -dim 0,1,0 -core 8
+
+#cat ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt | awk -F " " '{print $1,"\t",$2}' > ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt
+
+# Upload
+#${UPLOAD_PY_PATH} ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt ${OSSPATH}
+
+# Predict
+java -jar ${JAR_PATH} $TABLE_NAME 20231211 20231211 ${LABEL}| ${FM_PATH}/fm_predict -m ${MODEL_PATH}/20231210/model_20231210.txt  -dim 0 -core 8 -out ${MODEL_PATH}/predict_1211.txt

+ 14 - 0
zhangbo/up.sh

@@ -0,0 +1,14 @@
+#!/bin/sh
+
+day=$1
+root_path="/root/spark-data"
+oss_hdfs_path="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/"
+model_path="$root_path/model"
+model_online="$model_path/online"
+
+
+cat $model_path/model_ctr_$day.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > $model_online/model_ad_ctr.txt
+
+hdfs dfs -rmr ${oss_hdfs_path}/ad_ctr_model/model_ad_ctr.txt
+
+hdfs dfs -put $model_online/model_ad_ctr.txt ${oss_hdfs_path}/ad_ctr_model/

+ 10 - 0
zhangbo/up2.sh

@@ -0,0 +1,10 @@
+root_path="/root/spark-data"
+oss_hdfs_path="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/"
+model_path=$root_path/model
+day=$1
+
+cat /root/spark-data/model/model_share_20231216.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > /root/spark-data/model/model_share_now.txt
+
+dfs -put /root/spark-data/model/model_share_now.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/video_model
+
+hdfs dfs -put $model_path/model_share_$day.txt ${oss_hdfs_path}/video_str_model

+ 99 - 0
zhangbo/utils.py

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+from odps import ODPS
+import argparse
+
+ODPS_CONFIG = {
+        'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
+        'ACCESSID': 'LTAIWYUujJAm7CbH',
+        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+}
+
+def check_data(project, table, partition) -> int:
+    """Check whether the partition is ready; return its row count."""
+    odps = ODPS(
+        access_id=ODPS_CONFIG['ACCESSID'],
+        secret_access_key=ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=3000,
+        read_timeout=500000,
+        pool_maxsize=1000,
+        pool_connections=1000
+    )
+    try:
+        t = odps.get_table(name=table)
+        check_res = t.exist_partition(partition_spec=f'dt={partition}')
+        if check_res:
+            sql = f'select * from {project}.{table} where dt = {partition}'
+            with odps.execute_sql(sql=sql).open_reader() as reader:
+                data_count = reader.count
+        else:
+            data_count = 0
+    except Exception as e:
+        print("error:" + str(e))
+        data_count = 0
+    return data_count
+
+
+def check_origin_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_view_sample_v2"
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+
+def check_item_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_video_info"
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+def check_user_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_user_info"
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+def check_hive(args):
+    project = args.project
+    table = args.table
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='脚本utils')
+    parser.add_argument('--excute_program', type=str, help='执行程序')
+    parser.add_argument('--partition', type=str, help='表分区')
+    parser.add_argument('--project', type=str, help='表空间')
+    parser.add_argument('--table', type=str, help='表名')
+    args = parser.parse_args()
+    if args.excute_program == "check_origin_hive":
+        check_origin_hive(args)
+    elif args.excute_program == "check_item_hive":
+        check_item_hive(args)
+    elif args.excute_program == "check_user_hive":
+        check_user_hive(args)
+    elif args.excute_program == "check_hive":
+        check_hive(args)
+    else:
+        print("无合法参数,验证失败。")
+        exit(999)
+

Some files were not shown because too many files changed in this diff