
feat:合并代码 (merge code)

zhaohaipeng, 10 months ago
commit 987239b917
76 changed files with 9654 additions and 937 deletions
  1. pom.xml (+86 -60)
  2. src/main/java/examples/dataloader/AdRedisFeatureConstructor.java (+198 -199)
  3. src/main/java/examples/dataloader/AdSampleConstructor.java (+223 -223)
  4. src/main/java/examples/dataloader/OfflineVlogFeatureGroup.java (+80 -0)
  5. src/main/java/examples/dataloader/OfflineVlogFeatureGroupV1.java (+81 -0)
  6. src/main/java/examples/dataloader/OfflineVlogFeatureGroupV2.java (+125 -0)
  7. src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractor.java (+68 -0)
  8. src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV1.java (+66 -0)
  9. src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV2.java (+66 -0)
  10. src/main/java/examples/dataloader/RequestContextOffline.java (+120 -0)
  11. src/main/java/examples/dataloader/redisBuilderMyself.java (+45 -0)
  12. src/main/java/examples/extractor/ExtractorUtils.java (+164 -0)
  13. src/main/java/examples/extractor/RankExtractorFeature_20240530.java (+33 -0)
  14. src/main/java/examples/extractor/RankExtractorItemFeature.java (+324 -0)
  15. src/main/java/examples/extractor/RankExtractorItemFeatureV2.java (+338 -0)
  16. src/main/java/examples/extractor/RankExtractorUserFeature.java (+104 -0)
  17. src/main/java/examples/extractor/RankExtractorUserFeatureV2.java (+110 -0)
  18. src/main/java/examples/sparksql/SparkAdCTRSampleLoader.java (+100 -107)
  19. src/main/java/examples/sparksql/SparkAdFeaToRedisLoader.java (+125 -124)
  20. src/main/java/examples/sparksql/SparkShareRatioSampleLoader.java (+98 -99)
  21. src/main/java/examples/sparksql/SparkVideoFeaToRedisLoader.java (+123 -124)
  22. src/main/resources/20240608_feature_name.txt (+274 -0)
  23. src/main/resources/20240609_bucket_274.txt (+0 -0)
  24. src/main/resources/20240609_bucket_274_old.txt (+2 -0)
  25. src/main/resources/20240622_ad_bucket_249.txt (+6 -0)
  26. src/main/resources/20240622_ad_feature_name.txt (+249 -0)
  27. src/main/scala/com/aliyun/odps/spark/examples/ana/ana_01_cidvidpk.scala (+125 -0)
  28. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_01_readtable2hdfs.scala (+79 -0)
  29. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_02_writeredis.scala (+249 -0)
  30. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_03_deleteredis.scala (+74 -0)
  31. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev1.scala (+85 -0)
  32. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev2.scala (+106 -0)
  33. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_05_sampleStatic.scala (+43 -0)
  34. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData.scala (+257 -0)
  35. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData_v3.scala (+260 -0)
  36. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_rosData.scala (+243 -0)
  37. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_strData.scala (+202 -0)
  38. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_08_item2redis.scala (+140 -0)
  39. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis.scala (+220 -0)
  40. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis_freq.scala (+167 -0)
  41. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_10_originData_v3.scala (+244 -0)
  42. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_11_strData_v3.scala (+187 -0)
  43. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3.scala (+215 -0)
  44. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3_noweight.scala (+216 -0)
  45. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529.scala (+278 -0)
  46. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529_check.scala (+256 -0)
  47. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_14_valueData_20240608.scala (+92 -0)
  48. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_15_bucket_20240608.scala (+92 -0)
  49. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609.scala (+127 -0)
  50. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609_check.scala (+132 -0)
  51. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_17_bucketDataPrint_20240617.scala (+300 -0)
  52. src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_18_mergehour2day_20240617.scala (+43 -0)
  53. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_31_originData_20240620.scala (+388 -0)
  54. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_32_bucket_20240622.scala (+103 -0)
  55. src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala (+118 -0)
  56. src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyDateUtils.scala (+246 -0)
  57. src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyHdfsUtils.scala (+148 -0)
  58. src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala (+40 -0)
  59. src/main/scala/com/aliyun/odps/spark/examples/myUtils/env.scala (+39 -0)
  60. src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala (+1 -1)
  61. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本 (+161 -0)
  62. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告 (+34 -0)
  63. src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本【分析】 (+8 -0)
  64. zhangbo/00_copy.sh (+29 -0)
  65. zhangbo/01_train.sh (+16 -0)
  66. zhangbo/02_train_go.sh (+25 -0)
  67. zhangbo/03_predict.sh (+33 -0)
  68. zhangbo/04_upload.sh (+25 -0)
  69. zhangbo/05_update_everyday_2model.sh (+151 -0)
  70. zhangbo/05_update_everyday_str.sh (+107 -0)
  71. zhangbo/06_update_everyday_feature.sh (+124 -0)
  72. zhangbo/50_delete_hdfs.sh (+67 -0)
  73. zhangbo/train.sh (+28 -0)
  74. zhangbo/up.sh (+14 -0)
  75. zhangbo/up2.sh (+10 -0)
  76. zhangbo/utils.py (+99 -0)

+ 86 - 60
pom.xml

@@ -17,18 +17,26 @@
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
 
+    <parent>
+        <groupId>com.tzld.commons</groupId>
+        <artifactId>supom</artifactId>
+        <version>1.0.9</version>
+    </parent>
+
     <properties>
-        <spark.version>3.1.1</spark.version>
-        <oss.sdk.version>3.0.0</oss.sdk.version>
+        <spark.version>2.3.0</spark.version>
         <cupid.sdk.version>3.3.8-public</cupid.sdk.version>
-        <scala.version>2.12.10</scala.version>
-        <scala.binary.version>2.12</scala.binary.version>
-        <odps.version>0.28.4-public</odps.version>
+        <scala.version>2.11.8</scala.version>
+        <scala.binary.version>2.11</scala.binary.version>
+        <java.version>1.8</java.version>
+        <maven.compiler.source>${java.version}</maven.compiler.source>
+        <maven.compiler.target>${java.version}</maven.compiler.target>
         <emr.version>2.0.0</emr.version>
+        <odps.version>0.28.4-public</odps.version>
     </properties>
 
     <groupId>com.aliyun.odps</groupId>
-    <artifactId>spark-examples_${scala.binary.version}</artifactId>
+    <artifactId>spark-examples</artifactId>
     <version>1.0.0-SNAPSHOT</version>
     <packaging>jar</packaging>
 
@@ -39,6 +47,29 @@
             <version>3.12.0</version>
         </dependency>
 
+        <dependency>
+            <groupId>com.hankcs</groupId>
+            <artifactId>hanlp</artifactId>
+            <version>portable-1.8.2</version>
+        </dependency>
+<!--        <dependency>-->
+<!--            <groupId>com.medallia.word2vec</groupId>-->
+<!--            <artifactId>word2vec</artifactId>-->
+<!--            <version>0.1.42</version>-->
+<!--        </dependency>-->
+
+        <dependency>
+            <groupId>org.xm</groupId>
+            <artifactId>similarity</artifactId>
+            <version>1.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.alibaba</groupId>
+            <artifactId>fastjson</artifactId>
+            <version>1.2.83</version>
+        </dependency>
+
         <dependency>
             <groupId>com.tzld.piaoquan</groupId>
             <artifactId>recommend-server-client</artifactId>
@@ -48,17 +79,15 @@
         <dependency>
             <groupId>com.tzld.piaoquan</groupId>
             <artifactId>recommend-feature-client</artifactId>
-            <version>1.1.15</version>
+            <version>1.0.3</version>
         </dependency>
 
-
         <dependency>
             <groupId>com.tzld.piaoquan</groupId>
             <artifactId>ad-engine-commons</artifactId>
-            <version>1.1.0</version>
+            <version>1.0.0</version>
         </dependency>
 
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_${scala.binary.version}</artifactId>
@@ -66,36 +95,32 @@
             <scope>provided</scope>
             <exclusions>
                 <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
+                    <groupId>org.scala-lang</groupId>
+                    <artifactId>scala-library</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.scala-lang</groupId>
+                    <artifactId>scalap</artifactId>
                 </exclusion>
             </exclusions>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_${scala.binary.version}</artifactId>
             <version>${spark.version}</version>
             <scope>provided</scope>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
         </dependency>
-
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-mllib_${scala.binary.version}</artifactId>
             <version>${spark.version}</version>
             <scope>provided</scope>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.spark</groupId>
+            <artifactId>spark-streaming_${scala.binary.version}</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
         </dependency>
 
         <dependency>
@@ -103,26 +128,36 @@
             <artifactId>cupid-sdk</artifactId>
             <version>${cupid.sdk.version}</version>
             <scope>provided</scope>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
         </dependency>
 
         <dependency>
             <groupId>com.aliyun.odps</groupId>
             <artifactId>hadoop-fs-oss</artifactId>
             <version>${cupid.sdk.version}</version>
-            <exclusions>
-                <exclusion>
-                    <artifactId>protobuf-java</artifactId>
-                    <groupId>com.google.protobuf</groupId>
-                </exclusion>
-            </exclusions>
         </dependency>
 
+        <dependency>
+            <groupId>com.aliyun.odps</groupId>
+            <artifactId>odps-spark-datasource_${scala.binary.version}</artifactId>
+            <version>${cupid.sdk.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-library</artifactId>
+            <version>${scala.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-actors</artifactId>
+            <version>${scala.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.aliyun.emr</groupId>
+            <artifactId>emr-maxcompute_2.11</artifactId>
+            <version>${emr.version}</version>
+        </dependency>
 
         <dependency>
             <groupId>org.springframework.boot</groupId>
@@ -134,34 +169,14 @@
             <artifactId>jedis</artifactId>
             <version>3.3.0</version>
         </dependency>
+
+
         <dependency>
             <groupId>org.projectlombok</groupId>
             <artifactId>lombok</artifactId>
             <version>1.18.24</version>
         </dependency>
 
-        <dependency>
-            <groupId>com.aliyun.odps</groupId>
-            <artifactId>odps-sdk-commons</artifactId>
-            <version>${odps.version}</version>
-        </dependency>
-
-        <dependency>
-            <groupId>com.aliyun.emr</groupId>
-            <artifactId>emr-mns_2.11</artifactId>
-            <version>${emr.version}</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>com.aliyun.mns</groupId>
-                    <artifactId>aliyun-sdk-mns</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
-            <groupId>com.aliyun.emr</groupId>
-            <artifactId>emr-maxcompute_2.11</artifactId>
-            <version>${emr.version}</version>
-        </dependency>
     </dependencies>
 
     <build>
@@ -240,6 +255,17 @@
                     </execution>
                 </executions>
             </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                    <!--<compilerId>scala</compilerId>-->
+                    <!-- <compilerVersion>2.12.10</compilerVersion>-->
+                </configuration>
+            </plugin>
         </plugins>
     </build>
 

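The pom.xml changes above move the build under the com.tzld.commons:supom parent, pin the stack to Scala 2.11.8 / Spark 2.3.0 with Java 8 enforced by maven-compiler-plugin, downgrade recommend-feature-client and ad-engine-commons, and add hanlp, org.xm:similarity, and com.alibaba:fastjson 1.2.83 while dropping the protobuf-java exclusions and the emr-mns/odps-sdk-commons dependencies. The Spark artifacts stay at provided scope, so they are expected to come from the MaxCompute/EMR runtime rather than the shaded jar. As a quick sanity check that the new fastjson dependency resolves, a minimal usage sketch (plain fastjson 1.x API, not code taken from this repository) could look like:

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;

public class FastjsonSmokeTest {
    public static void main(String[] args) {
        // Round-trip a small feature map through fastjson.
        JSONObject features = new JSONObject();
        features.put("ad_view_1day", "12");
        features.put("ad_click_1day", "3");

        String json = JSON.toJSONString(features);   // serialize
        JSONObject parsed = JSON.parseObject(json);   // parse back

        System.out.println(parsed.getString("ad_view_1day")); // prints 12
    }
}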
+ 198 - 199
src/main/java/examples/dataloader/AdRedisFeatureConstructor.java

@@ -1,199 +1,198 @@
-package examples.dataloader;
-
-
-import com.aliyun.odps.account.Account;
-import com.aliyun.odps.account.AliyunAccount;
-import com.aliyun.odps.data.Record;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdActionFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdRequestContext;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.UserAdFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdItemFeature;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-public class AdRedisFeatureConstructor {
-
-    private static final String BUCKET_NAME = "ali-recommend";
-    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
-
-    static {
-        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
-        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
-        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
-    }
-
-    ;
-
-    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
-
-
-    public static UserAdFeature constructUserFeature(Record record) {
-        UserAdFeature userFeature = new UserAdFeature();
-        userFeature.setMid(record.getString("mids"));
-
-        // 1day features
-        AdActionFeature userAd1dayActionFeature = new AdActionFeature();
-        userAd1dayActionFeature.setAdView(record.getString("ad_view_1day"));
-        userAd1dayActionFeature.setAdClick(record.getString("ad_click_1day"));
-        userAd1dayActionFeature.setAdConversion(record.getString("ad_conversion_1day"));
-        userAd1dayActionFeature.setCtr(record.getString("ad_ctr_1day"));
-        userAd1dayActionFeature.setCvr(record.getString("ad_cvr_1day"));
-        userFeature.setDay1_cnt_features(userAd1dayActionFeature);
-
-
-        // 3day features
-        AdActionFeature userAd3dayActionFeature = new AdActionFeature();
-        userAd3dayActionFeature.setAdView(record.getString("ad_view_3day"));
-        userAd3dayActionFeature.setAdClick(record.getString("ad_click_3day"));
-        userAd3dayActionFeature.setAdConversion(record.getString("ad_conversion_3day"));
-        userAd3dayActionFeature.setCtr(record.getString("ad_ctr_3day"));
-        userAd3dayActionFeature.setCvr(record.getString("ad_cvr_3day"));
-        userFeature.setDay3_cnt_features(userAd3dayActionFeature);
-
-
-        // 7day features
-        AdActionFeature userAd7dayActionFeature = new AdActionFeature();
-        userAd7dayActionFeature.setAdView(record.getString("ad_view_7day"));
-        userAd7dayActionFeature.setAdClick(record.getString("ad_click7day"));
-        userAd7dayActionFeature.setAdConversion(record.getString("ad_conversion_7day"));
-        userAd7dayActionFeature.setCtr(record.getString("ad_ctr_7day"));
-        userAd7dayActionFeature.setCvr(record.getString("ad_cvr_7day"));
-        userFeature.setDay7_cnt_features(userAd7dayActionFeature);
-
-
-        // 3month features
-        AdActionFeature userAd3MonthActionFeature = new AdActionFeature();
-        userAd3MonthActionFeature.setAdView(record.getString("ad_view_3month"));
-        userAd3MonthActionFeature.setAdClick(record.getString("ad_click_3month"));
-        userAd3MonthActionFeature.setAdConversion(record.getString("ad_conversion_3month"));
-        userAd3MonthActionFeature.setCtr(record.getString("ad_ctr_3month"));
-        userAd3MonthActionFeature.setCvr(record.getString("ad_cvr_3month"));
-        userFeature.setMonth3_cnt_features(userAd3MonthActionFeature);
-
-
-        return userFeature;
-    }
-
-
-    public static AdItemFeature constructItemFeature(Record record) {
-        AdItemFeature itemFeature = new AdItemFeature();
-        itemFeature.setAdId(record.getString("creativeid"));
-        // itemFeature.setAdCode(record.getString("adcode"));
-        itemFeature.setCampaignId(record.getString("campaignid"));
-        itemFeature.setAdvertiserId(record.getString("advertiserid"));
-        itemFeature.setCreativeId(record.getString("creativeid"));
-
-        // ad 维度特征
-        AdActionFeature adIdActionFeature1day = new AdActionFeature();
-        adIdActionFeature1day.setAdView(record.getString("view_ad_1day"));
-        adIdActionFeature1day.setAdClick(record.getString("click_ad_1day"));
-        adIdActionFeature1day.setAdConversion(record.getString("conversion_ad_1day"));
-        adIdActionFeature1day.setCtr(record.getString("ctr_ad_1day"));
-        adIdActionFeature1day.setCvr(record.getString("cvr_ad_1day"));
-        itemFeature.setDay1_cnt_features(adIdActionFeature1day);
-
-        AdActionFeature adIdActionFeature3day = new AdActionFeature();
-        adIdActionFeature3day.setAdView(record.getString("view_ad_3day"));
-        adIdActionFeature3day.setAdClick(record.getString("click_ad_3day"));
-        adIdActionFeature3day.setAdConversion(record.getString("conversion_ad_3day"));
-        adIdActionFeature3day.setCtr(record.getString("ctr_ad_3day"));
-        adIdActionFeature3day.setCvr(record.getString("cvr_ad_3day"));
-        itemFeature.setDay3_cnt_features(adIdActionFeature3day);
-
-        AdActionFeature adIdActionFeature7day = new AdActionFeature();
-        adIdActionFeature7day.setAdView(record.getString("view_ad_7day"));
-        adIdActionFeature7day.setAdClick(record.getString("click_ad_7day"));
-        adIdActionFeature7day.setAdConversion(record.getString("conversion_ad_7day"));
-        adIdActionFeature7day.setCtr(record.getString("ctr_ad_7day"));
-        adIdActionFeature7day.setCvr(record.getString("cvr_ad_7day"));
-        itemFeature.setDay7_cnt_features(adIdActionFeature7day);
-
-        AdActionFeature adIdActionFeature3month = new AdActionFeature();
-        adIdActionFeature3month.setAdView(record.getString("view_ad_3month"));
-        adIdActionFeature3month.setAdClick(record.getString("click_ad_3month"));
-        adIdActionFeature3month.setAdConversion(record.getString("conversion_ad_3month"));
-        adIdActionFeature3month.setCtr(record.getString("ctr_ad_3month"));
-        adIdActionFeature3month.setCvr(record.getString("cvr_ad_3month"));
-        itemFeature.setMonth3_cnt_features(adIdActionFeature3month);
-
-        // TODO creativeId等维度特征
-        // creative 维度特征
-        AdActionFeature creativeActionFeature1day = new AdActionFeature();
-        creativeActionFeature1day.setAdView(record.getString("view_creative_1day"));
-        creativeActionFeature1day.setAdClick(record.getString("click_creative_1day"));
-        creativeActionFeature1day.setAdConversion(record.getString("conversion_creative_1day"));
-        creativeActionFeature1day.setCtr(record.getString("ctr_creative_1day"));
-        creativeActionFeature1day.setCvr(record.getString("cvr_creative_1day"));
-        itemFeature.setCreative_1day_cnt_features(creativeActionFeature1day);
-
-        AdActionFeature creativeActionFeature3day = new AdActionFeature();
-        creativeActionFeature3day.setAdView(record.getString("view_creative_3day"));
-        creativeActionFeature3day.setAdClick(record.getString("click_creative_3day"));
-        creativeActionFeature3day.setAdConversion(record.getString("conversion_creative_3day"));
-        creativeActionFeature3day.setCtr(record.getString("ctr_creative_3day"));
-        creativeActionFeature3day.setCvr(record.getString("cvr_creative_3day"));
-        itemFeature.setCreative_3day_cnt_features(creativeActionFeature3day);
-
-        AdActionFeature creativeActionFeature7day = new AdActionFeature();
-        creativeActionFeature7day.setAdView(record.getString("view_creative_7day"));
-        creativeActionFeature7day.setAdClick(record.getString("click_creative_7day"));
-        creativeActionFeature7day.setAdConversion(record.getString("conversion_creative_7day"));
-        creativeActionFeature7day.setCtr(record.getString("ctr_creative_7day"));
-        creativeActionFeature7day.setCvr(record.getString("cvr_creative_7day"));
-        itemFeature.setCreative_7day_cnt_features(creativeActionFeature7day);
-
-
-        AdActionFeature creativeActionFeature3month = new AdActionFeature();
-        creativeActionFeature3month.setAdView(record.getString("view_creative_3month"));
-        creativeActionFeature3month.setAdClick(record.getString("click_creative_3month"));
-        creativeActionFeature3month.setAdConversion(record.getString("conversion_creative_3month"));
-        creativeActionFeature3month.setCtr(record.getString("ctr_creative_3month"));
-        creativeActionFeature3month.setCvr(record.getString("cvr_creative_3month"));
-        itemFeature.setCreative_3month_cnt_features(creativeActionFeature3month);
-
-
-
-        // TODO advertiser维度
-        // advertiser 维度特征
-        AdActionFeature advidActionFeature1day = new AdActionFeature();
-        advidActionFeature1day.setAdView(record.getString("view_advertiser_1day"));
-        advidActionFeature1day.setAdClick(record.getString("click_advertiser_1day"));
-        advidActionFeature1day.setAdConversion(record.getString("conversion_advertiser_1day"));
-        advidActionFeature1day.setCtr(record.getString("ctr_advertiser_1day"));
-        advidActionFeature1day.setCvr(record.getString("cvr_advertiser_1day"));
-        itemFeature.setAdvertiser_1day_cnt_features(advidActionFeature1day);
-
-        AdActionFeature advidActionFeature3day = new AdActionFeature();
-        advidActionFeature3day.setAdView(record.getString("view_advertiser_3day"));
-        advidActionFeature3day.setAdClick(record.getString("click_advertiser_3day"));
-        advidActionFeature3day.setAdConversion(record.getString("conversion_advertiser_3day"));
-        advidActionFeature3day.setCtr(record.getString("ctr_advertiser_3day"));
-        advidActionFeature3day.setCvr(record.getString("cvr_advertiser_3day"));
-        itemFeature.setAdvertiser_3day_cnt_features(advidActionFeature3day);
-
-        AdActionFeature advidActionFeature7day = new AdActionFeature();
-        advidActionFeature7day.setAdView(record.getString("view_advertiser_7day"));
-        advidActionFeature7day.setAdClick(record.getString("click_advertiser_7day"));
-        advidActionFeature7day.setAdConversion(record.getString("conversion_advertiser_7day"));
-        advidActionFeature7day.setCtr(record.getString("ctr_advertiser_7day"));
-        advidActionFeature7day.setCvr(record.getString("cvr_advertiser_7day"));
-        itemFeature.setAdvertiser_7day_cnt_features(advidActionFeature7day);
-
-
-        AdActionFeature advidActionFeature3month = new AdActionFeature();
-        advidActionFeature3month.setAdView(record.getString("view_advertiser_3month"));
-        advidActionFeature3month.setAdClick(record.getString("click_advertiser_3month"));
-        advidActionFeature3month.setAdConversion(record.getString("conversion_advertiser_3month"));
-        advidActionFeature3month.setCtr(record.getString("ctr_advertiser_3month"));
-        advidActionFeature3month.setCvr(record.getString("cvr_advertiser_3month"));
-        itemFeature.setAdvertiser_3month_cnt_features(advidActionFeature3month);
-
-
-
-        return itemFeature;
-    }
-
-
-}
+//package examples.dataloader;
+//
+//
+//import com.aliyun.odps.account.Account;
+//import com.aliyun.odps.account.AliyunAccount;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdActionFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdItemFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.UserAdFeature;
+//
+//
+//import java.util.HashMap;
+//import java.util.Map;
+//
+//public class AdRedisFeatureConstructor {
+//
+//    private static final String BUCKET_NAME = "ali-recommend";
+//    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
+//
+//    static {
+//        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
+//        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
+//        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
+//    }
+//
+//    ;
+//
+//    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
+//
+//
+//    public static UserAdFeature constructUserFeature(Record record) {
+//        UserAdFeature userFeature = new UserAdFeature();
+//        userFeature.setMid(record.getString("mids"));
+//
+//        // 1day features
+//        AdActionFeature userAd1dayActionFeature = new AdActionFeature();
+//        userAd1dayActionFeature.setOriginAdView(record.getString("ad_view_1day"));
+//        userAd1dayActionFeature.setOriginAdClick(record.getString("ad_click_1day"));
+//        userAd1dayActionFeature.setOriginAdConversion(record.getString("ad_conversion_1day"));
+//        userAd1dayActionFeature.setOriginCtr(record.getString("ad_ctr_1day"));
+//        userAd1dayActionFeature.setOriginCvr(record.getString("ad_cvr_1day"));
+//        userFeature.setDay1_cnt_features(userAd1dayActionFeature);
+//
+//
+//        // 3day features
+//        AdActionFeature userAd3dayActionFeature = new AdActionFeature();
+//        userAd1dayActionFeature.setOriginAdView(record.getString("ad_view_3day"));
+//        userAd1dayActionFeature.setOriginAdClick(record.getString("ad_click_3day"));
+//        userAd1dayActionFeature.setOriginAdConversion(record.getString("ad_conversion_3day"));
+//        userAd1dayActionFeature.setOriginCtr(record.getString("ad_ctr_3day"));
+//        userAd1dayActionFeature.setOriginCvr(record.getString("ad_cvr_3day"));
+//        userFeature.setDay3_cnt_features(userAd3dayActionFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature userAd7dayActionFeature = new AdActionFeature();
+//        userAd1dayActionFeature.setOriginAdView(record.getString("ad_view_7day"));
+//        userAd1dayActionFeature.setOriginAdClick(record.getString("ad_click7day"));
+//        userAd1dayActionFeature.setOriginAdConversion(record.getString("ad_conversion_7day"));
+//        userAd1dayActionFeature.setOriginCtr(record.getString("ad_ctr_7day"));
+//        userAd1dayActionFeature.setOriginCvr(record.getString("ad_cvr_7day"));
+//        userFeature.setDay7_cnt_features(userAd7dayActionFeature);
+//
+//
+//        // 3month features
+//        AdActionFeature userAd3MonthActionFeature = new AdActionFeature();
+//        userAd3MonthActionFeature.setOriginAdView(record.getString("ad_view_3month"));
+//        userAd3MonthActionFeature.setOriginAdClick(record.getString("ad_click_3month"));
+//        userAd3MonthActionFeature.setOriginAdConversion(record.getString("ad_conversion_3month"));
+//        userAd3MonthActionFeature.setOriginCtr(record.getString("ad_ctr_3month"));
+//        userAd3MonthActionFeature.setOriginCvr(record.getString("ad_cvr_3month"));
+//        userFeature.setMonth3_cnt_features(userAd3MonthActionFeature);
+//
+//
+//        return userFeature;
+//    }
+//
+//
+//    public static AdItemFeature constructItemFeature(Record record) {
+//        AdItemFeature itemFeature = new AdItemFeature();
+//        itemFeature.setAdId(record.getString("creativeid"));
+//        // itemFeature.setAdCode(record.getString("adcode"));
+//        itemFeature.setCampaignId(record.getString("campaignid"));
+//        itemFeature.setAdvertiserId(record.getString("advertiserid"));
+//        itemFeature.setCreativeId(record.getString("creativeid"));
+//
+//        // ad 维度特征
+//        AdActionFeature adIdActionFeature1day = new AdActionFeature();
+//        adIdActionFeature1day.setOriginAdView(record.getString("view_ad_1day"));
+//        adIdActionFeature1day.setOriginAdClick(record.getString("click_ad_1day"));
+//        adIdActionFeature1day.setOriginAdConversion(record.getString("conversion_ad_1day"));
+//        adIdActionFeature1day.setOriginCtr(record.getString("ctr_ad_1day"));
+//        adIdActionFeature1day.setOriginCvr(record.getString("cvr_ad_1day"));
+//        itemFeature.setDay1_cnt_features(adIdActionFeature1day);
+//
+//        AdActionFeature adIdActionFeature3day = new AdActionFeature();
+//        adIdActionFeature3day.setOriginAdView(record.getString("view_ad_3day"));
+//        adIdActionFeature3day.setOriginAdClick(record.getString("click_ad_3day"));
+//        adIdActionFeature3day.setOriginAdConversion(record.getString("conversion_ad_3day"));
+//        adIdActionFeature3day.setOriginCtr(record.getString("ctr_ad_3day"));
+//        adIdActionFeature3day.setOriginCvr(record.getString("cvr_ad_3day"));
+//        itemFeature.setDay3_cnt_features(adIdActionFeature3day);
+//
+//        AdActionFeature adIdActionFeature7day = new AdActionFeature();
+//        adIdActionFeature7day.setOriginAdView(record.getString("view_ad_7day"));
+//        adIdActionFeature7day.setOriginAdClick(record.getString("click_ad_7day"));
+//        adIdActionFeature7day.setOriginAdConversion(record.getString("conversion_ad_7day"));
+//        adIdActionFeature7day.setOriginCtr(record.getString("ctr_ad_7day"));
+//        adIdActionFeature7day.setOriginCvr(record.getString("cvr_ad_7day"));
+//        itemFeature.setDay7_cnt_features(adIdActionFeature7day);
+//
+//        AdActionFeature adIdActionFeature3month = new AdActionFeature();
+//        adIdActionFeature3month.setOriginAdView(record.getString("view_ad_3month"));
+//        adIdActionFeature3month.setOriginAdClick(record.getString("click_ad_3month"));
+//        adIdActionFeature3month.setOriginAdConversion(record.getString("conversion_ad_3month"));
+//        adIdActionFeature3month.setOriginCtr(record.getString("ctr_ad_3month"));
+//        adIdActionFeature3month.setOriginCvr(record.getString("cvr_ad_3month"));
+//        itemFeature.setMonth3_cnt_features(adIdActionFeature3month);
+//
+//        // TODO creativeId等维度特征
+//        // creative 维度特征
+//        AdActionFeature creativeActionFeature1day = new AdActionFeature();
+//        creativeActionFeature1day.setOriginAdView(record.getString("view_creative_1day"));
+//        creativeActionFeature1day.setOriginAdClick(record.getString("click_creative_1day"));
+//        creativeActionFeature1day.setOriginAdConversion(record.getString("conversion_creative_1day"));
+//        creativeActionFeature1day.setOriginCtr(record.getString("ctr_creative_1day"));
+//        creativeActionFeature1day.setOriginCvr(record.getString("cvr_creative_1day"));
+//        itemFeature.setCreative_1day_cnt_features(creativeActionFeature1day);
+//
+//        AdActionFeature creativeActionFeature3day = new AdActionFeature();
+//        creativeActionFeature3day.setOriginAdView(record.getString("view_creative_3day"));
+//        creativeActionFeature3day.setOriginAdClick(record.getString("click_creative_3day"));
+//        creativeActionFeature3day.setOriginAdConversion(record.getString("conversion_creative_3day"));
+//        creativeActionFeature3day.setOriginCtr(record.getString("ctr_creative_3day"));
+//        creativeActionFeature3day.setOriginCvr(record.getString("cvr_creative_3day"));
+//        itemFeature.setCreative_3day_cnt_features(creativeActionFeature3day);
+//
+//        AdActionFeature creativeActionFeature7day = new AdActionFeature();
+//        creativeActionFeature7day.setOriginAdView(record.getString("view_creative_7day"));
+//        creativeActionFeature7day.setOriginAdClick(record.getString("click_creative_7day"));
+//        creativeActionFeature7day.setOriginAdConversion(record.getString("conversion_creative_7day"));
+//        creativeActionFeature7day.setOriginCtr(record.getString("ctr_creative_7day"));
+//        creativeActionFeature7day.setOriginCvr(record.getString("cvr_creative_7day"));
+//        itemFeature.setCreative_7day_cnt_features(creativeActionFeature7day);
+//
+//
+//        AdActionFeature creativeActionFeature3month = new AdActionFeature();
+//        creativeActionFeature3month.setOriginAdView(record.getString("view_creative_3month"));
+//        creativeActionFeature3month.setOriginAdClick(record.getString("click_creative_3month"));
+//        creativeActionFeature3month.setOriginAdConversion(record.getString("conversion_creative_3month"));
+//        creativeActionFeature3month.setOriginCtr(record.getString("ctr_creative_3month"));
+//        creativeActionFeature3month.setOriginCvr(record.getString("cvr_creative_3month"));
+//        itemFeature.setCreative_3month_cnt_features(creativeActionFeature3month);
+//
+//
+//
+//        // TODO advertiser维度
+//        // advertiser 维度特征
+//        AdActionFeature advidActionFeature1day = new AdActionFeature();
+//        advidActionFeature1day.setOriginAdView(record.getString("view_advertiser_1day"));
+//        advidActionFeature1day.setOriginAdClick(record.getString("click_advertiser_1day"));
+//        advidActionFeature1day.setOriginAdConversion(record.getString("conversion_advertiser_1day"));
+//        advidActionFeature1day.setOriginCtr(record.getString("ctr_advertiser_1day"));
+//        advidActionFeature1day.setOriginCvr(record.getString("cvr_advertiser_1day"));
+//        itemFeature.setAdvertiser_1day_cnt_features(advidActionFeature1day);
+//
+//        AdActionFeature advidActionFeature3day = new AdActionFeature();
+//        advidActionFeature3day.setOriginAdView(record.getString("view_advertiser_3day"));
+//        advidActionFeature3day.setOriginAdClick(record.getString("click_advertiser_3day"));
+//        advidActionFeature3day.setOriginAdConversion(record.getString("conversion_advertiser_3day"));
+//        advidActionFeature3day.setOriginCtr(record.getString("ctr_advertiser_3day"));
+//        advidActionFeature3day.setOriginCvr(record.getString("cvr_advertiser_3day"));
+//        itemFeature.setAdvertiser_3day_cnt_features(advidActionFeature3day);
+//
+//        AdActionFeature advidActionFeature7day = new AdActionFeature();
+//        advidActionFeature7day.setOriginAdView(record.getString("view_advertiser_7day"));
+//        advidActionFeature7day.setOriginAdClick(record.getString("click_advertiser_7day"));
+//        advidActionFeature7day.setOriginAdConversion(record.getString("conversion_advertiser_7day"));
+//        advidActionFeature7day.setOriginCtr(record.getString("ctr_advertiser_7day"));
+//        advidActionFeature7day.setOriginCvr(record.getString("cvr_advertiser_7day"));
+//        itemFeature.setAdvertiser_7day_cnt_features(advidActionFeature7day);
+//
+//
+//        AdActionFeature advidActionFeature3month = new AdActionFeature();
+//        advidActionFeature3month.setOriginAdView(record.getString("view_advertiser_3month"));
+//        advidActionFeature3month.setOriginAdClick(record.getString("click_advertiser_3month"));
+//        advidActionFeature3month.setOriginAdConversion(record.getString("conversion_advertiser_3month"));
+//        advidActionFeature3month.setOriginCtr(record.getString("ctr_advertiser_3month"));
+//        advidActionFeature3month.setOriginCvr(record.getString("cvr_advertiser_3month"));
+//        itemFeature.setAdvertiser_3month_cnt_features(advidActionFeature3month);
+//
+//
+//
+//        return itemFeature;
+//    }
+//
+//
+//}
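The replacement version of AdRedisFeatureConstructor is committed fully commented out; its imports point at com.tzld.piaoquan.ad.engine.commons.base instead of com.tzld.piaoquan.recommend.feature.domain.ad.base, and the setters become setOriginAdView / setOriginAdClick / setOriginAdConversion / setOriginCtr / setOriginCvr. Note that in constructUserFeature the 3day and 7day blocks still write into userAd1dayActionFeature before attaching userAd3dayActionFeature and userAd7dayActionFeature, so those windows would stay empty if the class were re-enabled. A small helper along the lines below (a sketch only, assuming the setOrigin* setters keep the String parameters shown in the diff and that AdActionFeature and Record are the classes referenced above) would remove both the repetition and the copy-paste risk:

// Sketch: build one AdActionFeature per (dimension, time-window) pair from
// explicit column names, instead of five hand-copied setter calls each time.
private static AdActionFeature toActionFeature(Record record,
                                               String viewCol, String clickCol, String convCol,
                                               String ctrCol, String cvrCol) {
    AdActionFeature f = new AdActionFeature();
    f.setOriginAdView(record.getString(viewCol));
    f.setOriginAdClick(record.getString(clickCol));
    f.setOriginAdConversion(record.getString(convCol));
    f.setOriginCtr(record.getString(ctrCol));
    f.setOriginCvr(record.getString(cvrCol));
    return f;
}

// Usage for the 3-day user window (currently mis-assigned to the 1-day object above):
// userFeature.setDay3_cnt_features(toActionFeature(record,
//         "ad_view_3day", "ad_click_3day", "ad_conversion_3day",
//         "ad_ctr_3day", "ad_cvr_3day"));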

+ 223 - 223
src/main/java/examples/dataloader/AdSampleConstructor.java

@@ -1,223 +1,223 @@
-package examples.dataloader;
-
-
-import com.aliyun.odps.account.Account;
-import com.aliyun.odps.account.AliyunAccount;
-import com.aliyun.odps.data.Record;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdActionFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdRequestContext;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.UserAdFeature;
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.AdItemFeature;
-
-
-import java.util.HashMap;
-import java.util.Map;
-
-public class AdSampleConstructor {
-
-    private static final String BUCKET_NAME = "ali-recommend";
-    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
-
-    static {
-        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
-        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
-        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
-    }
-
-    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
-
-
-    public static AdRequestContext constructRequestContext(Record record) {
-        AdRequestContext requestContext = new AdRequestContext();
-        requestContext.setApptype(record.getString("apptype"));
-        requestContext.setMachineinfoBrand(record.getString("machineinfo_brand"));
-        requestContext.setMachineinfoModel(record.getString("machineinfo_model"));
-        requestContext.setMachineinfoSdkversion(record.getString("machineinfo_sdkversion"));
-        requestContext.setMachineinfoWchatversion(record.getString("machineinfo_wechatversion"));
-
-
-        requestContext.setDay(record.getString("ctx_day"));
-        requestContext.setWeek(record.getString("ctx_week"));
-        requestContext.setHour(record.getString("ctx_hour"));
-        requestContext.setRegion(record.getString("province"));
-        requestContext.setCity(record.getString("city"));
-        return requestContext;
-    }
-
-
-    public static UserAdFeature constructUserFeature(Record record) {
-        UserAdFeature userFeature = new UserAdFeature();
-        userFeature.setMid(record.getString("machinecode"));
-
-        // 1day features
-        AdActionFeature user1dayActionFeature = new AdActionFeature();
-        user1dayActionFeature.setAdView(record.getString("user_view_1day"));
-        user1dayActionFeature.setAdClick(record.getString("user_click_1day"));
-        user1dayActionFeature.setAdConversion(record.getString("user_conversion_1day"));
-        user1dayActionFeature.setCtr(record.getString("user_ctr_1day"));
-        user1dayActionFeature.setCvr(record.getString("user_cvr_1day"));
-        userFeature.setDay1_cnt_features(user1dayActionFeature);
-
-        // 3day features
-        AdActionFeature user3dayActionFeature = new AdActionFeature();
-        user3dayActionFeature.setAdView(record.getString("user_view_3day"));
-        user3dayActionFeature.setAdClick(record.getString("user_click_3day"));
-        user3dayActionFeature.setAdConversion(record.getString("user_conversion_3day"));
-        user3dayActionFeature.setCtr(record.getString("user_ctr_3day"));
-        user3dayActionFeature.setCvr(record.getString("user_cvr_3day"));
-        userFeature.setDay3_cnt_features(user3dayActionFeature);
-
-
-        // 7day features
-        AdActionFeature user7dayActionFeature = new AdActionFeature();
-        user7dayActionFeature.setAdView(record.getString("user_view_7day"));
-        user7dayActionFeature.setAdClick(record.getString("user_click7day"));
-        user7dayActionFeature.setAdConversion(record.getString("user_conversion_7day"));
-        user7dayActionFeature.setCtr(record.getString("user_ctr_7day"));
-        user7dayActionFeature.setCvr(record.getString("user_cvr_7day"));
-        userFeature.setDay7_cnt_features(user7dayActionFeature);
-
-        // 3month features
-        AdActionFeature user3MonthActionFeature = new AdActionFeature();
-        user3MonthActionFeature.setAdView(record.getString("user_view_3month"));
-        user3MonthActionFeature.setAdClick(record.getString("user_click_3month"));
-        user3MonthActionFeature.setAdConversion(record.getString("user_conversion_3month"));
-        user3MonthActionFeature.setCtr(record.getString("user_ctr_3month"));
-        user3MonthActionFeature.setCvr(record.getString("user_cvr_3month"));
-        userFeature.setMonth3_cnt_features(user3MonthActionFeature);
-
-        return userFeature;
-    }
-
-
-    public static AdItemFeature constructItemFeature(Record record) {
-        AdItemFeature itemFeature = new AdItemFeature();
-
-
-        itemFeature.setAdId(record.getString("adid"));
-        // itemFeature.setAdCode(record.getString("adcode"));
-        itemFeature.setAdvertiserId(record.getString("advertiserid"));
-        itemFeature.setCampaignId(record.getString("campaignid"));
-        itemFeature.setCreativeId(record.getString("creativeid"));
-
-        // 1day features
-        AdActionFeature user1dayActionFeature = new AdActionFeature();
-        user1dayActionFeature.setAdView(record.getString("ad_view_1day"));
-        user1dayActionFeature.setAdClick(record.getString("ad_click_1day"));
-        user1dayActionFeature.setAdConversion(record.getString("ad_conversion_1day"));
-        user1dayActionFeature.setCtr(record.getString("ad_ctr_1day"));
-        user1dayActionFeature.setCvr(record.getString("ad_cvr_1day"));
-        itemFeature.setDay1_cnt_features(user1dayActionFeature);
-
-        // 3day features
-        AdActionFeature user3dayActionFeature = new AdActionFeature();
-        user3dayActionFeature.setAdView(record.getString("ad_view_3day"));
-        user3dayActionFeature.setAdClick(record.getString("ad_click_3day"));
-        user3dayActionFeature.setAdConversion(record.getString("ad_conversion_3day"));
-        user3dayActionFeature.setCtr(record.getString("ad_ctr_3day"));
-        user3dayActionFeature.setCvr(record.getString("ad_cvr_3day"));
-        itemFeature.setDay3_cnt_features(user3dayActionFeature);
-
-
-        // 7day features
-        AdActionFeature user7dayActionFeature = new AdActionFeature();
-        user7dayActionFeature.setAdView(record.getString("ad_view_7day"));
-        user7dayActionFeature.setAdClick(record.getString("ad_click_7day"));
-        user7dayActionFeature.setAdConversion(record.getString("ad_conversion_7day"));
-        user7dayActionFeature.setCtr(record.getString("ad_ctr_7day"));
-        user7dayActionFeature.setCvr(record.getString("ad_cvr_7day"));
-        itemFeature.setDay7_cnt_features(user7dayActionFeature);
-
-        // 3month features
-        AdActionFeature user3MonthActionFeature = new AdActionFeature();
-        user3MonthActionFeature.setAdView(record.getString("ad_view_3month"));
-        user3MonthActionFeature.setAdClick(record.getString("ad_click_3month"));
-        user3MonthActionFeature.setAdConversion(record.getString("ad_conversion_3month"));
-        user3MonthActionFeature.setCtr(record.getString("ad_ctr_3month"));
-        user3MonthActionFeature.setCvr(record.getString("ad_cvr_3month"));
-        itemFeature.setMonth3_cnt_features(user3MonthActionFeature);
-
-
-        //TODO  CREATIVE 维度  需要在样本中补齐
-        AdActionFeature creative1dayFeature = new AdActionFeature();
-        creative1dayFeature.setAdView(record.getString("view_creative_1day"));
-        creative1dayFeature.setAdClick(record.getString("click_creative_1day"));
-        creative1dayFeature.setAdConversion(record.getString("conversion_creative_1day"));
-        creative1dayFeature.setCtr(record.getString("ctr_creative_1day"));
-        creative1dayFeature.setCvr(record.getString("cvr_creative_1day"));
-        itemFeature.setCreative_1day_cnt_features(creative1dayFeature);
-
-        // 3day features
-        AdActionFeature creative3dayFeature = new AdActionFeature();
-        creative3dayFeature.setAdView(record.getString("view_creative_3day"));
-        creative3dayFeature.setAdClick(record.getString("click_creative_3day"));
-        creative3dayFeature.setAdConversion(record.getString("conversion_creative_3day"));
-        creative3dayFeature.setCtr(record.getString("ctr_creative_3day"));
-        creative3dayFeature.setCvr(record.getString("cvr_creative_3day"));
-        itemFeature.setCreative_3day_cnt_features(creative3dayFeature);
-
-
-        // 7day features
-        AdActionFeature creative7dayFeature = new AdActionFeature();
-        creative7dayFeature.setAdView(record.getString("view_creative_7day"));
-        creative7dayFeature.setAdClick(record.getString("click_creative_7day"));
-        creative7dayFeature.setAdConversion(record.getString("conversion_creative_7day"));
-        creative7dayFeature.setCtr(record.getString("ctr_creative_7day"));
-        creative7dayFeature.setCvr(record.getString("cvr_creative_7day"));
-        itemFeature.setCreative_7day_cnt_features(creative7dayFeature);
-
-        // 3month features
-        AdActionFeature creative3MonthFeature = new AdActionFeature();
-        creative3MonthFeature.setAdView(record.getString("view_creative_3month"));
-        creative3MonthFeature.setAdClick(record.getString("click_creative_3month"));
-        creative3MonthFeature.setAdConversion(record.getString("conversion_creative_3month"));
-        creative3MonthFeature.setCtr(record.getString("ctr_creative_3month"));
-        creative3MonthFeature.setCvr(record.getString("cvr_creative_3month"));
-        itemFeature.setCreative_3month_cnt_features(creative3MonthFeature);
-
-
-        // advertiser id
-        // 1day features
-        AdActionFeature advertiser1dayFeature = new AdActionFeature();
-        advertiser1dayFeature.setAdView(record.getString("advertiser_view_1day"));
-        advertiser1dayFeature.setAdClick(record.getString("advertiser_click_1day"));
-        advertiser1dayFeature.setAdConversion(record.getString("advertiser_conversion_1day"));
-        advertiser1dayFeature.setCtr(record.getString("advertiser_ctr_1day"));
-        advertiser1dayFeature.setCvr(record.getString("advertiser_cvr_1day"));
-        itemFeature.setAdvertiser_1day_cnt_features(advertiser1dayFeature);
-
-        // 3day features
-        AdActionFeature advertiser3dayFeature = new AdActionFeature();
-        advertiser3dayFeature.setAdView(record.getString("advertiser_view_3day"));
-        advertiser3dayFeature.setAdClick(record.getString("advertiser_click_3day"));
-        advertiser3dayFeature.setAdConversion(record.getString("advertiser_conversion_3day"));
-        advertiser3dayFeature.setCtr(record.getString("advertiser_ctr_3day"));
-        advertiser3dayFeature.setCvr(record.getString("advertiser_cvr_3day"));
-        itemFeature.setAdvertiser_3day_cnt_features(advertiser3dayFeature);
-
-
-        // 7day features
-        AdActionFeature advertiser7dayFeature = new AdActionFeature();
-        advertiser7dayFeature.setAdView(record.getString("advertiser_view_7day"));
-        advertiser7dayFeature.setAdClick(record.getString("advertiser_click_7day"));
-        advertiser7dayFeature.setAdConversion(record.getString("advertiser_conversion_7day"));
-        advertiser7dayFeature.setCtr(record.getString("advertiser_ctr_7day"));
-        advertiser7dayFeature.setCvr(record.getString("advertiser_cvr_7day"));
-        itemFeature.setAdvertiser_7day_cnt_features(advertiser7dayFeature);
-
-        // 3month features
-        AdActionFeature advertiser3monthFeature = new AdActionFeature();
-        advertiser3monthFeature.setAdView(record.getString("advertiser_view_3month"));
-        advertiser3monthFeature.setAdClick(record.getString("advertiser_view_3month"));
-        advertiser3monthFeature.setAdConversion(record.getString("advertiser_conversion_3month"));
-        advertiser3monthFeature.setCtr(record.getString("advertiser_ctr_3month"));
-        advertiser3monthFeature.setCvr(record.getString("advertiser_cvr_3month"));
-        itemFeature.setAdvertiser_3month_cnt_features(advertiser3monthFeature);
-
-
-
-        return itemFeature;
-    }
-
-
-}
+//package examples.dataloader;
+//
+//
+//import com.aliyun.odps.account.Account;
+//import com.aliyun.odps.account.AliyunAccount;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdActionFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdRequestContext;
+//import com.tzld.piaoquan.ad.engine.commons.base.UserAdFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdItemFeature;
+//
+//
+//import java.util.HashMap;
+//import java.util.Map;
+//
+//public class AdSampleConstructor {
+//
+//    private static final String BUCKET_NAME = "ali-recommend";
+//    private static final Map<String, String> ODPS_CONFIG = new HashMap<String, String>();
+//
+//    static {
+//        ODPS_CONFIG.put("ENDPOINT", "http://service.cn.maxcompute.aliyun.com/api");
+//        ODPS_CONFIG.put("ACCESSID", "LTAIWYUujJAm7CbH");
+//        ODPS_CONFIG.put("ACCESSKEY", "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P");
+//    }
+//
+//    private static final Account account = new AliyunAccount(ODPS_CONFIG.get("ACCESSID"), ODPS_CONFIG.get("ACCESSKEY"));
+//
+//
+//    public static AdRequestContext constructRequestContext(Record record) {
+//        AdRequestContext requestContext = new AdRequestContext();
+//        requestContext.setApptype(record.getString("apptype"));
+//        requestContext.setMachineinfoBrand(record.getString("machineinfo_brand"));
+//        requestContext.setMachineinfoModel(record.getString("machineinfo_model"));
+//        requestContext.setMachineinfoSdkversion(record.getString("machineinfo_sdkversion"));
+//        requestContext.setMachineinfoWchatversion(record.getString("machineinfo_wechatversion"));
+//
+//
+//        requestContext.setDay(record.getString("ctx_day"));
+//        requestContext.setWeek(record.getString("ctx_week"));
+//        requestContext.setHour(record.getString("ctx_hour"));
+//        requestContext.setRegion(record.getString("province"));
+//        requestContext.setCity(record.getString("city"));
+//        return requestContext;
+//    }
+//
+//
+//    public static UserAdFeature constructUserFeature(Record record) {
+//        UserAdFeature userFeature = new UserAdFeature();
+//        userFeature.setMid(record.get("machinecode").toString());
+//
+//        // 1day features
+//        AdActionFeature user1dayActionFeature = new AdActionFeature();
+//        user1dayActionFeature.setAdView(record.getString("user_view_1day"));
+//        user1dayActionFeature.setAdClick(record.getString("user_click_1day"));
+//        user1dayActionFeature.setAdConversion(record.getString("user_conversion_1day"));
+//        user1dayActionFeature.setCtr(record.getString("user_ctr_1day"));
+//        user1dayActionFeature.setCvr(record.getString("user_cvr_1day"));
+//        userFeature.setDay1_cnt_features(user1dayActionFeature);
+//
+//        // 3day features
+//        AdActionFeature user3dayActionFeature = new AdActionFeature();
+//        user3dayActionFeature.setAdView(record.getString("user_view_3day"));
+//        user3dayActionFeature.setAdClick(record.getString("user_click_3day"));
+//        user3dayActionFeature.setAdConversion(record.getString("user_conversion_3day"));
+//        user3dayActionFeature.setCtr(record.getString("user_ctr_3day"));
+//        user3dayActionFeature.setCvr(record.getString("user_cvr_3day"));
+//        userFeature.setDay3_cnt_features(user3dayActionFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature user7dayActionFeature = new AdActionFeature();
+//        user7dayActionFeature.setAdView(record.getString("user_view_7day"));
+//        user7dayActionFeature.setAdClick(record.getString("user_click7day"));
+//        user7dayActionFeature.setAdConversion(record.getString("user_conversion_7day"));
+//        user7dayActionFeature.setCtr(record.getString("user_ctr_7day"));
+//        user7dayActionFeature.setCvr(record.getString("user_cvr_7day"));
+//        userFeature.setDay7_cnt_features(user7dayActionFeature);
+//
+//        // 3month features
+//        AdActionFeature user3MonthActionFeature = new AdActionFeature();
+//        user3MonthActionFeature.setAdView(record.getString("user_view_3month"));
+//        user3MonthActionFeature.setAdClick(record.getString("user_click_3month"));
+//        user3MonthActionFeature.setAdConversion(record.getString("user_conversion_3month"));
+//        user3MonthActionFeature.setCtr(record.getString("user_ctr_3month"));
+//        user3MonthActionFeature.setCvr(record.getString("user_cvr_3month"));
+//        userFeature.setMonth3_cnt_features(user3MonthActionFeature);
+//
+//        return userFeature;
+//    }
+//
+//
+//    public static AdItemFeature constructItemFeature(Record record) {
+//        AdItemFeature itemFeature = new AdItemFeature();
+//
+//
+//        itemFeature.setAdId(record.getString("adid"));
+//        // itemFeature.setAdCode(record.getString("adcode"));
+//        itemFeature.setAdvertiserId(record.getString("advertiserid"));
+//        itemFeature.setCampaignId(record.getString("campaignid"));
+//        itemFeature.setCreativeId(record.getString("creativeid"));
+//
+//        // 1day features
+//        AdActionFeature user1dayActionFeature = new AdActionFeature();
+//        user1dayActionFeature.setAdView(record.getString("ad_view_1day"));
+//        user1dayActionFeature.setAdClick(record.getString("ad_click_1day"));
+//        user1dayActionFeature.setAdConversion(record.getString("ad_conversion_1day"));
+//        user1dayActionFeature.setCtr(record.getString("ad_ctr_1day"));
+//        user1dayActionFeature.setCvr(record.getString("ad_cvr_1day"));
+//        itemFeature.setDay1_cnt_features(user1dayActionFeature);
+//
+//        // 3day features
+//        AdActionFeature user3dayActionFeature = new AdActionFeature();
+//        user3dayActionFeature.setAdView(record.getString("ad_view_3day"));
+//        user3dayActionFeature.setAdClick(record.getString("ad_click_3day"));
+//        user3dayActionFeature.setAdConversion(record.getString("ad_conversion_3day"));
+//        user3dayActionFeature.setCtr(record.getString("ad_ctr_3day"));
+//        user3dayActionFeature.setCvr(record.getString("ad_cvr_3day"));
+//        itemFeature.setDay3_cnt_features(user3dayActionFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature user7dayActionFeature = new AdActionFeature();
+//        user7dayActionFeature.setAdView(record.getString("ad_view_7day"));
+//        user7dayActionFeature.setAdClick(record.getString("ad_click_7day"));
+//        user7dayActionFeature.setAdConversion(record.getString("ad_conversion_7day"));
+//        user7dayActionFeature.setCtr(record.getString("ad_ctr_7day"));
+//        user7dayActionFeature.setCvr(record.getString("ad_cvr_7day"));
+//        itemFeature.setDay7_cnt_features(user7dayActionFeature);
+//
+//        // 3month features
+//        AdActionFeature user3MonthActionFeature = new AdActionFeature();
+//        user3MonthActionFeature.setAdView(record.getString("ad_view_3month"));
+//        user3MonthActionFeature.setAdClick(record.getString("ad_click_3month"));
+//        user3MonthActionFeature.setAdConversion(record.getString("ad_conversion_3month"));
+//        user3MonthActionFeature.setCtr(record.getString("ad_ctr_3month"));
+//        user3MonthActionFeature.setCvr(record.getString("ad_cvr_3month"));
+//        itemFeature.setMonth3_cnt_features(user3MonthActionFeature);
+//
+//
+//        //TODO: CREATIVE-dimension features still need to be back-filled in the samples
+//        AdActionFeature creative1dayFeature = new AdActionFeature();
+//        creative1dayFeature.setAdView(record.getString("view_creative_1day"));
+//        creative1dayFeature.setAdClick(record.getString("click_creative_1day"));
+//        creative1dayFeature.setAdConversion(record.getString("conversion_creative_1day"));
+//        creative1dayFeature.setCtr(record.getString("ctr_creative_1day"));
+//        creative1dayFeature.setCvr(record.getString("cvr_creative_1day"));
+//        itemFeature.setCreative_1day_cnt_features(creative1dayFeature);
+//
+//        // 3day features
+//        AdActionFeature creative3dayFeature = new AdActionFeature();
+//        creative3dayFeature.setAdView(record.getString("view_creative_3day"));
+//        creative3dayFeature.setAdClick(record.getString("click_creative_3day"));
+//        creative3dayFeature.setAdConversion(record.getString("conversion_creative_3day"));
+//        creative3dayFeature.setCtr(record.getString("ctr_creative_3day"));
+//        creative3dayFeature.setCvr(record.getString("cvr_creative_3day"));
+//        itemFeature.setCreative_3day_cnt_features(creative3dayFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature creative7dayFeature = new AdActionFeature();
+//        creative7dayFeature.setAdView(record.getString("view_creative_7day"));
+//        creative7dayFeature.setAdClick(record.getString("click_creative_7day"));
+//        creative7dayFeature.setAdConversion(record.getString("conversion_creative_7day"));
+//        creative7dayFeature.setCtr(record.getString("ctr_creative_7day"));
+//        creative7dayFeature.setCvr(record.getString("cvr_creative_7day"));
+//        itemFeature.setCreative_7day_cnt_features(creative7dayFeature);
+//
+//        // 3month features
+//        AdActionFeature creative3MonthFeature = new AdActionFeature();
+//        creative3MonthFeature.setAdView(record.getString("view_creative_3month"));
+//        creative3MonthFeature.setAdClick(record.getString("click_creative_3month"));
+//        creative3MonthFeature.setAdConversion(record.getString("conversion_creative_3month"));
+//        creative3MonthFeature.setCtr(record.getString("ctr_creative_3month"));
+//        creative3MonthFeature.setCvr(record.getString("cvr_creative_3month"));
+//        itemFeature.setCreative_3month_cnt_features(creative3MonthFeature);
+//
+//
+//        // advertiser id
+//        // 1day features
+//        AdActionFeature advertiser1dayFeature = new AdActionFeature();
+//        advertiser1dayFeature.setAdView(record.getString("advertiser_view_1day"));
+//        advertiser1dayFeature.setAdClick(record.getString("advertiser_click_1day"));
+//        advertiser1dayFeature.setAdConversion(record.getString("advertiser_conversion_1day"));
+//        advertiser1dayFeature.setCtr(record.getString("advertiser_ctr_1day"));
+//        advertiser1dayFeature.setCvr(record.getString("advertiser_cvr_1day"));
+//        itemFeature.setAdvertiser_1day_cnt_features(advertiser1dayFeature);
+//
+//        // 3day features
+//        AdActionFeature advertiser3dayFeature = new AdActionFeature();
+//        advertiser3dayFeature.setAdView(record.getString("advertiser_view_3day"));
+//        advertiser3dayFeature.setAdClick(record.getString("advertiser_click_3day"));
+//        advertiser3dayFeature.setAdConversion(record.getString("advertiser_conversion_3day"));
+//        advertiser3dayFeature.setCtr(record.getString("advertiser_ctr_3day"));
+//        advertiser3dayFeature.setCvr(record.getString("advertiser_cvr_3day"));
+//        itemFeature.setAdvertiser_3day_cnt_features(advertiser3dayFeature);
+//
+//
+//        // 7day features
+//        AdActionFeature advertiser7dayFeature = new AdActionFeature();
+//        advertiser7dayFeature.setAdView(record.getString("advertiser_view_7day"));
+//        advertiser7dayFeature.setAdClick(record.getString("advertiser_click_7day"));
+//        advertiser7dayFeature.setAdConversion(record.getString("advertiser_conversion_7day"));
+//        advertiser7dayFeature.setCtr(record.getString("advertiser_ctr_7day"));
+//        advertiser7dayFeature.setCvr(record.getString("advertiser_cvr_7day"));
+//        itemFeature.setAdvertiser_7day_cnt_features(advertiser7dayFeature);
+//
+//        // 3month features
+//        AdActionFeature advertiser3monthFeature = new AdActionFeature();
+//        advertiser3monthFeature.setAdView(record.getString("advertiser_view_3month"));
+//        advertiser3monthFeature.setAdClick(record.getString("advertiser_click_3month"));
+//        advertiser3monthFeature.setAdConversion(record.getString("advertiser_conversion_3month"));
+//        advertiser3monthFeature.setCtr(record.getString("advertiser_ctr_3month"));
+//        advertiser3monthFeature.setCvr(record.getString("advertiser_cvr_3month"));
+//        itemFeature.setAdvertiser_3month_cnt_features(advertiser3monthFeature);
+//
+//
+//
+//        return itemFeature;
+//    }
+//
+//
+//}

+ 80 - 0
src/main/java/examples/dataloader/OfflineVlogFeatureGroup.java

@@ -0,0 +1,80 @@
+package examples.dataloader;
+
+public enum OfflineVlogFeatureGroup {
+
+    machineinfo_brand,
+    machineinfo_model,
+    machineinfo_platform,
+    machineinfo_system,
+    u_1day_exp_cnt,
+    u_1day_click_cnt,
+    u_1day_share_cnt,
+    u_1day_return_cnt,
+    u_ctr_1day,
+    u_str_1day,
+    u_rov_1day,
+    u_ros_1day,
+
+    u_3day_exp_cnt,
+    u_3day_click_cnt,
+    u_3day_share_cnt,
+    u_3day_return_cnt,
+    u_ctr_3day,
+    u_str_3day,
+    u_rov_3day,
+    u_ros_3day,
+
+
+    total_time,
+
+    play_count_total,
+    i_1day_exp_cnt,
+    i_1day_click_cnt,
+    i_1day_share_cnt,
+    i_1day_return_cnt,
+    i_ctr_1day,
+    i_str_1day,
+    i_rov_1day,
+    i_ros_1day,
+
+    i_3day_exp_cnt,
+    i_3day_click_cnt,
+    i_3day_share_cnt,
+    i_3day_return_cnt,
+    i_ctr_3day,
+    i_str_3day,
+    i_rov_3day,
+    i_ros_3day,
+
+    ctx_week,
+    ctx_hour,
+    ctx_region,
+    ctx_city,
+    ;
+
+
+    private final byte[] idBytes;
+    private final byte[] nameBytes;
+
+    OfflineVlogFeatureGroup() {
+        this.idBytes = String.valueOf(ordinal()).getBytes();
+        this.nameBytes = name().toLowerCase().getBytes();
+    }
+
+    public final int getId() {
+        return ordinal();
+    }
+
+    public final String getGroupName() {
+        return name().toLowerCase();
+    }
+
+    public final byte[] getGroupNameBytes() {
+        return getGroupName().getBytes();
+    }
+
+    public final byte[] getIdBytes() {
+        return idBytes;
+    }
+
+}
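
A minimal usage sketch (editor's illustration, not part of the commit; the demo class name is hypothetical). The enum ordinal doubles as the feature-group id, so the declaration order above is effectively a data contract:

    import examples.dataloader.OfflineVlogFeatureGroup;

    public class OfflineVlogFeatureGroupDemo {
        public static void main(String[] args) {
            OfflineVlogFeatureGroup g = OfflineVlogFeatureGroup.u_1day_exp_cnt;
            System.out.println(g.getId());        // 4 -- the enum ordinal
            System.out.println(g.getGroupName()); // "u_1day_exp_cnt"
            // look a group up by name, as the extractors below do
            System.out.println(OfflineVlogFeatureGroup.valueOf("ctx_hour").getId());
        }
    }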

+ 81 - 0
src/main/java/examples/dataloader/OfflineVlogFeatureGroupV1.java

@@ -0,0 +1,81 @@
+package examples.dataloader;
+
+public enum OfflineVlogFeatureGroupV1 {
+
+    machineinfo_brand,
+    machineinfo_model,
+    machineinfo_platform,
+    machineinfo_system,
+    u_1day_exp_cnt,
+    u_1day_click_cnt,
+    u_1day_share_cnt,
+    u_1day_return_cnt,
+    u_1day_ctr,
+    u_1day_str,
+    u_1day_rov,
+    u_1day_ros,
+
+    u_3day_exp_cnt,
+    u_3day_click_cnt,
+    u_3day_share_cnt,
+    u_3day_return_cnt,
+    u_3day_ctr,
+    u_3day_str,
+    u_3day_rov,
+    u_3day_ros,
+
+
+    total_time,
+
+    play_count_total,
+    i_1day_exp_cnt,
+    i_1day_click_cnt,
+    i_1day_share_cnt,
+    i_1day_return_cnt,
+    i_1day_ctr,
+    i_1day_str,
+    i_1day_rov,
+    i_1day_ros,
+
+    i_3day_exp_cnt,
+    i_3day_click_cnt,
+    i_3day_share_cnt,
+    i_3day_return_cnt,
+    i_3day_ctr,
+    i_3day_str,
+    i_3day_rov,
+    i_3day_ros,
+
+    ctx_week,
+    ctx_hour,
+    ctx_region,
+    ctx_city,
+
+    ;
+
+
+    private final byte[] idBytes;
+    private final byte[] nameBytes;
+
+    OfflineVlogFeatureGroupV1() {
+        this.idBytes = String.valueOf(ordinal()).getBytes();
+        this.nameBytes = name().toLowerCase().getBytes();
+    }
+
+    public final int getId() {
+        return ordinal();
+    }
+
+    public final String getGroupName() {
+        return name().toLowerCase();
+    }
+
+    public final byte[] getGroupNameBytes() {
+        return getGroupName().getBytes();
+    }
+
+    public final byte[] getIdBytes() {
+        return idBytes;
+    }
+
+}

+ 125 - 0
src/main/java/examples/dataloader/OfflineVlogFeatureGroupV2.java

@@ -0,0 +1,125 @@
+package examples.dataloader;
+
+public enum OfflineVlogFeatureGroupV2 {
+
+    machineinfo_brand,
+    machineinfo_model,
+    machineinfo_platform,
+    machineinfo_system,
+    u_1day_exp_cnt,
+    u_1day_click_cnt,
+    u_1day_share_cnt,
+    u_1day_return_cnt,
+    u_1day_ctr,
+    u_1day_str,
+    u_1day_rov,
+    u_1day_ros,
+
+    u_3day_exp_cnt,
+    u_3day_click_cnt,
+    u_3day_share_cnt,
+    u_3day_return_cnt,
+    u_3day_ctr,
+    u_3day_str,
+    u_3day_rov,
+    u_3day_ros,
+
+    total_time,
+
+    play_count_total,
+    i_1day_exp_cnt,
+    i_1day_click_cnt,
+    i_1day_share_cnt,
+    i_1day_return_cnt,
+    i_1day_ctr,
+    i_1day_str,
+    i_1day_rov,
+    i_1day_ros,
+
+    i_3day_exp_cnt,
+    i_3day_click_cnt,
+    i_3day_share_cnt,
+    i_3day_return_cnt,
+    i_3day_ctr,
+    i_3day_str,
+    i_3day_rov,
+    i_3day_ros,
+
+    ctx_week,
+    ctx_hour,
+    ctx_region,
+    ctx_city,
+    view_pv_list_1day,
+    view_uv_list_1day,
+    play_pv_list_1day,
+    play_uv_list_1day,
+    share_pv_list_1day,
+    share_uv_list_1day,
+    return_uv_list_1day,
+    p_view_uv_list_1day,
+    p_view_pv_list_1day,
+    p_return_uv_list_1day,
+    share_uv_list_2day,
+    share_pv_list_2day,
+    share_uv_list_3day,
+    share_pv_list_3day,
+    view_uv_list_1h,
+    view_pv_list_1h,
+    play_uv_list_1h,
+    play_pv_list_1h,
+    share_uv_list_1h,
+    share_pv_list_1h,
+    return_uv_list_1h,
+    p_return_uv_list_1h,
+    i_1day_ctr_rt,
+    i_1day_str_rt,
+    i_1day_ros_rt,
+    i_1day_rov_rt,
+    i_1h_ctr_rt,
+    i_1h_str_rt,
+    i_1h_ros_rt,
+    i_1h_rov_rt,
+    u_7day_exp_cnt,
+    u_7day_click_cnt,
+    u_7day_share_cnt,
+    u_7day_return_cnt,
+    i_7day_exp_cnt,
+    i_7day_click_cnt,
+    i_7day_share_cnt,
+    i_7day_return_cnt,
+    u_7day_ctr,
+    u_7day_str,
+    u_7day_rov,
+    u_7day_ros,
+    i_7day_ctr,
+    i_7day_str,
+    i_7day_rov,
+    i_7day_ros
+    ;
+
+
+    private final byte[] idBytes;
+    private final byte[] nameBytes;
+
+    OfflineVlogFeatureGroupV2() {
+        this.idBytes = String.valueOf(ordinal()).getBytes();
+        this.nameBytes = name().toLowerCase().getBytes();
+    }
+
+    public final int getId() {
+        return ordinal();
+    }
+
+    public final String getGroupName() {
+        return name().toLowerCase();
+    }
+
+    public final byte[] getGroupNameBytes() {
+        return getGroupName().getBytes();
+    }
+
+    public final byte[] getIdBytes() {
+        return idBytes;
+    }
+
+}

+ 68 - 0
src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractor.java

@@ -0,0 +1,68 @@
+package examples.dataloader;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.tzld.piaoquan.recommend.feature.domain.video.base.RequestContextBytesFeature;
+import com.tzld.piaoquan.recommend.feature.domain.video.base.UserBytesFeature;
+import com.tzld.piaoquan.recommend.feature.domain.video.base.VideoBytesFeature;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesGroup;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesUtils;
+import com.tzld.piaoquan.recommend.feature.model.sample.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class OfflineVlogShareLRFeatureExtractor {
+
+    public ListMultimap<FeatureGroup, BaseFeature> featureMap = ArrayListMultimap.create();
+
+    final private BytesUtils utils;
+    final private int groupCount = OfflineVlogFeatureGroup.values().length;
+    public OfflineVlogShareLRFeatureExtractor() {
+        BytesGroup[] groups = new BytesGroup[OfflineVlogFeatureGroup.values().length];
+        // iterate the enum values directly; each group's id is its ordinal
+        for (OfflineVlogFeatureGroup g : OfflineVlogFeatureGroup.values()) {
+            groups[g.ordinal()] = new BytesGroup(g.ordinal(), g.getGroupName(), g.getGroupNameBytes());
+        }
+        this.utils = new BytesUtils(groups);
+    }
+    public void makeFeature(Map<String, Object> maps){
+        for (Map.Entry<String, Object> entry : maps.entrySet()){
+            OfflineVlogFeatureGroup ovf = OfflineVlogFeatureGroup.valueOf(entry.getKey());
+            Object value = entry.getValue();
+            if (value instanceof String){
+                this.makeFea(ovf, ((String)value).getBytes());
+            }else if (value instanceof Double){
+                this.makeFea(ovf, String.valueOf((Double)value).getBytes());
+            }else if (value instanceof Integer){
+                //todo
+            }else{
+                // fall back to stringifying unknown value types instead of an unsafe String cast
+                this.makeFea(ovf, String.valueOf(value).getBytes());
+            }
+        }
+    }
+    public void makeFeature4String(Map<String, String> maps){
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            OfflineVlogFeatureGroup ovf = OfflineVlogFeatureGroup.valueOf(entry.getKey());
+            String value = entry.getValue();
+            this.makeFea(ovf, value.getBytes());
+        }
+    }
+
+    private FeatureGroup makeGroup(OfflineVlogFeatureGroup group) {
+        FeatureGroup.Builder g = FeatureGroup.newBuilder();
+        g.setType("1");
+        g.setName(group.getGroupName());
+        g.setId(group.ordinal());
+        return g.build();
+    }
+    void makeFea(OfflineVlogFeatureGroup group, byte[] value) {
+        FeatureGroup featureGroup = this.makeGroup(group);
+        BaseFeature feature = this.utils.makeFea(group.ordinal(), value);
+        this.featureMap.put(featureGroup, feature);
+    }
+
+}
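
A minimal usage sketch for the extractor (editor's illustration, not part of the commit; the demo class name is hypothetical, and it assumes the com.tzld.piaoquan recommend-feature classes imported above are on the classpath). Map keys must match OfflineVlogFeatureGroup constant names exactly, or valueOf throws IllegalArgumentException:

    import examples.dataloader.OfflineVlogShareLRFeatureExtractor;

    import java.util.HashMap;
    import java.util.Map;

    public class OfflineVlogShareLRFeatureExtractorDemo {
        public static void main(String[] args) {
            Map<String, String> raw = new HashMap<>();
            raw.put("machineinfo_brand", "xiaomi");  // string feature (sample value)
            raw.put("u_1day_share_cnt", "7");        // pre-bucketed count, passed as a string
            raw.put("ctx_hour", "20");

            OfflineVlogShareLRFeatureExtractor extractor = new OfflineVlogShareLRFeatureExtractor();
            extractor.makeFeature4String(raw);

            // one BaseFeature per input entry, keyed by its FeatureGroup
            System.out.println(extractor.featureMap.size()); // 3
        }
    }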

+ 66 - 0
src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV1.java

@@ -0,0 +1,66 @@
+package examples.dataloader;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesGroup;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesUtils;
+import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
+import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
+
+import java.util.Map;
+
+public class OfflineVlogShareLRFeatureExtractorV1 {
+
+    public ListMultimap<FeatureGroup, BaseFeature> featureMap = ArrayListMultimap.create();
+
+    final private BytesUtils utils;
+    final private int groupCount = OfflineVlogFeatureGroupV1.values().length;
+    public OfflineVlogShareLRFeatureExtractorV1() {
+        BytesGroup[] groups = new BytesGroup[OfflineVlogFeatureGroupV1.values().length];
+        // iterate the enum values directly; each group's id is its ordinal
+        for (OfflineVlogFeatureGroupV1 g : OfflineVlogFeatureGroupV1.values()) {
+            groups[g.ordinal()] = new BytesGroup(g.ordinal(), g.getGroupName(), g.getGroupNameBytes());
+        }
+        this.utils = new BytesUtils(groups);
+    }
+    public void makeFeature(Map<String, Object> maps){
+        for (Map.Entry<String, Object> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV1 ovf = OfflineVlogFeatureGroupV1.valueOf(entry.getKey());
+            Object value = entry.getValue();
+            if (value instanceof String){
+                this.makeFea(ovf, ((String)value).getBytes());
+            }else if (value instanceof Double){
+                this.makeFea(ovf, String.valueOf((Double)value).getBytes());
+            }else if (value instanceof Integer){
+                //todo
+            }else{
+                // fall back to stringifying unknown value types instead of an unsafe String cast
+                this.makeFea(ovf, String.valueOf(value).getBytes());
+            }
+        }
+    }
+    public void makeFeature4String(Map<String, String> maps){
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV1 ovf = OfflineVlogFeatureGroupV1.valueOf(entry.getKey());
+            String value = entry.getValue();
+            this.makeFea(ovf, value.getBytes());
+        }
+    }
+
+    private FeatureGroup makeGroup(OfflineVlogFeatureGroupV1 group) {
+        FeatureGroup.Builder g = FeatureGroup.newBuilder();
+        g.setType("1");
+        g.setName(group.getGroupName());
+        g.setId(group.ordinal());
+        return g.build();
+    }
+    void makeFea(OfflineVlogFeatureGroupV1 group, byte[] value) {
+        FeatureGroup featureGroup = this.makeGroup(group);
+        BaseFeature feature = this.utils.makeFea(group.ordinal(), value);
+        this.featureMap.put(featureGroup, feature);
+    }
+    
+}

+ 66 - 0
src/main/java/examples/dataloader/OfflineVlogShareLRFeatureExtractorV2.java

@@ -0,0 +1,66 @@
+package examples.dataloader;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesGroup;
+import com.tzld.piaoquan.recommend.feature.domain.video.feature.BytesUtils;
+import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
+import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
+
+import java.util.Map;
+
+public class OfflineVlogShareLRFeatureExtractorV2 {
+
+    public ListMultimap<FeatureGroup, BaseFeature> featureMap = ArrayListMultimap.create();
+
+    final private BytesUtils utils;
+    final private int groupCount = OfflineVlogFeatureGroupV2.values().length;
+    public OfflineVlogShareLRFeatureExtractorV2() {
+        BytesGroup[] groups = new BytesGroup[OfflineVlogFeatureGroupV2.values().length];
+        // iterate the enum values directly; each group's id is its ordinal
+        for (OfflineVlogFeatureGroupV2 g : OfflineVlogFeatureGroupV2.values()) {
+            groups[g.ordinal()] = new BytesGroup(g.ordinal(), g.getGroupName(), g.getGroupNameBytes());
+        }
+        this.utils = new BytesUtils(groups);
+    }
+    public void makeFeature(Map<String, Object> maps){
+        for (Map.Entry<String, Object> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV2 ovf = OfflineVlogFeatureGroupV2.valueOf(entry.getKey());
+            Object value = entry.getValue();
+            if (value instanceof String){
+                this.makeFea(ovf, ((String)value).getBytes());
+            }else if (value instanceof Double){
+                this.makeFea(ovf, String.valueOf((Double)value).getBytes());
+            }else if (value instanceof Integer){
+                //todo
+            }else{
+                // fall back to stringifying unknown value types instead of an unsafe String cast
+                this.makeFea(ovf, String.valueOf(value).getBytes());
+            }
+        }
+    }
+    public void makeFeature4String(Map<String, String> maps){
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            OfflineVlogFeatureGroupV2 ovf = OfflineVlogFeatureGroupV2.valueOf(entry.getKey());
+            String value = entry.getValue();
+            this.makeFea(ovf, value.getBytes());
+        }
+    }
+
+    private FeatureGroup makeGroup(OfflineVlogFeatureGroupV2 group) {
+        FeatureGroup.Builder g = FeatureGroup.newBuilder();
+        g.setType("1");
+        g.setName(group.getGroupName());
+        g.setId(group.ordinal());
+        return g.build();
+    }
+    void makeFea(OfflineVlogFeatureGroupV2 group, byte[] value) {
+        FeatureGroup featureGroup = this.makeGroup(group);
+        BaseFeature feature = this.utils.makeFea(group.ordinal(), value);
+        this.featureMap.put(featureGroup, feature);
+    }
+    
+}

+ 120 - 0
src/main/java/examples/dataloader/RequestContextOffline.java

@@ -0,0 +1,120 @@
+package examples.dataloader;
+
+import com.aliyun.odps.data.Record;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class RequestContextOffline {
+    public Map<String, Object> featureMap = new HashMap<>();
+
+    public void putUserFeature(Record record){
+        setKVinMap(record, "machineinfo_brand", "string", "");
+        setKVinMap(record, "machineinfo_model", "string", "");
+        setKVinMap(record, "machineinfo_platform", "string","");
+        setKVinMap(record, "machineinfo_system", "string","");
+
+        setKVinMap(record, "u_1day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "u_1day_click_cnt", "double", "cnt");
+        setKVinMap(record, "u_1day_share_cnt", "double", "cnt");
+        setKVinMap(record, "u_1day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "u_ctr_1day", "double", "rate");
+        setKVinMap(record, "u_str_1day", "double", "rate");
+        setKVinMap(record, "u_rov_1day", "double", "rate");
+        setKVinMap(record, "u_ros_1day", "double", "rate");
+
+        setKVinMap(record, "u_3day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "u_3day_click_cnt", "double", "cnt");
+        setKVinMap(record, "u_3day_share_cnt", "double", "cnt");
+        setKVinMap(record, "u_3day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "u_ctr_3day", "double", "rate");
+        setKVinMap(record, "u_str_3day", "double", "rate");
+        setKVinMap(record, "u_rov_3day", "double", "rate");
+        setKVinMap(record, "u_ros_3day", "double", "rate");
+    }
+    public void putItemFeature(Record record){
+        // setKVinMap(record, "i_title_len", "double", "cnt");
+        setKVinMap(record, "total_time", "double", "cnt");
+        // setKVinMap(record, "i_days_since_upload", "double", "cnt");
+        setKVinMap(record, "play_count_total", "double", "cnt");
+
+        setKVinMap(record, "i_1day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "i_1day_click_cnt", "double", "cnt");
+        setKVinMap(record, "i_1day_share_cnt", "double", "cnt");
+        setKVinMap(record, "i_1day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "i_ctr_1day", "double", "rate");
+        setKVinMap(record, "i_str_1day", "double", "rate");
+        setKVinMap(record, "i_rov_1day", "double", "rate");
+        setKVinMap(record, "i_ros_1day", "double", "rate");
+
+        setKVinMap(record, "i_3day_exp_cnt", "double", "cnt");
+        setKVinMap(record, "i_3day_click_cnt", "double", "cnt");
+        setKVinMap(record, "i_3day_share_cnt", "double", "cnt");
+        setKVinMap(record, "i_3day_return_cnt", "double", "cnt");
+
+        setKVinMap(record, "i_ctr_3day", "double", "rate");
+        setKVinMap(record, "i_str_3day", "double", "rate");
+        setKVinMap(record, "i_rov_3day", "double", "rate");
+        setKVinMap(record, "i_ros_3day", "double", "rate");
+    }
+
+    public void putSceneFeature(Record record){
+        setKVinMap(record, "ctx_week", "string", "");
+        setKVinMap(record, "ctx_hour", "string", "");
+        setKVinMap(record, "ctx_region", "string","");
+        setKVinMap(record, "ctx_city", "string","");
+    }
+    public void setKVinMap(Record record, String key, String instance, String cntOrRate){
+        if (!Arrays.stream(record.getColumns()).map(r-> r.getName()).collect(Collectors.toSet()).contains(key)){
+            return;
+        }
+        String value;
+        try{
+            value = record.getString(key);
+        }catch (Exception e){
+            value = String.valueOf(record.getBigint(key));
+        }
+
+        if (value == null){
+            return;
+        }
+        String ins = instance.toLowerCase();
+        switch (ins){
+            case "string":
+                featureMap.put(key, value);
+                return;
+            case "double":
+                if ("cnt".equals(cntOrRate)){
+                    featureMap.put(key, String.valueOf(this.bucketRatioFeature(Double.valueOf(value))));
+                }else if ("rate".equals(cntOrRate)){
+                    featureMap.put(key, String.valueOf(this.ceilLog(Double.valueOf(value))));
+                }
+                return;
+            case "int":
+                return;
+            case "long":
+                return;
+            default:
+                return;
+        }
+    }
+
+
+    public double ceilLog(Double key) {
+        return Math.ceil(Math.log(key + 1.0));
+    }
+
+    public double bucketRatioFeature(Double key) {
+        long bucket = Math.round(Math.log((key + 1.0) * 50.0));
+        if (bucket > 50L) {
+            bucket = 50L;
+        }
+
+        return (double)bucket;
+    }
+}
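
A small sketch of the two bucketing transforms used by setKVinMap (editor's illustration, not part of the commit; the demo class name is hypothetical): "cnt" columns go through bucketRatioFeature and "rate" columns through ceilLog, and both land in featureMap as strings:

    import examples.dataloader.RequestContextOffline;

    public class RequestContextOfflineDemo {
        public static void main(String[] args) {
            RequestContextOffline ctx = new RequestContextOffline();
            // "cnt" columns: log-scaled bucket, capped at 50
            System.out.println(ctx.bucketRatioFeature(100.0)); // round(ln(101 * 50)) = 9.0
            // "rate" columns: ceiling of ln(x + 1)
            System.out.println(ctx.ceilLog(100.0));            // ceil(ln(101)) = 5.0
        }
    }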

+ 45 - 0
src/main/java/examples/dataloader/redisBuilderMyself.java

@@ -0,0 +1,45 @@
+package examples.dataloader;
+
+import org.springframework.data.redis.connection.RedisConnectionFactory;
+import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+import redis.clients.jedis.JedisPoolConfig;
+import org.springframework.data.redis.connection.jedis.JedisClientConfiguration;
+import org.springframework.data.redis.core.RedisTemplate;
+
+
+public class redisBuilderMyself {
+
+
+    public static JedisConnectionFactory redisConnectionFactory() {
+
+        RedisStandaloneConfiguration config = new RedisStandaloneConfiguration(
+                "r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com", 6379);
+        config.setPassword("Wqsd@2019");
+
+        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
+        // Maximum number of connections; size for the workload, but do not exceed the instance spec's connection limit.
+        jedisPoolConfig.setMaxTotal(30);
+        // Maximum number of idle connections; size for the workload, but do not exceed the instance spec's connection limit.
+        jedisPoolConfig.setMaxIdle(20);
+        // Disable testOnBorrow/testOnReturn to avoid the extra PING commands they generate.
+        jedisPoolConfig.setTestOnBorrow(false);
+        jedisPoolConfig.setTestOnReturn(false);
+
+        JedisClientConfiguration jedisClientConfiguration = JedisClientConfiguration.builder().usePooling().poolConfig(
+                jedisPoolConfig).build();
+
+        return new JedisConnectionFactory(config, jedisClientConfiguration);
+    }
+
+
+
+
+    public static RedisTemplate<String, String> redisTemplate(RedisConnectionFactory connectionFactory) {
+        RedisTemplate<String, String> template = new RedisTemplate<>();
+        template.setConnectionFactory(connectionFactory);
+        return template;
+    }
+}
+
+

+ 164 - 0
src/main/java/examples/extractor/ExtractorUtils.java

@@ -0,0 +1,164 @@
+package examples.extractor;
+
+import java.util.Map;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.List;
+public class ExtractorUtils {
+
+    public static Double division(String s1, String s2, Map<String, String> maps){
+        double rate = 0.0;
+        if (maps.containsKey(s1) && maps.containsKey(s2)){
+            Double d1 = Double.valueOf(maps.get(s1));
+            if (isDoubleEqualToZero(d1)){
+                return rate;
+            }
+            Double d2 = Double.valueOf(maps.get(s2));
+            rate = d2 / d1;
+        }
+        return rate;
+    }
+    public static Double divisionDouble(Double d1, Double d2){
+        double rate = 0.0;
+        if (isDoubleEqualToZero(d1)){
+            return rate;
+        }
+        rate = d2 / d1;
+        return rate;
+    }
+
+    public static boolean isDoubleEqualToZero(double value) {
+        final double epsilon = 1e-10; // a tiny tolerance for floating-point comparison
+        // treat the value as zero if it falls within the tolerance
+        return Math.abs(value) < epsilon;
+    }
+
+
+
+    public static double calculateVariance(List<Double> numbers) {
+        double average = numbers.stream()
+                .mapToDouble(Double::doubleValue)
+                .average()
+                .orElse(0.0);
+
+        double squaredDiffSum = numbers.stream()
+                .mapToDouble(Double::doubleValue)
+                .map(x -> Math.pow(x - average, 2))
+                .average()
+                .orElse(0.0);
+
+        return squaredDiffSum;
+    }
+
+    public static double calculateAverage(List<Double> numbers) {
+        if (numbers == null || numbers.isEmpty()) {
+            return 0.0;
+        }
+        return numbers.stream()
+                .mapToDouble(Number::doubleValue)
+                .average()
+                .orElse(0.0);
+    }
+
+    public static List<Double> calculateDifferences(List<Double> numbers) {
+        List<Double> differences = new ArrayList<>();
+
+        for (int i = 0; i < numbers.size() - 1; i++) {
+            Double diff = 0.0;
+            if (!isDoubleEqualToZero(numbers.get(i))){
+                diff = (numbers.get(i + 1) - numbers.get(i)) / numbers.get(i);
+            }
+            differences.add(diff);
+        }
+
+        return differences;
+    }
+
+    public static List<String> generateHourStrings(String timeString, int N) {
+        LocalDateTime dateTime = LocalDateTime.parse(timeString, DateTimeFormatter.ofPattern("yyyyMMddHH"));
+        List<String> hourStrings = new ArrayList<>();
+        for (int i = 0; i < N; i++) {
+            hourStrings.add(dateTime.minusHours(i).format(DateTimeFormatter.ofPattern("yyyyMMddHH")));
+        }
+
+        return hourStrings;
+    }
+
+    public static String subtractHours(String inputDateTime, int hoursToSubtract) {
+        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHH");
+        LocalDateTime dateTime = LocalDateTime.parse(inputDateTime, formatter);
+        LocalDateTime subtractedDateTime = dateTime.minusHours(hoursToSubtract);
+        return subtractedDateTime.format(formatter);
+    }
+
+    // Bucketize rate-like values in the [0, 1] range.
+    public static Integer ceilLogRate(Double key) {
+        double bucket = Math.ceil(
+                Math.pow(key, 0.2) * 100
+        );
+        if (bucket > 300) {
+            bucket = 300;
+        }
+        if (bucket < 0) {
+            bucket = 0;
+        }
+        return (int)bucket;
+    }
+
+    // Bucketize count-like values greater than 1.
+    public static int bucketCnt(Double key) {
+        long bucket = Math.round(Math.log((key * 10 + 1.0)) * 10);
+        if (bucket > 300) {
+            bucket = 300;
+        }
+        if (bucket < 0) {
+            bucket = 0;
+        }
+        return (int)bucket;
+    }
+
+    public static int findInsertPosition(double[] sortedArray, double target) {
+        int low = 0;
+        int high = sortedArray.length - 1;
+
+        while (low <= high) {
+            int mid = low + (high - low) / 2;
+            double midValue = sortedArray[mid];
+
+            if (midValue < target) {
+                low = mid + 1;
+            } else if (midValue > target) {
+                high = mid - 1;
+            } else {
+                // found an equal value; scan right past duplicates to find the insertion point
+                while (mid < sortedArray.length - 1 && sortedArray[mid + 1] == target) {
+                    mid++;
+                }
+                return mid + 1; // insert just after the last duplicate
+            }
+        }
+
+        return low; // low is the insertion point
+    }
+
+    public static void main(String[] args) {
+        double[] sortedArray = {1.0, 2.0, 4.0, 4.0, 6.0};
+        double target = 0.0;
+        System.out.println(findInsertPosition(sortedArray, target));
+
+
+//        System.out.println(ceilLogRate(0.0002));
+//        System.out.println(ceilLogRate(0.01));
+//        System.out.println(ceilLogRate(0.2));
+//        System.out.println(ceilLogRate(4.));
+//        System.out.println(bucketCnt(1.));
+//        System.out.println(bucketCnt(20.));
+//        System.out.println(bucketCnt(500.));
+//        System.out.println(bucketCnt(50000.));
+
+//        System.out.println(generateHourStrings("2024011603", 5));
+
+    }
+
+}
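
A worked example for the time-window helpers (editor's illustration, not part of the commit; the demo class name is hypothetical). generateHourStrings walks backwards hour by hour from a yyyyMMddHH string, and calculateDifferences turns a series into hour-over-hour relative changes:

    import examples.extractor.ExtractorUtils;

    import java.util.Arrays;
    import java.util.List;

    public class ExtractorUtilsDemo {
        public static void main(String[] args) {
            // [2024011603, 2024011602, 2024011601]
            System.out.println(ExtractorUtils.generateHourStrings("2024011603", 3));

            List<Double> series = Arrays.asList(2.0, 4.0, 8.0);
            // relative hour-over-hour changes: [(4-2)/2, (8-4)/4] = [1.0, 1.0]
            System.out.println(ExtractorUtils.calculateDifferences(series));
            System.out.println(ExtractorUtils.calculateAverage(series));  // 4.666...
            System.out.println(ExtractorUtils.calculateVariance(series)); // population variance, about 6.222
        }
    }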

+ 33 - 0
src/main/java/examples/extractor/RankExtractorFeature_20240530.java

@@ -0,0 +1,33 @@
+package examples.extractor;
+
+public class RankExtractorFeature_20240530 {
+
+    public static Double calDiv(double a, double b){
+        if (a == 0 || b == 0){
+            return 0D;
+        }
+        return a / b;
+    }
+    public static Double calLog(double a){
+        if (a <= 0){
+            return 0D;
+        }
+        return Math.log(a + 1.0);
+    }
+
+    public static void main(String[] args) {
+        System.out.println(Math.log(10));
+        System.out.println(Math.log(100));
+        System.out.println(Math.log(1000));
+        System.out.println(Math.log(10000));
+        System.out.println(Math.log(100000));
+
+        System.out.println(Math.log10(10));
+        System.out.println(Math.log10(100));
+        System.out.println(Math.log10(1000));
+        System.out.println(Math.log10(10000));
+        System.out.println(Math.log10(100000));
+    }
+}
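
A short sketch of the guard behaviour (editor's illustration, not part of the commit; the demo class name is hypothetical): calDiv returns 0 whenever either operand is 0, and calLog is ln(a + 1) for positive a and 0 otherwise:

    import examples.extractor.RankExtractorFeature_20240530;

    public class RankExtractorFeatureDemo {
        public static void main(String[] args) {
            System.out.println(RankExtractorFeature_20240530.calDiv(3.0, 4.0)); // 0.75
            System.out.println(RankExtractorFeature_20240530.calDiv(3.0, 0.0)); // 0.0 (zero guard, no Infinity/NaN)
            System.out.println(RankExtractorFeature_20240530.calLog(0.0));      // 0.0
        }
    }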
+
+

+ 324 - 0
src/main/java/examples/extractor/RankExtractorItemFeature.java

@@ -0,0 +1,324 @@
+package examples.extractor;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class RankExtractorItemFeature {
+    public static Map<String, String> getItemRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_str",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_rov",d);
+        }
+        d = ExtractorUtils.division("i_1day_share_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_str",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_rov",d);
+        }
+        d = ExtractorUtils.division("i_3day_share_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_str",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_rov",d);
+        }
+        d = ExtractorUtils.division("i_7day_share_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_str",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_rov",d);
+        }
+        d = ExtractorUtils.division("i_3month_share_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ros",d);
+        }
+
+
+        return rateFeatureChange(result);
+    }
+
+    public static Map<String, String> getItemRealtimeTrend(Map<String, Map<String, Double>> maps, String date, String hour){
+        Map<String, Double> result1 = new HashMap<>();
+        Map<String, Double> result2 = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result1);
+        }
+        int N = 6;
+
+        List<String> hourStrs = ExtractorUtils.generateHourStrings(date + hour, N);
+
+        String key;
+
+        key = "share_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "share_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+        Map<String, String> r1 = cntFeatureChange4Double(result1);
+        Map<String, String> r2 = rateFeatureChange(result2);
+        r1.putAll(r2);
+
+        return r1;
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+    public static Map<String, String> cntFeatureChange4Double(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps,
+                                                       Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> getItemRealtimeCnt(Map<String, Map<String, Double>> maps,
+                                                         Set<String> names,
+                                                         String date, String hour){
+        Map<String, String> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return result;
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+        for (Map.Entry<String, Map<String, Double>> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            Double num = entry.getValue().getOrDefault(dateHour, 0.0);
+            if (!ExtractorUtils.isDoubleEqualToZero(num)){
+                result.put(entry.getKey(), String.valueOf(ExtractorUtils.bucketCnt(num)));
+            }
+        }
+        return result;
+    }
+
+    public static Map<String, String> getItemRealtimeRate(Map<String, Map<String, Double>> maps,
+                                                         String date, String hour){
+        Map<String, Double> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result);
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+
+        double d, d1, d2;
+        String k1, k2;
+
+        k1 = "view_pv_list_1day";
+        k2 = "play_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "share_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_rov_rt", d);
+            }
+        }
+
+        //---
+        k1 = "view_pv_list_1h";
+        k2 = "play_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "share_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_rov_rt", d);
+            }
+        }
+
+
+        return rateFeatureChange(result);
+    }
+
+    public static void main(String[] args) {
+        String s1 = "share_uv_list_1day";
+        String s2 = "2024011300:2,2024011301:2,2024011304:2,2024011309:3,2024011311:3,2024011314:4,2024011315:4,2024011321:1,2024011323:1,2024011400:1,2024011401:1,2024011404:1,2024011406:1,2024011407:1,2024011408:1,2024011410:1,2024011423:1,2024011302:2,2024011305:2,2024011312:4,2024011313:4,2024011317:4,2024011318:4,2024011319:3,2024011320:1,2024011403:1,2024011409:1,2024011411:1,2024011419:1,2024011420:1,2024011422:1,2024011303:2,2024011306:2,2024011307:2,2024011308:2,2024011310:3,2024011316:4,2024011322:1,2024011402:1,2024011405:1,2024011421:1";
+        Map<String, Double> m1 = new HashMap<>();
+        Map<String, Map<String, Double>> maps = new HashMap<>();
+        for (String s : s2.split(",")){
+            String s3 = s.split(":")[0];
+            String s4 = s.split(":")[1];
+            m1.put(s3, Double.valueOf(s4));
+        }
+        maps.put(s1, m1);
+
+        String date = "20240114";
+        String hour = "20";
+        System.out.println(getItemRealtimeTrend(maps, date, hour));
+    }
+}
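
A minimal usage sketch for getItemRateFeature (editor's illustration, not part of the commit; the demo class name is hypothetical): the rate features are derived from the raw count columns and then bucketed through ExtractorUtils.ceilLogRate before being returned as strings:

    import examples.extractor.RankExtractorItemFeature;

    import java.util.HashMap;
    import java.util.Map;

    public class RankExtractorItemFeatureDemo {
        public static void main(String[] args) {
            Map<String, String> counts = new HashMap<>();
            counts.put("i_1day_exp_cnt", "1000");
            counts.put("i_1day_click_cnt", "120");
            counts.put("i_1day_share_cnt", "30");
            counts.put("i_1day_return_cnt", "15");

            // ctr = 120/1000, str = 30/1000, rov = 15/1000, ros = 15/30,
            // each bucketed via ceilLogRate and emitted as a string
            System.out.println(RankExtractorItemFeature.getItemRateFeature(counts));
        }
    }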

+ 338 - 0
src/main/java/examples/extractor/RankExtractorItemFeatureV2.java

@@ -0,0 +1,338 @@
+package examples.extractor;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class RankExtractorItemFeatureV2 {
+    public static Map<String, String> getItemRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_str",d);
+        }
+        d = ExtractorUtils.division("i_1day_exp_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_rov",d);
+        }
+        d = ExtractorUtils.division("i_1day_share_cnt", "i_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_str",d);
+        }
+        d = ExtractorUtils.division("i_3day_exp_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_rov",d);
+        }
+        d = ExtractorUtils.division("i_3day_share_cnt", "i_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_str",d);
+        }
+        d = ExtractorUtils.division("i_7day_exp_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_rov",d);
+        }
+        d = ExtractorUtils.division("i_7day_share_cnt", "i_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_str",d);
+        }
+        d = ExtractorUtils.division("i_3month_exp_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_rov",d);
+        }
+        d = ExtractorUtils.division("i_3month_share_cnt", "i_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("i_3month_ros",d);
+        }
+
+
+        Map<String, String> result2 = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result.entrySet()){
+            result2.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+        return result2;
+    }
+
+    public static Map<String, String> getItemRealtimeTrend(Map<String, Map<String, Double>> maps, String date, String hour){
+        Map<String, Double> result1 = new HashMap<>();
+        Map<String, Double> result2 = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result1);
+        }
+        int N = 6;
+
+        List<String> hourStrs = ExtractorUtils.generateHourStrings(date + hour, N);
+
+        String key;
+
+        key = "share_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1day";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "share_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+
+        key = "return_uv_list_1h";
+        if (maps.containsKey(key)){
+            Map<String, Double> fList = maps.get(key);
+            List<Double> arrs = hourStrs.stream().map(r -> fList.getOrDefault(r, 0.0D)).collect(Collectors.toList());
+            Collections.reverse(arrs);
+            result1.put(key+"_"+N+"_avg", ExtractorUtils.calculateAverage(arrs));
+            result1.put(key+"_"+N+"_var", ExtractorUtils.calculateVariance(arrs));
+
+            List<Double> arrsDiff = ExtractorUtils.calculateDifferences(arrs);
+            result2.put(key+"_diff_"+N+"_avg", ExtractorUtils.calculateAverage(arrsDiff));
+            result2.put(key+"_diff_"+N+"_var", ExtractorUtils.calculateVariance(arrsDiff));
+        }
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result1.entrySet()){
+            result.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+        for (Map.Entry<String, Double> entry : result2.entrySet()){
+            result.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+
+        return result;
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+    public static Map<String, String> cntFeatureChange4Double(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.bucketCnt(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps,
+                                                       Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
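+    // Picks the requested features' counter for the current date+hour, skipping zero values.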
+    public static Map<String, String> getItemRealtimeCnt(Map<String, Map<String, Double>> maps,
+                                                         Set<String> names,
+                                                         String date, String hour){
+        Map<String, String> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return result;
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+        for (Map.Entry<String, Map<String, Double>> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            Double num = entry.getValue().getOrDefault(dateHour, 0.0);
+            if (!ExtractorUtils.isDoubleEqualToZero(num)){
+                result.put(entry.getKey(), String.valueOf(num));
+            }
+        }
+        return result;
+    }
+
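+    // Builds ctr/str/ros/rov ratios for the current hour from the 1-day and 1-hour counters, keeping non-zero values only.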
+    public static Map<String, String> getItemRealtimeRate(Map<String, Map<String, Double>> maps,
+                                                         String date, String hour){
+        Map<String, Double> result = new HashMap<>();
+        if (date.isEmpty() || hour.isEmpty()){
+            return rateFeatureChange(result);
+        }
+        String dateHour = ExtractorUtils.subtractHours(date + hour, 0);
+
+        double d, d1, d2;
+        String k1, k2;
+
+        k1 = "view_pv_list_1day";
+        k2 = "play_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "share_pv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1day";
+        k2 = "return_uv_list_1day";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1day_rov_rt", d);
+            }
+        }
+
+        //---
+        k1 = "view_pv_list_1h";
+        k2 = "play_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ctr_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "share_pv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_str_rt", d);
+            }
+        }
+
+        k1 = "share_pv_list_1day";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_ros_rt", d);
+            }
+        }
+
+        k1 = "view_pv_list_1h";
+        k2 = "return_uv_list_1h";
+        if (maps.containsKey(k1) && maps.containsKey(k2)){
+            d1 = maps.get(k1).getOrDefault(dateHour, 0.0);
+            d2 = maps.get(k2).getOrDefault(dateHour, 0.0);
+            d = ExtractorUtils.divisionDouble(d1, d2);
+            if (!ExtractorUtils.isDoubleEqualToZero(d)){
+                result.put("i_1h_rov_rt", d);
+            }
+        }
+        Map<String, String> result2 = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result.entrySet()){
+            result2.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+
+        return result2;
+    }
+
+    public static void main(String[] args) {
+        String s1 = "share_uv_list_1day";
+        String s2 = "2024011300:2,2024011301:2,2024011304:2,2024011309:3,2024011311:3,2024011314:4,2024011315:4,2024011321:1,2024011323:1,2024011400:1,2024011401:1,2024011404:1,2024011406:1,2024011407:1,2024011408:1,2024011410:1,2024011423:1,2024011302:2,2024011305:2,2024011312:4,2024011313:4,2024011317:4,2024011318:4,2024011319:3,2024011320:1,2024011403:1,2024011409:1,2024011411:1,2024011419:1,2024011420:1,2024011422:1,2024011303:2,2024011306:2,2024011307:2,2024011308:2,2024011310:3,2024011316:4,2024011322:1,2024011402:1,2024011405:1,2024011421:1";
+        Map<String, Double> m1 = new HashMap<>();
+        Map<String, Map<String, Double>> maps = new HashMap<>();
+        for (String s : s2.split(",")){
+            String s3 = s.split(":")[0];
+            String s4 = s.split(":")[1];
+            m1.put(s3, Double.valueOf(s4));
+        }
+        maps.put(s1, m1);
+
+        String date = "20240114";
+        String hour = "20";
+        System.out.println(getItemRealtimeTrend(maps, date, hour));
+    }
+}
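
For reference, getItemRealtimeTrend above relies on three small statistical helpers on ExtractorUtils (calculateAverage, calculateVariance, calculateDifferences). The sketch below is not the committed ExtractorUtils code; it only illustrates the behaviour this class assumes of those helpers (mean, population variance, and first-order differences of an hourly series), and the class name ExtractorUtilsSketch is hypothetical.

import java.util.ArrayList;
import java.util.List;

// Minimal sketch of the assumed helper behaviour; not the committed ExtractorUtils implementation.
class ExtractorUtilsSketch {

    // Arithmetic mean of the hourly values; 0.0 for an empty series.
    static double calculateAverage(List<Double> xs) {
        if (xs == null || xs.isEmpty()) return 0.0;
        double sum = 0.0;
        for (double x : xs) sum += x;
        return sum / xs.size();
    }

    // Population variance around the mean; 0.0 for an empty series.
    static double calculateVariance(List<Double> xs) {
        if (xs == null || xs.isEmpty()) return 0.0;
        double mean = calculateAverage(xs);
        double acc = 0.0;
        for (double x : xs) acc += (x - mean) * (x - mean);
        return acc / xs.size();
    }

    // First-order differences [x1-x0, x2-x1, ...], i.e. the hour-over-hour trend.
    static List<Double> calculateDifferences(List<Double> xs) {
        List<Double> diffs = new ArrayList<>();
        for (int i = 1; i < xs.size(); i++) {
            diffs.add(xs.get(i) - xs.get(i - 1));
        }
        return diffs;
    }
}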

+ 104 - 0
src/main/java/examples/extractor/RankExtractorUserFeature.java

@@ -0,0 +1,104 @@
+package examples.extractor;
+
+
+import java.util.*;
+
+public class RankExtractorUserFeature {
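+    // Computes user-level ctr/str/rov/ros ratios over 1day/3day/7day/3month windows and log-buckets them via rateFeatureChange.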
+    public static Map<String, String> getUserRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_str",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_rov",d);
+        }
+        d = ExtractorUtils.division("u_1day_share_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_str",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_rov",d);
+        }
+        d = ExtractorUtils.division("u_3day_share_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_str",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_rov",d);
+        }
+        d = ExtractorUtils.division("u_7day_share_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_str",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_rov",d);
+        }
+        d = ExtractorUtils.division("u_3month_share_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ros",d);
+        }
+
+        return rateFeatureChange(result);
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps, Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+}
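
A quick, hypothetical usage sketch for getUserRateFeature: the keys below are the counters the method reads, while the numeric values are made up. It assumes the caller sits in (or imports) the examples.extractor package.

import java.util.HashMap;
import java.util.Map;

// Hypothetical driver; illustrates the expected call pattern only.
public class UserRateFeatureDemo {
    public static void main(String[] args) {
        Map<String, String> counts = new HashMap<>();
        counts.put("u_1day_exp_cnt", "1200");    // sample values, not real data
        counts.put("u_1day_click_cnt", "300");
        counts.put("u_1day_share_cnt", "40");
        counts.put("u_1day_return_cnt", "15");

        // Returns log-bucketed ratio features such as u_1day_ctr / u_1day_str / u_1day_rov / u_1day_ros.
        Map<String, String> features = RankExtractorUserFeature.getUserRateFeature(counts);
        System.out.println(features);
    }
}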

+ 110 - 0
src/main/java/examples/extractor/RankExtractorUserFeatureV2.java

@@ -0,0 +1,110 @@
+package examples.extractor;
+
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+public class RankExtractorUserFeatureV2 {
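+    // Same ratios as RankExtractorUserFeature, but the raw double values are kept as strings instead of being log-bucketed.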
+    public static Map<String, String> getUserRateFeature(Map<String, String> maps) {
+
+        double d;
+        Map<String, Double> result = new HashMap<>();
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_str",d);
+        }
+        d = ExtractorUtils.division("u_1day_exp_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_rov",d);
+        }
+        d = ExtractorUtils.division("u_1day_share_cnt", "u_1day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_1day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_str",d);
+        }
+        d = ExtractorUtils.division("u_3day_exp_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_rov",d);
+        }
+        d = ExtractorUtils.division("u_3day_share_cnt", "u_3day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ctr",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_str",d);
+        }
+        d = ExtractorUtils.division("u_7day_exp_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_rov",d);
+        }
+        d = ExtractorUtils.division("u_7day_share_cnt", "u_7day_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_7day_ros",d);
+        }
+
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_click_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ctr",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_share_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_str",d);
+        }
+        d = ExtractorUtils.division("u_3month_exp_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_rov",d);
+        }
+        d = ExtractorUtils.division("u_3month_share_cnt", "u_3month_return_cnt", maps);
+        if (!ExtractorUtils.isDoubleEqualToZero(d)){
+            result.put("u_3month_ros",d);
+        }
+        Map<String, String> result2 = new HashMap<>();
+        for (Map.Entry<String, Double> entry : result.entrySet()){
+            result2.put(entry.getKey(), String.valueOf(entry.getValue()));
+        }
+
+        return result2;
+    }
+
+
+    public static Map<String, String> rateFeatureChange(Map<String, Double> maps){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, Double> entry : maps.entrySet()){
+            int value = ExtractorUtils.ceilLogRate(entry.getValue());
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+    public static Map<String, String> cntFeatureChange(Map<String, String> maps, Set<String> names){
+        Map<String, String> result = new HashMap<>();
+        for (Map.Entry<String, String> entry : maps.entrySet()){
+            if (!names.contains(entry.getKey())){
+                continue;
+            }
+            int value = ExtractorUtils.bucketCnt(Double.valueOf(entry.getValue()));
+            result.put(entry.getKey(), String.valueOf(value));
+        }
+        return result;
+    }
+
+}

+ 100 - 107
src/main/java/examples/sparksql/SparkAdCTRSampleLoader.java

@@ -1,107 +1,100 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-import com.google.common.collect.ListMultimap;
-
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.*;
-import com.tzld.piaoquan.recommend.feature.domain.ad.feature.VlogAdCtrLRFeatureExtractor;
-import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
-import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
-import com.tzld.piaoquan.recommend.feature.model.sample.GroupedFeature;
-import com.tzld.piaoquan.recommend.feature.model.sample.LRSamples;
-import examples.dataloader.AdSampleConstructor;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-
-public class SparkAdCTRSampleLoader {
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String table = "alg_ad_view_sample";
-        String hdfsPath = "/dw/recommend/model/ad_ctr_samples/" + partition;
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
-        readData.saveAsTextFile(hdfsPath);
-    }
-
-
-    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
-        @Override
-        public String call(Record record, TableSchema schema) throws Exception {
-            String labelName = "adclick_ornot";
-            String ret = singleParse(record, labelName);
-            return ret;
-        }
-    }
-
-
-    // Processing logic for a single log record
-    public static String singleParse(Record record, String labelName) {
-        // Parse the data
-        String label = record.getString(labelName);
-        if (label == null || label.equals("1")) {
-            label = "0";
-        } else {
-            label = "1";
-        }
-
-        // Initialize the feature objects from the SQL record
-        AdRequestContext requestContext = AdSampleConstructor.constructRequestContext(record);
-        UserAdFeature userFeature = AdSampleConstructor.constructUserFeature(record);
-        AdItemFeature itemFeature = AdSampleConstructor.constructItemFeature(record);
-
-        // Convert to bytes features
-        AdRequestContextBytesFeature adRequestContextBytesFeature = new AdRequestContextBytesFeature(requestContext);
-        UserAdBytesFeature userBytesFeature = new UserAdBytesFeature(userFeature);
-        AdItemBytesFeature adItemBytesFeature = new AdItemBytesFeature(itemFeature);
-
-        // Feature extraction
-        VlogAdCtrLRFeatureExtractor bytesFeatureExtractor;
-        bytesFeatureExtractor = new VlogAdCtrLRFeatureExtractor();
-
-        LRSamples lrSamples = bytesFeatureExtractor.single(userBytesFeature, adItemBytesFeature, adRequestContextBytesFeature);
-
-        return parseSamplesToString2(label, lrSamples);
-    }
-
-
-
-    // Build the sample string
-    public static String parseSamplesToString2(String label, LRSamples lrSamples) {
-        ArrayList<String> featureList = new ArrayList<String>();
-        for (int i = 0; i < lrSamples.getFeaturesCount(); i++) {
-            GroupedFeature groupedFeature = lrSamples.getFeatures(i);
-            if (groupedFeature != null && groupedFeature.getFeaturesCount() != 0) {
-                for (int j = 0; j < groupedFeature.getFeaturesCount(); j++) {
-                    BaseFeature baseFeature = groupedFeature.getFeatures(j);
-                    if (baseFeature != null) {
-                        featureList.add(String.valueOf(baseFeature.getIdentifier()) + ":1" );
-                    }
-                }
-            }
-        }
-        return label + "\t" + String.join("\t", featureList);
-    }
-
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.google.common.collect.ListMultimap;
+//import com.tzld.piaoquan.ad.engine.commons.base.*;
+//import com.tzld.piaoquan.ad.engine.commons.score.feature.VlogAdCtrLRFeatureExtractor;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.BaseFeature;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.FeatureGroup;
+//import examples.dataloader.AdSampleConstructor;
+//import examples.dataloader.RecommendSampleConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//
+//import java.util.ArrayList;
+//import java.util.Map;
+//
+//
+//public class SparkAdCTRSampleLoader {
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String table = "alg_ad_view_sample";
+//        String hdfsPath = "/dw/recommend/model/ad_ctr_samples/" + partition;
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(30));
+//        readData.saveAsTextFile(hdfsPath);
+//    }
+//
+//
+//    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
+//        @Override
+//        public String call(Record record, TableSchema schema) throws Exception {
+//            String labelName = "adclick_ornot";
+//            String ret = singleParse(record, labelName);
+//            return ret;
+//        }
+//    }
+//
+//
+//    // Processing logic for a single log record
+//    public static String singleParse(Record record, String labelName) {
+//        // Parse the data
+//        String label = record.getString(labelName);
+//        if (label == null || label.equals("0")) {
+//            label = "0";
+//        } else {
+//            label = "1";
+//        }
+//
+//        // Initialize the feature objects from the SQL record
+//        AdRequestContext requestContext = AdSampleConstructor.constructRequestContext(record);
+//        UserAdFeature userFeature = AdSampleConstructor.constructUserFeature(record);
+//        AdItemFeature itemFeature = AdSampleConstructor.constructItemFeature(record);
+//
+//        // Convert to bytes features
+//        AdRequestContextBytesFeature adRequestContextBytesFeature = new AdRequestContextBytesFeature(requestContext);
+//        UserAdBytesFeature userBytesFeature = new UserAdBytesFeature(userFeature);
+//        AdItemBytesFeature adItemBytesFeature = new AdItemBytesFeature(itemFeature);
+//
+//        // Feature extraction
+//        VlogAdCtrLRFeatureExtractor bytesFeatureExtractor;
+//        bytesFeatureExtractor = new VlogAdCtrLRFeatureExtractor();
+//
+//        bytesFeatureExtractor.getUserFeatures(userBytesFeature);
+//        bytesFeatureExtractor.getItemFeature(adItemBytesFeature);
+//        bytesFeatureExtractor.getContextFeatures(adRequestContextBytesFeature);
+//        bytesFeatureExtractor.getCrossFeature(adItemBytesFeature, adRequestContextBytesFeature, userBytesFeature);
+//
+//        ListMultimap<FeatureGroup, BaseFeature> featureMap = bytesFeatureExtractor.getFeatures();
+//        return parseSamplesToString(label, featureMap);
+//    }
+//
+//    // Build the sample string
+//    public static String parseSamplesToString(String label, ListMultimap<FeatureGroup, BaseFeature> featureMap) {
+//        ArrayList<String> featureList = new ArrayList<String>();
+//        for (Map.Entry<FeatureGroup, BaseFeature> entry : featureMap.entries()) {
+//            FeatureGroup groupedFeature = entry.getKey();
+//            BaseFeature baseFeature = entry.getValue();
+//            Long featureIdentifier = baseFeature.getIdentifier();
+//            featureList.add(String.valueOf(featureIdentifier) + ":1");
+//        }
+//        return label + "\t" + String.join("\t", featureList);
+//    }
+//
+//}

+ 125 - 124
src/main/java/examples/sparksql/SparkAdFeaToRedisLoader.java

@@ -1,124 +1,125 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-
-
-import com.tzld.piaoquan.recommend.feature.domain.ad.base.*;
-import examples.dataloader.AdRedisFeatureConstructor;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
-import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
-import org.springframework.data.redis.core.RedisTemplate;
-import org.springframework.data.redis.serializer.StringRedisSerializer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-
-public class SparkAdFeaToRedisLoader {
-
-    private static final String userKeyFormat = "user:ad:%s";
-
-    private static final String adKeyFormat = "ad:%s";
-
-
-    public static RedisTemplate<String, String> buildRedisTemplate() {
-        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
-        rsc.setPort(6379);
-        rsc.setPassword("Wqsd@2019");
-        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
-        RedisTemplate<String, String> template = new RedisTemplate<>();
-        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
-        fac.afterPropertiesSet();
-        template.setDefaultSerializer(new StringRedisSerializer());
-        template.setConnectionFactory(fac);
-        template.afterPropertiesSet();
-        return template;
-    }
-
-
-    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
-        Map<String, String> redisFormat = new HashMap<String, String>();
-        String key = line.get(0);
-        String value = line.get(1);
-        redisFormat.put(key, value);
-        redisTemplate.opsForValue().multiSet(redisFormat);
-    }
-
-
-    static class RecordsToAdRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            AdItemFeature adItemFeature = AdRedisFeatureConstructor.constructItemFeature(record);
-            // The ad feature key is built from the creativeId
-            String key = String.format(adKeyFormat, adItemFeature.getCreativeId());
-            String value = adItemFeature.getValue();
-            List<String> kv = new ArrayList<String>();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            UserAdFeature userFeature = AdRedisFeatureConstructor.constructUserFeature(record);
-            List<String> kv = new ArrayList<String>();
-            String key = String.format(userKeyFormat, userFeature.getKey());
-            String value = userFeature.getValue();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String tableAdInfo = "alg_ad_item_info";
-        String tableUserInfo = "alg_ad_user_info";
-
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-
-        // load Ad features
-        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableAdInfo, partition, new RecordsToAdRedisKV(), Integer.valueOf(10));
-        readAdData.foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-
-
-        // load user features
-        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
-        readUserData.repartition(50).foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-    }
-
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.tzld.piaoquan.ad.engine.commons.base.AdItemFeature;
+//import com.tzld.piaoquan.ad.engine.commons.base.UserAdFeature;
+//import examples.dataloader.AdRedisFeatureConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+//import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+//import org.springframework.data.redis.core.RedisTemplate;
+//import org.springframework.data.redis.serializer.StringRedisSerializer;
+//
+//import java.io.IOException;
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//
+//public class SparkAdFeaToRedisLoader {
+//
+//    private static final String userKeyFormat = "user:ad:%s";
+//
+//    private static final String adKeyFormat = "ad:%s";
+//
+//
+//    public static RedisTemplate<String, String> buildRedisTemplate() {
+//        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
+//        rsc.setPort(6379);
+//        rsc.setPassword("Wqsd@2019");
+//        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
+//        RedisTemplate<String, String> template = new RedisTemplate<>();
+//        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
+//        fac.afterPropertiesSet();
+//        template.setDefaultSerializer(new StringRedisSerializer());
+//        template.setConnectionFactory(fac);
+//        template.afterPropertiesSet();
+//        return template;
+//    }
+//
+//
+//    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
+//        Map<String, String> redisFormat = new HashMap<String, String>();
+//        String key = line.get(0);
+//        String value = line.get(1);
+//        redisFormat.put(key, value);
+//        redisTemplate.opsForValue().multiSet(redisFormat);
+//    }
+//
+//
+//    static class RecordsToAdRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            AdItemFeature adItemFeature = AdRedisFeatureConstructor.constructItemFeature(record);
+//            // The ad feature key is built from the creativeId
+//            String key = String.format(adKeyFormat, adItemFeature.getCreativeId());
+//            String value = adItemFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            UserAdFeature userFeature = AdRedisFeatureConstructor.constructUserFeature(record);
+//            List<String> kv = new ArrayList<String>();
+//            String key = String.format(userKeyFormat, userFeature.getKey());
+//
+//            String value = userFeature.getValue();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String tableAdInfo = "alg_ad_item_info";
+//        String tableUserInfo = "alg_ad_user_info";
+//
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//
+//        // load Ad features
+//        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableAdInfo, partition, new RecordsToAdRedisKV(), Integer.valueOf(10));
+//        readAdData.foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//
+//
+//        // load user features
+//        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
+//        readUserData.repartition(50).foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//    }
+//
+//
+//}

+ 98 - 99
src/main/java/examples/sparksql/SparkShareRatioSampleLoader.java

@@ -1,99 +1,98 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-import com.google.common.collect.ListMultimap;
-import com.tzld.piaoquan.recommend.feature.domain.video.base.*;
-
-import examples.dataloader.RecommendSampleConstructor;
-import com.tzld.piaoquan.recommend.feature.domain.video.feature.VlogShareLRFeatureExtractor;
-import com.tzld.piaoquan.recommend.feature.model.sample.BaseFeature;
-import com.tzld.piaoquan.recommend.feature.model.sample.FeatureGroup;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-
-import java.util.ArrayList;
-import java.util.Map;
-
-
-public class SparkShareRatioSampleLoader {
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String table = "alg_recsys_view_sample";
-        String hdfsPath = "/dw/recommend/model/share_ratio_samples/" + partition;
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(50));
-        readData.saveAsTextFile(hdfsPath);
-    }
-
-
-    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
-        @Override
-        public String call(Record record, TableSchema schema) throws Exception {
-            String labelName = "share_ornot";
-            String ret = singleParse(record, labelName);
-            return ret;
-        }
-    }
-
-
-    // Processing logic for a single log record
-    public static String singleParse(Record record, String labelName) {
-        // Parse the data
-        String label = record.getString(labelName);
-        if (label == null || label.equals("1")) {
-            label = "0";
-        } else {
-            label = "1";
-        }
-
-        // Initialize the feature objects from the SQL record
-        RequestContext requestContext = RecommendSampleConstructor.constructRequestContext(record);
-        UserFeature userFeature = RecommendSampleConstructor.constructUserFeature(record);
-        ItemFeature itemFeature = RecommendSampleConstructor.constructItemFeature(record);
-
-        // Convert to bytes features
-        RequestContextBytesFeature requestContextBytesFeature = new RequestContextBytesFeature(requestContext);
-        UserBytesFeature userBytesFeature = new UserBytesFeature(userFeature);
-        VideoBytesFeature videoBytesFeature = new VideoBytesFeature(itemFeature);
-
-        // Feature extraction
-        VlogShareLRFeatureExtractor bytesFeatureExtractor;
-        bytesFeatureExtractor = new VlogShareLRFeatureExtractor();
-
-        bytesFeatureExtractor.getUserFeatures(userBytesFeature);
-        bytesFeatureExtractor.getItemFeature(videoBytesFeature);
-        bytesFeatureExtractor.getContextFeatures(requestContextBytesFeature);
-
-        ListMultimap<FeatureGroup, BaseFeature> featureMap = bytesFeatureExtractor.getFeatures();
-        return parseSamplesToString(label, featureMap);
-    }
-
-    // Build the sample string
-    public static String parseSamplesToString(String label, ListMultimap<FeatureGroup, BaseFeature> featureMap) {
-        ArrayList<String> featureList = new ArrayList<String>();
-        for (Map.Entry<FeatureGroup, BaseFeature> entry : featureMap.entries()) {
-            FeatureGroup groupedFeature = entry.getKey();
-            BaseFeature baseFeature = entry.getValue();
-            Long featureIdentifier = baseFeature.getIdentifier();
-            featureList.add(String.valueOf(featureIdentifier) + ":1");
-        }
-        return label + "\t" + String.join("\t", featureList);
-    }
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//import com.google.common.collect.ListMultimap;
+//import com.tzld.piaoquan.data.base.*;
+//import examples.dataloader.RecommendSampleConstructor;
+//import com.tzld.piaoquan.data.score.feature.VlogShareLRFeatureExtractor;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.BaseFeature;
+//import com.tzld.piaoquan.recommend.server.gen.recommend.FeatureGroup;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//
+//import java.util.ArrayList;
+//import java.util.Map;
+//
+//
+//public class SparkShareRatioSampleLoader {
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String table = "alg_recsys_view_sample";
+//        String hdfsPath = "/dw/recommend/model/share_ratio_samples/" + partition;
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//        JavaRDD<String> readData = odpsOps.readTableWithJava(project, table, partition, new RecordsToSamples(), Integer.valueOf(50));
+//        readData.saveAsTextFile(hdfsPath);
+//    }
+//
+//
+//    static class RecordsToSamples implements Function2<Record, TableSchema, String> {
+//        @Override
+//        public String call(Record record, TableSchema schema) throws Exception {
+//            String labelName = "share_ornot";
+//            String ret = singleParse(record, labelName);
+//            return ret;
+//        }
+//    }
+//
+//
+//    // Processing logic for a single log record
+//    public static String singleParse(Record record, String labelName) {
+//        // Parse the data
+//        String label = record.getString(labelName);
+//        if (label == null || label.equals("1")) {
+//            label = "0";
+//        } else {
+//            label = "1";
+//        }
+//
+//        // Initialize the feature objects from the SQL record
+//        RequestContext requestContext = RecommendSampleConstructor.constructRequestContext(record);
+//        UserFeature userFeature = RecommendSampleConstructor.constructUserFeature(record);
+//        ItemFeature itemFeature = RecommendSampleConstructor.constructItemFeature(record);
+//
+//        // Convert to bytes features
+//        RequestContextBytesFeature requestContextBytesFeature = new RequestContextBytesFeature(requestContext);
+//        UserBytesFeature userBytesFeature = new UserBytesFeature(userFeature);
+//        VideoBytesFeature videoBytesFeature = new VideoBytesFeature(itemFeature);
+//
+//        // Feature extraction
+//        VlogShareLRFeatureExtractor bytesFeatureExtractor;
+//        bytesFeatureExtractor = new VlogShareLRFeatureExtractor();
+//
+//        bytesFeatureExtractor.getUserFeatures(userBytesFeature);
+//        bytesFeatureExtractor.getItemFeature(videoBytesFeature);
+//        bytesFeatureExtractor.getContextFeatures(requestContextBytesFeature);
+//
+//        ListMultimap<FeatureGroup, BaseFeature> featureMap = bytesFeatureExtractor.getFeatures();
+//        return parseSamplesToString(label, featureMap);
+//    }
+//
+//    // Build the sample string
+//    public static String parseSamplesToString(String label, ListMultimap<FeatureGroup, BaseFeature> featureMap) {
+//        ArrayList<String> featureList = new ArrayList<String>();
+//        for (Map.Entry<FeatureGroup, BaseFeature> entry : featureMap.entries()) {
+//            FeatureGroup groupedFeature = entry.getKey();
+//            BaseFeature baseFeature = entry.getValue();
+//            Long featureIdentifier = baseFeature.getIdentifier();
+//            featureList.add(String.valueOf(featureIdentifier) + ":1");
+//        }
+//        return label + "\t" + String.join("\t", featureList);
+//    }
+//
+//}

+ 123 - 124
src/main/java/examples/sparksql/SparkVideoFeaToRedisLoader.java

@@ -1,124 +1,123 @@
-package examples.sparksql;
-
-import com.aliyun.odps.TableSchema;
-import com.aliyun.odps.data.Record;
-
-import com.tzld.piaoquan.recommend.feature.domain.video.base.ItemFeature;
-import com.tzld.piaoquan.recommend.feature.domain.video.base.UserFeature;
-import examples.dataloader.AdRedisFeatureConstructor;
-import examples.dataloader.RecommRedisFeatureConstructor;
-import org.apache.spark.SparkConf;
-import org.apache.spark.aliyun.odps.OdpsOps;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function2;
-import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
-import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
-import org.springframework.data.redis.core.RedisTemplate;
-import org.springframework.data.redis.serializer.StringRedisSerializer;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-
-public class SparkVideoFeaToRedisLoader {
-
-    private static final String userKeyFormat = "user:video:%s";
-
-    private static final String adKeyFormat = "video:%s";
-
-
-    public static RedisTemplate<String, String> buildRedisTemplate() {
-        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
-        rsc.setPort(6379);
-        rsc.setPassword("Wqsd@2019");
-        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
-        RedisTemplate<String, String> template = new RedisTemplate<>();
-        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
-        fac.afterPropertiesSet();
-        template.setDefaultSerializer(new StringRedisSerializer());
-        template.setConnectionFactory(fac);
-        template.afterPropertiesSet();
-        return template;
-    }
-
-
-    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
-        Map<String, String> redisFormat = new HashMap<String, String>();
-        String key = line.get(0);
-        String value = line.get(1);
-        redisFormat.put(key, value);
-        redisTemplate.opsForValue().multiSet(redisFormat);
-    }
-
-
-    static class RecordsToVideoRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            ItemFeature itemFeature = RecommRedisFeatureConstructor.constructItemFeature(record);
-            String key = String.format(adKeyFormat, itemFeature.getKey());
-            String value = itemFeature.getValue();
-            List<String> kv = new ArrayList<String>();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
-        @Override
-        public List<String> call(Record record, TableSchema schema) throws Exception {
-            UserFeature userFeature = RecommRedisFeatureConstructor.constructUserFeature(record);
-            String key = String.format(userKeyFormat, userFeature.getKey());
-            String value = userFeature.getValue();
-            List<String> kv = new ArrayList<String>();
-            kv.add(key);
-            kv.add(value);
-            return kv;
-        }
-    }
-
-
-    public static void main(String[] args) {
-
-        String partition = args[0];
-        String accessId = "LTAIWYUujJAm7CbH";
-        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
-        String odpsUrl = "http://service.odps.aliyun.com/api";
-        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
-        String project = "loghubods";
-        String tableItemInfo = "alg_recsys_video_info";
-        String tableUserInfo = "alg_recsys_user_info";
-
-
-        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
-        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
-        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
-        System.out.println("Read odps table...");
-
-
-        // load Ad features
-        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableItemInfo, partition, new RecordsToVideoRedisKV(), Integer.valueOf(10));
-        readAdData.sample(false, 0.0001).foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-
-
-        // load user features
-        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
-        readUserData.repartition(50).sample(false, 0.00001).foreachPartition(
-                rowIterator -> {
-                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
-                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
-                }
-        );
-    }
-
-
-}
+//package examples.sparksql;
+//
+//import com.aliyun.odps.TableSchema;
+//import com.aliyun.odps.data.Record;
+//
+//import com.tzld.piaoquan.recommend.feature.domain.video.base.ItemFeature;
+//import com.tzld.piaoquan.recommend.feature.domain.video.base.UserFeature;
+//import examples.dataloader.AdRedisFeatureConstructor;
+//import examples.dataloader.RecommRedisFeatureConstructor;
+//import org.apache.spark.SparkConf;
+//import org.apache.spark.aliyun.odps.OdpsOps;
+//import org.apache.spark.api.java.JavaRDD;
+//import org.apache.spark.api.java.JavaSparkContext;
+//import org.apache.spark.api.java.function.Function2;
+//import org.springframework.data.redis.connection.RedisStandaloneConfiguration;
+//import org.springframework.data.redis.connection.jedis.JedisConnectionFactory;
+//import org.springframework.data.redis.core.RedisTemplate;
+//import org.springframework.data.redis.serializer.StringRedisSerializer;
+//
+//import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//
+//
+//public class SparkVideoFeaToRedisLoader {
+//
+//    private static final String userKeyFormat = "user:video:%s";
+//
+//    private static final String adKeyFormat = "video:%s";
+//
+//
+//    public static RedisTemplate<String, String> buildRedisTemplate() {
+//        RedisStandaloneConfiguration rsc = new RedisStandaloneConfiguration();
+//        rsc.setPort(6379);
+//        rsc.setPassword("Wqsd@2019");
+//        rsc.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com");
+//        RedisTemplate<String, String> template = new RedisTemplate<>();
+//        JedisConnectionFactory fac = new JedisConnectionFactory(rsc);
+//        fac.afterPropertiesSet();
+//        template.setDefaultSerializer(new StringRedisSerializer());
+//        template.setConnectionFactory(fac);
+//        template.afterPropertiesSet();
+//        return template;
+//    }
+//
+//
+//    public static void loadFeatureToRedis(RedisTemplate<String, String> redisTemplate, List<String> line) {
+//        Map<String, String> redisFormat = new HashMap<String, String>();
+//        String key = line.get(0);
+//        String value = line.get(1);
+//        redisFormat.put(key, value);
+//        redisTemplate.opsForValue().multiSet(redisFormat);
+//    }
+//
+//
+//    static class RecordsToVideoRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            ItemFeature itemFeature = RecommRedisFeatureConstructor.constructItemFeature(record);
+//            String key = String.format(adKeyFormat, itemFeature.getKey());
+//            String value = itemFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    static class RecordsToUserRedisKV implements Function2<Record, TableSchema, List<String>> {
+//        @Override
+//        public List<String> call(Record record, TableSchema schema) throws Exception {
+//            UserFeature userFeature = RecommRedisFeatureConstructor.constructUserFeature(record);
+//            String key = String.format(userKeyFormat, userFeature.getKey());
+//            String value = userFeature.getValue();
+//            List<String> kv = new ArrayList<String>();
+//            kv.add(key);
+//            kv.add(value);
+//            return kv;
+//        }
+//    }
+//
+//
+//    public static void main(String[] args) {
+//
+//        String partition = args[0];
+//        String accessId = "LTAIWYUujJAm7CbH";
+//        String accessKey = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P";
+//        String odpsUrl = "http://service.odps.aliyun.com/api";
+//        String tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com";
+//        String project = "loghubods";
+//        String tableItemInfo = "alg_recsys_video_info";
+//        String tableUserInfo = "alg_recsys_user_info";
+//
+//        SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
+//        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+//        OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
+//        System.out.println("Read odps table...");
+//
+//
+//        // load Ad features
+//        JavaRDD<List<String>> readAdData = odpsOps.readTableWithJava(project, tableItemInfo, partition, new RecordsToVideoRedisKV(), Integer.valueOf(10));
+//        readAdData.sample(false, 0.0001).foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//
+//
+//        // load user features
+//        JavaRDD<List<String>> readUserData = odpsOps.readTableWithJava(project, tableUserInfo, partition, new RecordsToUserRedisKV(), Integer.valueOf(50));
+//        readUserData.repartition(50).sample(false, 0.00001).foreachPartition(
+//                rowIterator -> {
+//                    RedisTemplate<String, String> redisTemplate = buildRedisTemplate();
+//                    rowIterator.forEachRemaining(line -> loadFeatureToRedis(redisTemplate, line));
+//                }
+//        );
+//    }
+//
+//
+//}

+ 274 - 0
src/main/resources/20240608_feature_name.txt

@@ -0,0 +1,274 @@
+b123_1h_STR
+b123_1h_log(share)
+b123_1h_ROV
+b123_1h_log(return)
+b123_1h_ROV*log(return)
+b123_2h_STR
+b123_2h_log(share)
+b123_2h_ROV
+b123_2h_log(return)
+b123_2h_ROV*log(return)
+b123_3h_STR
+b123_3h_log(share)
+b123_3h_ROV
+b123_3h_log(return)
+b123_3h_ROV*log(return)
+b123_4h_STR
+b123_4h_log(share)
+b123_4h_ROV
+b123_4h_log(return)
+b123_4h_ROV*log(return)
+b123_12h_STR
+b123_12h_log(share)
+b123_12h_ROV
+b123_12h_log(return)
+b123_12h_ROV*log(return)
+b123_1d_STR
+b123_1d_log(share)
+b123_1d_ROV
+b123_1d_log(return)
+b123_1d_ROV*log(return)
+b123_3d_STR
+b123_3d_log(share)
+b123_3d_ROV
+b123_3d_log(return)
+b123_3d_ROV*log(return)
+b123_7d_STR
+b123_7d_log(share)
+b123_7d_ROV
+b123_7d_log(return)
+b123_7d_ROV*log(return)
+b167_1h_STR
+b167_1h_log(share)
+b167_1h_ROV
+b167_1h_log(return)
+b167_1h_ROV*log(return)
+b167_2h_STR
+b167_2h_log(share)
+b167_2h_ROV
+b167_2h_log(return)
+b167_2h_ROV*log(return)
+b167_3h_STR
+b167_3h_log(share)
+b167_3h_ROV
+b167_3h_log(return)
+b167_3h_ROV*log(return)
+b167_4h_STR
+b167_4h_log(share)
+b167_4h_ROV
+b167_4h_log(return)
+b167_4h_ROV*log(return)
+b167_12h_STR
+b167_12h_log(share)
+b167_12h_ROV
+b167_12h_log(return)
+b167_12h_ROV*log(return)
+b167_1d_STR
+b167_1d_log(share)
+b167_1d_ROV
+b167_1d_log(return)
+b167_1d_ROV*log(return)
+b167_3d_STR
+b167_3d_log(share)
+b167_3d_ROV
+b167_3d_log(return)
+b167_3d_ROV*log(return)
+b167_7d_STR
+b167_7d_log(share)
+b167_7d_ROV
+b167_7d_log(return)
+b167_7d_ROV*log(return)
+b8910_1h_STR
+b8910_1h_log(share)
+b8910_1h_ROV
+b8910_1h_log(return)
+b8910_1h_ROV*log(return)
+b8910_2h_STR
+b8910_2h_log(share)
+b8910_2h_ROV
+b8910_2h_log(return)
+b8910_2h_ROV*log(return)
+b8910_3h_STR
+b8910_3h_log(share)
+b8910_3h_ROV
+b8910_3h_log(return)
+b8910_3h_ROV*log(return)
+b8910_4h_STR
+b8910_4h_log(share)
+b8910_4h_ROV
+b8910_4h_log(return)
+b8910_4h_ROV*log(return)
+b8910_12h_STR
+b8910_12h_log(share)
+b8910_12h_ROV
+b8910_12h_log(return)
+b8910_12h_ROV*log(return)
+b8910_1d_STR
+b8910_1d_log(share)
+b8910_1d_ROV
+b8910_1d_log(return)
+b8910_1d_ROV*log(return)
+b8910_3d_STR
+b8910_3d_log(share)
+b8910_3d_ROV
+b8910_3d_log(return)
+b8910_3d_ROV*log(return)
+b8910_7d_STR
+b8910_7d_log(share)
+b8910_7d_ROV
+b8910_7d_log(return)
+b8910_7d_ROV*log(return)
+b111213_1h_STR
+b111213_1h_log(share)
+b111213_1h_ROV
+b111213_1h_log(return)
+b111213_1h_ROV*log(return)
+b111213_2h_STR
+b111213_2h_log(share)
+b111213_2h_ROV
+b111213_2h_log(return)
+b111213_2h_ROV*log(return)
+b111213_3h_STR
+b111213_3h_log(share)
+b111213_3h_ROV
+b111213_3h_log(return)
+b111213_3h_ROV*log(return)
+b111213_4h_STR
+b111213_4h_log(share)
+b111213_4h_ROV
+b111213_4h_log(return)
+b111213_4h_ROV*log(return)
+b111213_12h_STR
+b111213_12h_log(share)
+b111213_12h_ROV
+b111213_12h_log(return)
+b111213_12h_ROV*log(return)
+b111213_1d_STR
+b111213_1d_log(share)
+b111213_1d_ROV
+b111213_1d_log(return)
+b111213_1d_ROV*log(return)
+b111213_3d_STR
+b111213_3d_log(share)
+b111213_3d_ROV
+b111213_3d_log(return)
+b111213_3d_ROV*log(return)
+b111213_7d_STR
+b111213_7d_log(share)
+b111213_7d_ROV
+b111213_7d_log(return)
+b111213_7d_ROV*log(return)
+b171819_1h_STR
+b171819_1h_log(share)
+b171819_1h_ROV
+b171819_1h_log(return)
+b171819_1h_ROV*log(return)
+b171819_2h_STR
+b171819_2h_log(share)
+b171819_2h_ROV
+b171819_2h_log(return)
+b171819_2h_ROV*log(return)
+b171819_3h_STR
+b171819_3h_log(share)
+b171819_3h_ROV
+b171819_3h_log(return)
+b171819_3h_ROV*log(return)
+b171819_4h_STR
+b171819_4h_log(share)
+b171819_4h_ROV
+b171819_4h_log(return)
+b171819_4h_ROV*log(return)
+b171819_12h_STR
+b171819_12h_log(share)
+b171819_12h_ROV
+b171819_12h_log(return)
+b171819_12h_ROV*log(return)
+b171819_1d_STR
+b171819_1d_log(share)
+b171819_1d_ROV
+b171819_1d_log(return)
+b171819_1d_ROV*log(return)
+b171819_3d_STR
+b171819_3d_log(share)
+b171819_3d_ROV
+b171819_3d_log(return)
+b171819_3d_ROV*log(return)
+b171819_7d_STR
+b171819_7d_log(share)
+b171819_7d_ROV
+b171819_7d_log(return)
+b171819_7d_ROV*log(return)
+total_time
+bit_rate
+playcnt_6h
+playcnt_1d
+playcnt_3d
+playcnt_7d
+share_pv_12h
+share_pv_1d
+share_pv_3d
+share_pv_7d
+return_uv_12h
+return_uv_1d
+return_uv_3d
+return_uv_7d
+c3_feature_tags_1d_matchnum
+c3_feature_tags_1d_maxscore
+c3_feature_tags_1d_avgscore
+c3_feature_tags_3d_matchnum
+c3_feature_tags_3d_maxscore
+c3_feature_tags_3d_avgscore
+c3_feature_tags_7d_matchnum
+c3_feature_tags_7d_maxscore
+c3_feature_tags_7d_avgscore
+c4_feature_tags_1d_matchnum
+c4_feature_tags_1d_maxscore
+c4_feature_tags_1d_avgscore
+c4_feature_tags_3d_matchnum
+c4_feature_tags_3d_maxscore
+c4_feature_tags_3d_avgscore
+c4_feature_tags_7d_matchnum
+c4_feature_tags_7d_maxscore
+c4_feature_tags_7d_avgscore
+c5_feature_tags_1d_matchnum
+c5_feature_tags_1d_maxscore
+c5_feature_tags_1d_avgscore
+c5_feature_tags_3d_matchnum
+c5_feature_tags_3d_maxscore
+c5_feature_tags_3d_avgscore
+c5_feature_tags_7d_matchnum
+c5_feature_tags_7d_maxscore
+c5_feature_tags_7d_avgscore
+c6_feature_tags_1d_matchnum
+c6_feature_tags_1d_maxscore
+c6_feature_tags_1d_avgscore
+c6_feature_tags_3d_matchnum
+c6_feature_tags_3d_maxscore
+c6_feature_tags_3d_avgscore
+c6_feature_tags_7d_matchnum
+c6_feature_tags_7d_maxscore
+c6_feature_tags_7d_avgscore
+c7_feature_tags_1d_matchnum
+c7_feature_tags_1d_maxscore
+c7_feature_tags_1d_avgscore
+c7_feature_tags_3d_matchnum
+c7_feature_tags_3d_maxscore
+c7_feature_tags_3d_avgscore
+c7_feature_tags_7d_matchnum
+c7_feature_tags_7d_maxscore
+c7_feature_tags_7d_avgscore
+c8_feature_share_score
+c8_feature_share_num
+c8_feature_share_rank
+c8_feature_return_score
+c8_feature_return_num
+c8_feature_return_rank
+c9_feature_share_score
+c9_feature_share_num
+c9_feature_share_rank
+c9_feature_return_score
+c9_feature_return_num
+c9_feature_return_rank
+d1_exp
+d1_return_n
+d1_rovn
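
The 274 names above define a fixed feature order (20240609_bucket_274.txt below appears to share the same count). A minimal sketch, assuming the file ships as a classpath resource with one feature name per non-empty line, of loading it into a name-to-index map (the helper itself is an assumption, not code from this commit):

    import scala.io.Source

    object FeatureNameLoader {
      // Each non-empty line of the resource is one feature name; its index is its position in the file.
      def load(resource: String = "/20240608_feature_name.txt"): Map[String, Int] = {
        val src = Source.fromInputStream(getClass.getResourceAsStream(resource), "UTF-8")
        try src.getLines().map(_.trim).filter(_.nonEmpty).zipWithIndex.toMap
        finally src.close()
      }
    }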

File diff suppressed because it is too large
+ 0 - 0
src/main/resources/20240609_bucket_274.txt


File diff suppressed because it is too large
+ 2 - 0
src/main/resources/20240609_bucket_274_old.txt


File diff suppressed because it is too large
+ 6 - 0
src/main/resources/20240622_ad_bucket_249.txt


+ 249 - 0
src/main/resources/20240622_ad_feature_name.txt

@@ -0,0 +1,249 @@
+cpa
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_ecpm
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_ecpm
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_ecpm
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_ecpm
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_ecpm
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_ecpm
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_ecpm
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_ecpm
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_ecpm
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_ecpm
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_ecpm
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_ecpm
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_ecpm
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_ecpm
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_ecpm
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_ecpm
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_ecpm
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_ecpm
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_ecpm
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_ecpm
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_ecpm
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_ecpm
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_ecpm
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_ecpm
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_ecpm
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_ecpm
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_ecpm
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_ecpm
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_ecpm
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_ecpm
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_ecpm
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_ecpm
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_ecpm
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_ecpm
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+ecpm_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_3h_ecpm
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_6h_ecpm
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_12h_ecpm
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_1d_ecpm
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_3d_ecpm
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+d1_feature_7d_ecpm
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+vid_rank_ecpm_1d
+vid_rank_ecpm_3d
+vid_rank_ecpm_7d
+vid_rank_ecpm_14d

+ 125 - 0
src/main/scala/com/aliyun/odps/spark/examples/ana/ana_01_cidvidpk.scala

@@ -0,0 +1,125 @@
+package com.aliyun.odps.spark.examples.ana
+
+
+import com.alibaba.fastjson.JSONObject
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.{HashMap, Map}
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+/*
+   Any feature that cannot be obtained falls back to the default value 0.
+ */
+
+object ana_01_cidvidpk {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "ad_engine_statistics_log_per5min")
+    val beginStr = param.getOrElse("beginStr", "2024060208")
+    val endStr = param.getOrElse("endStr", "2024060223")
+    val vidSelect = param.getOrElse("vidSelect", "")
+    val cidsSelect = param.getOrElse("cidsSelect", "").split(",").toSet
+    val apptypeSelect = param.getOrElse("apptype", "")
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    val partitions = new ArrayBuffer[String]()
+    for (dt_hh <- timeRange) {
+      for (mi <- List (
+        "0000", "0500", "1000", "1500", "2000", "2500",
+        "3000", "3500", "4000", "4500", "5000", "5500"
+      )){
+        val partition = dt_hh + mi
+        println("partition:" + partition)
+        partitions.add(partition)
+      }
+    }
+    val rdds = partitions.map(p => {
+      odpsOps.readTable(project = project,
+        table = table,
+        partition = partitionPrefix + p,
+        transfer = func,
+        numPartition = tablePart)
+    }).reduce((r1, r2) => r1.union(r2))
+
+    val data = rdds.map(record=>{
+      val vid = if (record.isNull("videoid")) "" else record.getString("videoid")
+      val recalls = if (record.isNull("creativelist")) "" else record.getString("creativelist")
+      val ranks = if (record.isNull("scoreresult")) "" else record.getString("scoreresult")
+      val apptype = if (record.isNull("apptype")) "" else record.getString("apptype")
+      val abcode = if (record.isNull("adabgroup")) "" else record.getString("adabgroup")
+      (apptype, abcode, vid, recalls, ranks)
+    }).filter(r => r._1.equals(apptypeSelect) && !r._3.equals("") && !r._4.equals("") && !r._5.equals(""))
+      .filter(r=> r._3.equals(vidSelect)) // keep only the selected vid
+      .map{
+        case (apptype, abcode, vid, recalls, ranks) =>
+          val recalls_json = JSON.parseArray(recalls).map(r=>{
+            val j = JSON.parseObject(r.toString)
+            j.getOrElse("creativeId", 0).toString
+          }).filter(!_.equals("0")).toSet
+          val ranks_json = JSON.parseArray(ranks).map(r => {
+            val j = JSON.parseObject(r.toString)
+            val adId = j.getOrElse("adId", 0).toString
+            val score = j.getOrElse("score", 0.0)
+            (adId, score.toString.toDouble)
+          })
+          var rankId = ranks_json.get(0)._1
+          var score = ranks_json.get(0)._2
+//          for (i <- 1 until ranks_json.size){
+//            val item = ranks_json.get(i)
+//            if (item._2 > score){
+//              rankId = item._1
+//              score = item._2
+//            }
+//          }
+          (apptype, abcode, vid, recalls_json, rankId)
+      }.flatMap({
+        case (apptype, abcode, vid, recalls_json, rankId) =>
+          recalls_json.map(recallId=> {
+            (apptype, abcode, vid, recallId, rankId, recalls_json)
+          })
+      }).filter(r=> cidsSelect.contains(r._4)) // keep only the selected cids
+      .map({
+        case (apptype, abcode, vid, recallId, rankId, recalls_json) =>
+          val x1 = 1
+          val x2 = if (recallId.equals(rankId)) 1 else 0
+          val x3 = if (cidsSelect.subsetOf(recalls_json)) 1 else 0
+          val x4 = if (cidsSelect.subsetOf(recalls_json) && cidsSelect.contains(rankId)) 1 else 0
+          val x5 = if (cidsSelect.subsetOf(recalls_json) && recallId.equals(rankId)) 1 else 0
+          ((apptype, abcode, vid, recallId), (x1, x2, x3, x4, x5))
+      }).aggregateByKey(
+        (0, 0, 0, 0, 0)
+      )(
+        seqOp = (runningSum, x) => (runningSum._1 + x._1, runningSum._2 + x._2, runningSum._3 + x._3, runningSum._4 + x._4, runningSum._5 + x._5),
+        combOp = (sum1, sum2) => (sum1._1 + sum2._1, sum1._2 + sum2._2, sum1._3 + sum2._3, sum1._4 + sum2._4, sum1._5 + sum2._5)
+      )
+
+    data.collect().foreach(r => println("结果\t" + r._1.productIterator.mkString("\t") + "\t" + r._2.productIterator.mkString("\t")))
+
+  }
+
+
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
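
For reference, the five counters aggregated at the end of ana_01_cidvidpk.scala read as: x1 counts every time a selected cid appears in the recall list, x2 adds 1 when that cid also won the rank, x3 when all selected cids were recalled together, x4 when they were recalled together and one of them won, and x5 when they were recalled together and this cid won. A self-contained sketch with made-up IDs (inferred from the code above, not part of the commit):

    object PkCounterExample {
      def main(args: Array[String]): Unit = {
        val cidsSelect = Set("101", "102")        // hypothetical cids being compared
        val recalls    = Set("101", "102", "103") // creatives recalled for one request
        val rankId     = "102"                    // creative that won the rank
        for (recallId <- recalls if cidsSelect.contains(recallId)) {
          val x1 = 1
          val x2 = if (recallId == rankId) 1 else 0
          val x3 = if (cidsSelect.subsetOf(recalls)) 1 else 0
          val x4 = if (cidsSelect.subsetOf(recalls) && cidsSelect.contains(rankId)) 1 else 0
          val x5 = if (cidsSelect.subsetOf(recalls) && recallId == rankId) 1 else 0
          println(s"$recallId -> ($x1,$x2,$x3,$x4,$x5)") // 101 -> (1,0,1,1,0), 102 -> (1,1,1,1,1)
        }
      }
    }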

+ 79 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_01_readtable2hdfs.scala

@@ -0,0 +1,79 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import org.apache.spark.sql.SparkSession
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.dataloader.RequestContextOffline
+import examples.dataloader.OfflineVlogShareLRFeatureExtractor
+import org.apache.hadoop.io.compress.GzipCodec
+
+import scala.collection.JavaConversions._
+
+object makedata_01_readtable2hdfs {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "")
+    // /dw/recommend/model/share_ratio_samples/
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val table = "alg_recsys_view_sample"
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        println("数据写入完成:" + hdfsPath)
+        println("数据量:" + odpsData.count())
+      }else{
+        println("路径不合法, 无法写入:" + hdfsPath)
+      }
+
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): String = {
+    singleParse(record)
+  }
+
+  def singleParse(record: Record): String = {
+    //1 Build the label
+    val label: String = record.getString("share_ornot")
+    val newLabel = if ("1".equals(label)) "0" else "1"
+    //2 Build the features
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    reqContext.putItemFeature(record)
+    reqContext.putSceneFeature(record)
+    val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractor()
+    bytesFeatureExtractor.makeFeature(reqContext.featureMap)
+    val featureMap = bytesFeatureExtractor.featureMap
+    newLabel + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+  }
+
+
+}
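
Each line that makedata_01_readtable2hdfs writes to HDFS is the label followed by tab-separated "identifier:1" terms. A minimal reader for spot-checking that output (the helper is hypothetical, not part of the commit):

    object SampleLineParser {
      // "1\tfeat_a:1\tfeat_b:1"  ->  ("1", Set("feat_a", "feat_b"))
      def parse(line: String): (String, Set[String]) = {
        val parts = line.split("\t")
        (parts.head, parts.tail.map(_.split(":")(0)).toSet)
      }
    }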

+ 249 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_02_writeredis.scala

@@ -0,0 +1,249 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import com.google.gson.GsonBuilder
+import examples.dataloader.RequestContextOffline
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.text.SimpleDateFormat
+import java.util.concurrent.TimeUnit
+import java.util
+import scala.collection.JavaConversions._
+
+
+object makedata_02_writeredis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "2").toInt
+    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
+    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val savePathVideo = param.getOrDefault("savePathVideo", "")
+    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
+    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
+//    val userSampleIDsPathFix = param.getOrDefault("userSampleIDsPathFix", "")
+    //  /dw/recommend/model/feature/
+
+
+    // 2 Read from ODPS
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableUser = "alg_recsys_user_info"
+    val tableItem = "alg_recsys_video_info"
+    val userRedisKeyPrefix = "user_info_4video_"
+    val videoRedisKeyPrefix = "video_info_"
+
+
+
+    // 3 User-side feature processing
+    if (ifUser){
+      println("user特征处理")
+      var userData = odpsOps.readTable(project = project, table = tableUser, partition = partition, transfer = handleUser, numPartition = tablePart)
+        .filter {
+          case (mid, fea, feaSize) =>
+            mid.nonEmpty && fea.nonEmpty && feaSize > 0
+        }
+      if (userSampleIDs.nonEmpty){
+        val IDs = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
+        userData = userData.filter(r => IDs.contains(r._1.hashCode % 10))
+      }
+      if (ifDebug){
+        println("user特征处理-debug开启-只保留5条数据-特征数量大于1")
+        val userDataTake = userData.take(5)
+        userDataTake.foreach(r=> println(r._1 + "\t" + r._2 + "\t" + r._3))
+        userData = sc.parallelize(userDataTake)
+      }
+      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+        var savePathPart = savePathUser + "/" + partition
+        if (userSampleIDs.nonEmpty) {
+          savePathPart = savePathPart + "_" + userSampleIDs
+        }
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("user.action.count=" + userData.count())
+    } else {
+      println("不处理user")
+    }
+
+    if (ifDeleteRedisUser){
+      println("user redis 删除")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      println("读取数据路径:" + savePathPart)
+      val userDataRead = sc.textFile(savePathPart)
+      val userDataRead2 = userDataRead.filter(_.split("\t").length >= 2).map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+      println("预计删除数据量:" + userDataRead2.count())
+      val userDataTakeRddRun = userDataRead2.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.delete(redisFormat.map(_._1))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.delete(redisFormat.map(_._1))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("delete redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis删除")
+    }
+
+    if (ifWriteRedisUser){
+      println("user redis 写入")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+        .sample(false, sampleRate)
+        .map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+      val userDataTakeRddRun = userDataRead.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.opsForValue.multiSet(redisFormat)
+              redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.opsForValue.multiSet(redisFormat)
+        redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("put in redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis写入")
+    }
+
+
+
+
+    // 4 Video-side feature processing
+    if (ifVideo){
+      println("video特征处理")
+      val handleItemFunction: (Record, TableSchema) => Tuple3[String, String, Int] = handleItem(_, _, date)
+      var itemData = odpsOps.readTable(project = project, table = tableItem, partition = partition, transfer = handleItemFunction, numPartition = tablePart)
+      if (ifDebug) {
+        println("video特征处理-debug开启-只保留5条数据-特征数量大于1")
+        val itemDataTake = itemData.filter(_._3 > 1).take(5)
+        itemDataTake.foreach(r => println(r._1 + "\t" + r._2 + "\t" + r._3))
+        itemData = sc.parallelize(itemDataTake)
+      }
+      val itemDataTakeRddRun = itemData.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        row.foreach {
+          case (key, value, _) =>
+            if (key.nonEmpty && value != null && value.nonEmpty) {
+              redisFormat.put(videoRedisKeyPrefix + key, value)
+              if (ifWriteRedis) {
+                redisTemplate.opsForValue.set(videoRedisKeyPrefix + key, value, 24 * expireDay, TimeUnit.HOURS)
+              }
+            }
+        }
+//        if (ifWriteRedis){
+//          redisTemplate.opsForValue.multiSet(redisFormat)
+//          redisFormat.keySet.foreach(key => redisTemplate.expire(key, 24 * expireDay, TimeUnit.HOURS))
+//        }
+        redisFormat.iterator
+      })
+      if (savePathVideo.nonEmpty && savePathVideo.startsWith("/dw/recommend/model/")){
+        val savePathPart = savePathVideo + "/" + partition
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        itemDataTakeRddRun.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("item.action.count=" + itemDataTakeRddRun.count())
+    }else{
+      println("不处理video")
+    }
+  }
+
+  def handleUser(record: Record, schema: TableSchema): Tuple3[String, String, Int] = {
+    val userKey = "mids"
+    val mid = record.getString(userKey)
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    // reqContext.featureMap.put("mid", mid)
+    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
+    val value = gson.toJson(reqContext.featureMap)
+    (mid, value, reqContext.featureMap.size())
+  }
+
+  def handleItem(record: Record, schema: TableSchema, date:String): Tuple3[String, String, Int] = {
+    val videoKey = "videoid"
+    val videoid = record.getBigint(videoKey).toString
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+
+    //--------- TODO: some features are missing from the table; temporary fix ---------
+//    val i_title_len =  if (record.getString("title") != null) record.getString("title").length.toString else ""
+//    val i_days_since_upload = if (record.getDatetime("gmt_create") != null){
+//      val format = new SimpleDateFormat("yyyyMMdd")
+//      val dateOld = format.format(record.getDatetime("gmt_create"))
+//      val dayDiff = MyDateUtils.calculateDateDifference(dateOld, date)
+//      dayDiff.toString
+//    }else{
+//      ""
+//    }
+//    if (i_title_len.nonEmpty){
+//      val d = reqContext.bucketRatioFeature(i_title_len.toDouble)
+//      reqContext.featureMap.put("i_title_len", d.toString)
+//    }
+//    if (i_days_since_upload.nonEmpty) {
+//      val d = reqContext.bucketRatioFeature(i_days_since_upload.toDouble)
+//      reqContext.featureMap.put("i_days_since_upload", d.toString)
+//    }
+    //------ temporary fix ends ---------
+
+    reqContext.putItemFeature(record)
+    reqContext.featureMap.put("videoid", videoid)
+
+    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
+    val value = gson.toJson(reqContext.featureMap)
+    (videoid, value, reqContext.featureMap.size())
+  }
+
+}
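
The user and video writers in makedata_02_writeredis share one pattern: buffer key/value pairs inside each partition and flush every 1,000 entries with multiSet, then set a per-key expiry. A generic sketch of that pattern against the same Spring Data Redis API (the helper name and signature are assumptions):

    import java.util
    import java.util.concurrent.TimeUnit
    import org.springframework.data.redis.core.RedisTemplate
    import scala.collection.JavaConversions._

    object RedisBatchWriter {
      // Flush in chunks so a partition never holds more than `batch` pending keys in memory.
      def writeAll(redis: RedisTemplate[String, String],
                   kvs: Iterator[(String, String)],
                   expireHours: Int,
                   batch: Int = 1000): Unit = {
        val buf = new util.HashMap[String, String]()
        def flush(): Unit = if (!buf.isEmpty) {
          redis.opsForValue.multiSet(buf)
          buf.keySet.foreach(k => redis.expire(k, expireHours, TimeUnit.HOURS))
          buf.clear()
        }
        kvs.foreach { case (k, v) => buf.put(k, v); if (buf.size() >= batch) flush() }
        flush()
      }
    }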

+ 74 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_03_deleteredis.scala

@@ -0,0 +1,74 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import examples.dataloader.RecommRedisFeatureConstructor
+import org.apache.spark.aliyun.odps.OdpsOps
+import org.apache.spark.sql.SparkSession
+import com.aliyun.odps.spark.examples.myUtils.{ParamUtils, env}
+
+import java.util
+import scala.collection.JavaConversions._
+
+
+object makedata_03_deleteredis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+
+    // Read from ODPS
+    val accessKeyId = "LTAIWYUujJAm7CbH"
+    val accessKeySecret = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
+    val odpsUrl = "http://service.odps.aliyun.com/api"
+    val tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com"
+
+    val project = "loghubods"
+    val tableItem = "alg_recsys_video_info"
+    val tableUser = "alg_recsys_user_info"
+    val partition = "dt=20231220"
+
+    val odpsOps = OdpsOps(sc, accessKeyId, accessKeySecret, odpsUrl, tunnelUrl)
+
+    //User-side feature processing
+    val userData = odpsOps.readTable(project = project, table = tableUser, partition = partition, transfer = handleUser, numPartition = 100)
+    val userDataTake = userData.take(10)
+    userDataTake.foreach(r=>{
+      println(r.get(0) + "\t" + r.get(1))
+    })
+
+    val userDataTakeRddRun = userData.mapPartitions(row=>{
+      val redisTemplate = env.getRedisTemplate()
+      val redisFormat = new util.HashMap[String, String]
+      row.foreach(r =>{
+        val key = r.get(0)
+        val value = r.get(1)
+        redisFormat.put(key, value)
+        if (redisTemplate.hasKey(key)){
+          redisTemplate.delete(key)
+        }
+      })
+      // redisTemplate.delete(redisFormat.keySet().toList)
+      redisFormat.iterator
+    })
+    println("delete.user.action.count="+userDataTakeRddRun.count())
+
+
+  }
+
+  def handleUser(record: Record, schema: TableSchema): util.ArrayList[String] = {
+    val feature = RecommRedisFeatureConstructor.constructUserFeature(record)
+    val key = String.format("user_info_%s", feature.getUid)
+    val value = feature.getValue
+    val kv = new util.ArrayList[String](2)
+    kv.add(key)
+    kv.add(value)
+    kv
+  }
+
+
+
+}

+ 85 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev1.scala

@@ -0,0 +1,85 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractor, RequestContextOffline}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+
+object makedata_04_rosHdfsFromTablev1 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/ros_sample/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .filter{
+          case record =>
+            val not_share: String = record.getString("share_ornot")
+            "0".equals(not_share)
+        }
+        .map{
+          case record =>
+            singleParse(record)
+      }
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        println("写入数据量:" + odpsData.count())
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def singleParse(record: Record): String = {
+    //1 Build the label
+    val label: String = record.getString("return_ornot")
+    val newLabel = if ("1".equals(label)) "0" else "1"
+    //2 Build the features
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    reqContext.putItemFeature(record)
+    reqContext.putSceneFeature(record)
+    val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractor()
+    bytesFeatureExtractor.makeFeature(reqContext.featureMap)
+    val featureMap = bytesFeatureExtractor.featureMap
+    newLabel + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+  }
+}

+ 106 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_04_rosHdfsFromTablev2.scala

@@ -0,0 +1,106 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractor, RequestContextOffline}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+object makedata_04_rosHdfsFromTablev2 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/ros_sample_v2/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .filter{
+          case record =>
+            val not_share: String = record.getString("share_ornot")
+            "0".equals(not_share)
+        }
+        .flatMap(record =>{
+          val res = ArrayBuffer[(Record, String)]()
+          val hour = record.getString("ctx_hour").toInt
+          hour match {
+            case 23 => res
+            case _ =>
+              res.add((record, "0"))
+              val label_return = record.getString("return_ornot")
+              val expTs = record.getString("view_logtimestamp").toLong / 1000
+              if ("0".equals(label_return)) {
+                if (!record.isNull("machinecode_clienttimestamp")) {
+                  record.getString("machinecode_clienttimestamp").split(",")
+                    .map(r => r.split(":")(1).toLong / 1000)
+                    .foreach(ts=>{
+                      if (ts - expTs < 3600){
+                        res.add((record, "1"))
+                      }
+                    })
+                }
+              }
+              res
+          }
+        })
+        .map{
+          case (record, label) =>
+            singleParse(record, label)
+      }
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        println("写入数据量:" + odpsData.count())
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def singleParse(record: Record, label: String): String = {
+    //2 Build the features
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    reqContext.putItemFeature(record)
+    reqContext.putSceneFeature(record)
+    val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractor()
+    bytesFeatureExtractor.makeFeature(reqContext.featureMap)
+    val featureMap = bytesFeatureExtractor.featureMap
+    label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+  }
+}
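
The flatMap in makedata_04_rosHdfsFromTablev2 drops rows with ctx_hour 23, emits one negative ("0") per remaining view, and adds one positive ("1") for every return whose client timestamp lands within 3,600 seconds of the exposure timestamp. A small worked sketch of that labeling rule with made-up timestamps:

    object RosLabelExample {
      // One negative per exposure, plus one positive per return inside the hour after it.
      def labels(expTsSec: Long, returnTsSec: Seq[Long]): Seq[String] =
        "0" +: returnTsSec.filter(ts => ts - expTsSec < 3600).map(_ => "1")

      def main(args: Array[String]): Unit = {
        // exposure at t=1000s; returns at t=1500s (kept) and t=9000s (outside the hour, dropped)
        println(labels(1000L, Seq(1500L, 9000L))) // List(0, 1)
      }
    }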

+ 43 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_05_sampleStatic.scala

@@ -0,0 +1,43 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, ParamUtils, env}
+import examples.dataloader.RecommRedisFeatureConstructor
+import org.apache.spark.aliyun.odps.OdpsOps
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+
+
+object makedata_05_sampleStatic {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val path = param.getOrElse("path", "")
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      val hdfsPath = path + "/" + partition
+      println("数据路径:" + hdfsPath)
+      val data = sc.textFile(hdfsPath).map(r =>{
+        (r.split("\t")(0), 1)
+      }).reduceByKey{
+        case (a, b) => a + b
+      }
+      data.collect().foreach(r=> println(r._1 + "\t" + r._2))
+    }
+
+  }
+
+}

+ 257 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData.scala

@@ -0,0 +1,257 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorUserFeature
+import examples.extractor.RankExtractorItemFeature
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+import java.util.{Arrays, HashMap, HashSet, Map}
+import com.alibaba.fastjson.JSONObject
+
+/*
+   Note: for every constructed feature, a raw value of 0.0 is treated as meaningless and is not kept; if 0.0 is produced by a change transform (e.g. cntFeatureChange), it is kept.
+ */
+
+object makedata_06_originData {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "32").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/00_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample_v2")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val originFeatureName = Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+
+          val itemRealtimeFeatureMap = getFeatureFromSet(Set(
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          ), record).map(r => {
+            val m = new java.util.HashMap[String, Double]()
+            r._2.split(",").foreach(r => {
+              m.put(r.split(":")(0), r.split(":")(1).toDouble)
+            })
+            (r._1, m)
+          })
+          val javaMap = new HashMap[String, Map[String, java.lang.Double]]()
+          itemRealtimeFeatureMap.foreach { case (key, value) =>
+            val javaValue = new HashMap[String, java.lang.Double]()
+            value.foreach { case (innerKey, innerValue) =>
+              javaValue.put(innerKey, innerValue.asInstanceOf[java.lang.Double])
+            }
+            javaMap.put(key, javaValue)
+          }
+
+          val f1 = getFeatureFromSet(Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "title", "tags"
+          ), record)
+          val f2 = RankExtractorUserFeature.getUserRateFeature(originFeatureMap)
+          val f3 = RankExtractorUserFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+              "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"))
+          )
+          val f4 = RankExtractorItemFeature.getItemRateFeature(originFeatureMap)
+          val f5 = RankExtractorItemFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+              "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt")))
+          val f6 = RankExtractorItemFeature.getItemRealtimeTrend(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", ""))
+          val f7 = RankExtractorItemFeature.getItemRealtimeCnt(javaMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )),
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val f8 = RankExtractorItemFeature.getItemRealtimeRate(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+
+          // 1: aggregate all feature maps into one map
+          val result = new util.HashMap[String, String]()
+          result ++= f1
+          result ++= f2
+          result ++= f3
+          result ++= f4
+          result ++= f5
+          result ++= f6
+          result ++= f7
+          result ++= f8
+          val names = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+
+
+          )
+          val resultNew = new JSONObject
+          names.foreach(r => {
+            if (result.containsKey(r)){
+              resultNew.put(r, result.get(r))
+            }
+          })
+          //2: aggregate the labels into a map
+          val labels = Set(
+            "is_share", "is_return", "playtime",
+            "is_play",
+            "share_ts", "share_ts_list", "return_mid_ts_list"
+          )
+          val labelNew = new JSONObject
+          val labelMap = getFeatureFromSet(labels, record)
+          labels.foreach(r => {
+            if (labelMap.containsKey(r)) {
+              labelNew.put(r, labelMap.get(r).get)
+            }
+          })
+          //3: build the unique key for this record
+          val mid = record.getString("mid")
+          val videoid = record.getString("videoid")
+          val logtimestamp = record.getString("logtimestamp")
+          val sessionid = record.getString("sessionid")
+
+          val logKey = (mid, videoid, logtimestamp, sessionid).productIterator.mkString(":")
+          val labelKey = labelNew.toString()
+          val featureKey = resultNew.toString()
+
+          logKey + "\t" + labelKey + "\t" + featureKey
+        })
+
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def getFeatureFromSet(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r =>{
+      if (!record.isNull(r)){
+        try{
+          result.put(r, record.getString(r))
+        }catch {
+          case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+}
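
Every line that makedata_06_originData writes has three tab-separated fields: a colon-joined key (mid:videoid:logtimestamp:sessionid), a JSON object of labels, and a JSON object of features. A minimal sketch, using the same fastjson dependency, of splitting a line back apart (the reader itself is hypothetical):

    import com.alibaba.fastjson.{JSON, JSONObject}

    object OriginSampleReader {
      // line = logKey + "\t" + labelJson + "\t" + featureJson
      def parse(line: String): (Array[String], JSONObject, JSONObject) = {
        val Array(logKey, labelJson, featureJson) = line.split("\t", 3)
        (logKey.split(":"), JSON.parseObject(labelJson), JSON.parseObject(featureJson))
      }
    }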

+ 260 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData_v3.scala

@@ -0,0 +1,260 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.{RankExtractorItemFeature, RankExtractorUserFeature}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.{HashMap, Map}
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+
+/*
+   Note: for every constructed feature, a raw value of 0.0 is treated as meaningless and is not kept; if 0.0 is produced by a change transform (e.g. cntFeatureChange), it is kept.
+   => Any feature that cannot be obtained defaults to 0.
+ */
+
+object makedata_06_originData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "32").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/00_sample_data_v3/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample_v3")
+
+
+    // 2 Read ODPS and table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce data for each date in the range
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val originFeatureName = Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            "video_recommend"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+
+          val itemRealtimeFeatureMap = getFeatureFromSet(Set(
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          ), record).map(r => {
+            val m = new java.util.HashMap[String, Double]()
+            r._2.split(",").foreach(r => {
+              m.put(r.split(":")(0), r.split(":")(1).toDouble)
+            })
+            (r._1, m)
+          })
+          val javaMap = new HashMap[String, Map[String, java.lang.Double]]()
+          itemRealtimeFeatureMap.foreach { case (key, value) =>
+            val javaValue = new HashMap[String, java.lang.Double]()
+            value.foreach { case (innerKey, innerValue) =>
+              javaValue.put(innerKey, innerValue.asInstanceOf[java.lang.Double])
+            }
+            javaMap.put(key, javaValue)
+          }
+
+          val f1 = getFeatureFromSet(Set(
+            "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "title", "tags", "video_recommend"
+          ), record)
+          val f2 = RankExtractorUserFeature.getUserRateFeature(originFeatureMap)
+          val f3 = RankExtractorUserFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+              "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"))
+          )
+          val f4 = RankExtractorItemFeature.getItemRateFeature(originFeatureMap)
+          val f5 = RankExtractorItemFeature.cntFeatureChange(originFeatureMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+              "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt")))
+          val f6 = RankExtractorItemFeature.getItemRealtimeTrend(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", ""))
+          val f7 = RankExtractorItemFeature.getItemRealtimeCnt(javaMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )),
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val f8 = RankExtractorItemFeature.getItemRealtimeRate(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+
+          // 1: aggregate all extracted feature maps into one map
+          val result = new util.HashMap[String, String]()
+          result ++= f1
+          result ++= f2
+          result ++= f3
+          result ++= f4
+          result ++= f5
+          result ++= f6
+          result ++= f7
+          result ++= f8
+          val names = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+
+            "title", "tags", "total_time", "play_count_total", "video_recommend",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+
+
+          )
+          val resultNew = new JSONObject
+          names.foreach(r => {
+            if (result.containsKey(r)){
+              resultNew.put(r, result.get(r))
+            }
+          })
+          // 2: aggregate the labels into a map
+          val labels = Set(
+            "pagesource", "recommend_page_type", "pagesource_change",
+            "abcode",
+            "is_play", "playtime",
+            "is_share", "share_cnt_pv", "share_ts_list",
+            "is_return", "return_cnt_pv", "return_cnt_uv", "return_mid_ts_list"
+          )
+          val labelNew = new JSONObject
+          val labelMap = getFeatureFromSet(labels, record)
+          labels.foreach(r => {
+            if (labelMap.containsKey(r)) {
+              labelNew.put(r, labelMap.get(r).get)
+            }
+          })
+          // 3: build the unique key for this record
+          val mid = record.getString("mid")
+          val videoid = record.getString("videoid")
+          val logtimestamp = record.getString("logtimestamp")
+          val apptype = record.getString("apptype")
+
+          val logKey = (mid, videoid, logtimestamp, apptype).productIterator.mkString(":")
+          val labelKey = labelNew.toString()
+          val featureKey = resultNew.toString()
+
+          logKey + "\t" + labelKey + "\t" + featureKey
+        })
+
+
+      // 4 save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def getFeatureFromSet(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r =>{
+      if (!record.isNull(r)){
+        try{
+          result.put(r, record.getString(r))
+        } catch {
+          case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+}
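
Each row this job writes is three tab-separated fields: a log key of the form mid:videoid:logtimestamp:apptype, a label JSON, and a feature JSON. A REPL-style sketch of splitting one such row back apart, the same way the makedata_07_* jobs below do (the sample values are invented; fastjson is assumed on the classpath, as in the jobs themselves):

    import com.alibaba.fastjson.JSON

    // One output line: logKey \t labelJson \t featureJson (values below are made up)
    val line = "mid123:vid456:1718000000000:3\t{\"is_share\":\"1\"}\t{\"ctx_hour\":\"12\"}"
    val Array(logKey, labelStr, feaStr) = line.split("\t")
    val logTs = logKey.split(":")(2)                    // exposure timestamp (ms) from the key
    val labelJson = JSON.parseObject(labelStr)          // labels such as is_share / is_return
    val feaJson = JSON.parseObject(feaStr)              // selected features, keyed by name
    println(s"ts=$logTs share=${labelJson.getString("is_share")} hour=${feaJson.getString("ctx_hour")}")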

+ 243 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_rosData.scala

@@ -0,0 +1,243 @@
+package com.aliyun.odps.spark.examples.makedata
+
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractorV1, OfflineVlogShareLRFeatureExtractorV2}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import com.alibaba.fastjson.JSON
+import com.alibaba.fastjson.JSONObject
+
+
+object makedata_07_rosData {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/00_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/04_ros_data/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+
+
+    // 3 loop over the date range and produce data
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      // 4 filter: keep only samples with a share
+      val dataFilter = sc.textFile(hdfsPath).map(r=>{
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val logTs = logKeyStr.split(":")(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val feaJson = JSON.parseObject(feaStr)
+        val is_share = if (labelJson.containsKey("is_share")) labelJson.getString("is_share") else "0"
+        (logTs, feaJson, labelJson, is_share)
+      }).filter(_._4.equals("1"))
+
+      // 5 label processing
+      val dataTrain = labelVersion match {
+        case "v2" => dataFilter.flatMap({
+          case (logTs, feaJson, labelJson, _) =>
+            val res = ArrayBuffer[(String, JSONObject)]()
+            val hour = feaJson.getString("ctx_hour").toInt
+            val expTs = logTs.toLong / 1000
+            hour match {
+              case 23 => res
+              case _ =>
+                res.add(("0", feaJson))
+                val is_return = if (labelJson.containsKey("is_return")) labelJson.getString("is_return") else "0"
+                if ("1".equals(is_return)) {
+                  if (labelJson.containsKey("return_mid_ts_list")){
+                    labelJson.getString("return_mid_ts_list").split(",")
+                      .map(r => r.split(":")(1).toLong / 1000)
+                      .foreach(ts => {
+                        if (ts - expTs < 3600) {
+                          res.add(("1", feaJson))
+                        }
+                      })
+                  }
+                }
+                res
+            }
+        })
+        case _ => dataFilter.map({
+          case (logTs, feaJson, labelJson, _) =>
+            val is_return = if (labelJson.containsKey("is_return")) labelJson.getString("is_return") else "0"
+            (is_return, feaJson)
+        })
+      }
+      // 6 feature selection
+      val data = dataTrain.map{
+        case (is_return, feaJson) =>
+          if ("v1".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV1()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+
+          } else if ("v2".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+              //            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+              //            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+              //            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+              //            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+              "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+              "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+          } else if ("v4".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+              "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+              "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+          } else if ("v5".equals(featureVersion)) {
+            val feaSet = Set(
+              "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+              "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+              "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+              "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+              "total_time", "play_count_total",
+              "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+              "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+              "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+              "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+              "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+              "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )
+            val feaMap = new util.HashMap[String, String]()
+            feaSet.foreach(r => {
+              if (feaJson.containsKey(r)) {
+                feaMap.put(r, feaJson.getString(r))
+              }
+            })
+            val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+            bytesFeatureExtractor.makeFeature4String(feaMap)
+            val featureMap = bytesFeatureExtractor.featureMap
+            (is_return, featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t"))
+          } else {
+            (is_return, "")
+          }
+      }.filter(_._2.nonEmpty).map(r=> r._1 + "\t" + r._2)
+
+      // 7 save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
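
With labelVersion v2 above, a share sample contributes one negative copy plus one positive copy for every entry in return_mid_ts_list whose return lands within one hour of the exposure timestamp, while hour-23 exposures are skipped. A REPL-style sketch of just that windowing rule, with invented timestamps:

    // return_mid_ts_list format: "mid:timestamp_ms,mid:timestamp_ms,..."
    val expTs = 1718000000L                                  // exposure time, seconds
    val returnMidTsList = "abc:1718001800000,def:1718009000000"
    val positives = returnMidTsList.split(",")
      .map(_.split(":")(1).toLong / 1000)                    // return time, seconds
      .count(ts => ts - expTs < 3600)                        // inside the 1-hour window
    println(s"positive copies emitted: $positives")          // 1 for these values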

+ 202 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_07_strData.scala

@@ -0,0 +1,202 @@
+package com.aliyun.odps.spark.examples.makedata
+
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import com.alibaba.fastjson.JSON
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV2
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV1
+
+
+object makedata_07_strData {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/00_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/01_str_data/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "100").toInt
+
+
+
+    // 3 loop over the date range and produce data
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+      val data = sc.textFile(hdfsPath).map(r=>{
+        val rList = r.split("\t")
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val label = if (labelJson.containsKey("is_share")) labelJson.getString("is_share") else "0"
+        val feaJson = JSON.parseObject(feaStr)
+
+
+        if ("v1".equals(featureVersion)){
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV1()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+        }else if ("v2".equals(featureVersion)){
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+//            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+//            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+//            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+//            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+        }else if ("v4".equals(featureVersion)){
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+        } else if ("v5".equals(featureVersion)) {
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+        } else {
+          // unknown featureVersion: emit an empty line and filter it out below
+          ""
+        }
+      }).filter(_.nonEmpty)
+      // 4 save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+
+
+}
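
Both makedata_07 jobs serialize a training sample as the label, a tab, then the extractor's one-hot feature identifiers written as id:1 and joined by tabs. A REPL-style sketch of reading such a line back (the identifiers below are made up, not real extractor hashes):

    // Training line layout: label \t featureId:1 \t featureId:1 ...
    val line = "1\t1001:1\t2047:1\t77:1"
    val parts = line.split("\t")
    val label = parts.head.toInt
    val featureIds = parts.tail.map(_.split(":")(0).toLong)  // every value is the constant indicator 1
    println(s"label=$label, active features=${featureIds.mkString(",")}")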

+ 140 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_08_item2redis.scala

@@ -0,0 +1,140 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.Date
+import java.util.concurrent.TimeUnit
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+
+
+object makedata_08_item2redis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "2").toInt
+    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
+    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val savePathVideo = param.getOrDefault("savePathVideo", "")
+    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
+    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
+
+
+    // 2 read from the ODPS tables
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableItem = "alg_recsys_video_info"
+    val videoRedisKeyPrefix = "video_info_"
+
+    // 4 video-side feature processing
+    if (ifVideo){
+      println("video特征处理")
+      val itemData = odpsOps.readTable(project = project, table = tableItem, partition = partition, transfer = func, numPartition = tablePart)
+
+      val itemDataTakeRddRun = itemData.map(record =>{
+        val originFeatureName = Set(
+          "gmt_create", "existence_days",
+          "title", "tags", "total_time", "play_count_total",
+          "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+          "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+          "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+          "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt"
+        )
+//        val myList: List[(String, String)] = List(("value1", "value2"), ("value3", "value4"))
+        val originFeatureMap = getFeatureFromRecord(originFeatureName, record)
+        val videoid = record.getBigint("videoid").toString
+        val resultNew = new JSONObject
+        originFeatureName.foreach(r => {
+          if (originFeatureMap.containsKey(r)) {
+            val v = originFeatureMap.get(r).get
+            resultNew.put(r, v)
+          }
+        })
+        (videoid, resultNew.toString())
+      }).mapPartitions(row => {
+          val redisFormat = new util.HashMap[String, String]
+          val redisFormatSave = new util.HashMap[String, String]
+          val redisTemplate = env.getRedisTemplate()
+          var i = 1
+          row.foreach {
+            case (key, value) =>
+              if (key.nonEmpty && value != null && value.nonEmpty) {
+                redisFormat.put(videoRedisKeyPrefix + key, value)
+                redisFormatSave.put(videoRedisKeyPrefix + key, value)
+              }
+              if (i % 1000 == 0 && ifWriteRedis) {
+                redisTemplate.opsForValue.multiSet(redisFormat)
+                redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+                redisFormat.clear()
+              }
+              i = i + 1
+          }
+          if (ifWriteRedis){
+            redisTemplate.opsForValue.multiSet(redisFormat)
+            redisFormat.keySet.foreach(key => redisTemplate.expire(key, 24 * expireDay, TimeUnit.HOURS))
+            redisFormat.clear()
+          }
+          redisFormatSave.iterator
+      })
+      if (savePathVideo.nonEmpty && savePathVideo.startsWith("/dw/recommend/model/")){
+        val savePathPart = savePathVideo + "/" + partition
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        itemDataTakeRddRun.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("item写入成功:item.action.count=" + itemDataTakeRddRun.count())
+    }else{
+      println("不处理video")
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def getFeatureFromRecord(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r => {
+      if (!record.isNull(r)) {
+        val obj = record.get(r)
+        if (obj.isInstanceOf[String]){
+          result.put(r, record.getString(r))
+        } else if (obj.isInstanceOf[java.lang.Long]) { // ODPS BIGINT values are java.lang.Long; scala BigInt never matched here
+          result.put(r, String.valueOf(record.getBigint(r)))
+        } else if (obj.isInstanceOf[Double]) {
+          result.put(r, String.valueOf(record.getDouble(r)))
+        } else if (obj.isInstanceOf[Date]) {
+          result.put(r, String.valueOf(record.getDatetime(r)))
+        } else {
+          try {
+            result.put(r, record.getString(r))
+          } catch {
+            case _ => result.put(r, String.valueOf(record.getBigint(r)))
+            case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+
+}
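
The Redis write here, and in the user jobs below, repeats one pattern inside mapPartitions: buffer key/value pairs, flush them with multiSet every 1000 rows, give each flushed key a TTL of 24 * expireDay hours, and flush the remainder once the partition is exhausted. A minimal sketch of that flush step; flushBatch is a hypothetical helper, and typing the template as a Spring RedisTemplate[String, String] is an assumption (only the multiSet/expire call names are taken from the code above):

    import java.util
    import java.util.concurrent.TimeUnit
    import org.springframework.data.redis.core.RedisTemplate
    import scala.collection.JavaConversions._

    // Flush one buffered batch to Redis and attach the expiry, as the mapPartitions blocks above do.
    def flushBatch(redisTemplate: RedisTemplate[String, String],
                   batch: util.HashMap[String, String],
                   expireDay: Int): Unit = {
      if (!batch.isEmpty) {
        redisTemplate.opsForValue.multiSet(batch)
        batch.keySet.foreach(k => redisTemplate.expire(k, 24 * expireDay, TimeUnit.HOURS))
        batch.clear()
      }
    }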

+ 220 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis.scala

@@ -0,0 +1,220 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import com.google.gson.GsonBuilder
+import examples.dataloader.RequestContextOffline
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import com.aliyun.odps.spark.examples.makedata.makedata_06_originData.getFeatureFromSet
+import com.alibaba.fastjson.JSONObject
+
+import java.util
+import java.util.concurrent.TimeUnit
+import scala.collection.JavaConversions._
+
+
+object makedata_09_user2redis {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifVideo = param.getOrDefault("ifVideo", "False").toBoolean
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "2").toInt
+    val ifDebug = param.getOrDefault("ifDebug", "False").toBoolean
+    val ifDeleteRedisUser = param.getOrDefault("ifDeleteRedisUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val ifWriteRedis = param.getOrDefault("ifWriteRedis", "True").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val savePathVideo = param.getOrDefault("savePathVideo", "")
+    val userSampleIDs = param.getOrDefault("userSampleIDs", "")
+    val sampleRate = param.getOrDefault("sampleRate", "1.0").toDouble
+    val midDays = param.getOrDefault("midDays", "3").toInt
+
+
+    // 2 read from the ODPS tables
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableUser = "alg_recsys_user_info"
+    val userRedisKeyPrefix = "user_info_4video_"
+
+
+
+    // 3-1 user-side feature processing
+    if (ifUser){
+      println("user特征处理")
+
+
+      var userData = odpsOps.readTable(project = project, table = tableUser, partition = partition,
+        transfer = func, numPartition = tablePart)
+        .map(record =>{
+          val userKey = "mids"
+          val mid = record.getString(userKey)
+          val originFeatureName = Set(
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+//            "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+          val resultNew = new JSONObject
+          originFeatureName.foreach(r => {
+            if (originFeatureMap.containsKey(r)) {
+              val v = originFeatureMap.get(r).get
+              resultNew.put(r, v)
+            }
+          })
+          (mid, resultNew.toString())
+        })
+//      userData = userData.join(midRdd.map(r=> (r, 1))).map(r=> (r._1, r._2._1))
+
+      if (userSampleIDs.nonEmpty){
+        val IDs = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
+        userData = userData.filter(r => IDs.contains(Math.floorMod(r._1.hashCode, 10))) // floorMod keeps the bucket non-negative even for negative hashCodes
+      }
+      if (ifDebug){
+        println("user特征处理-debug开启-只保留5条数据-特征数量大于1")
+        val userDataTake = userData.take(5)
+        userDataTake.foreach(r=> println(r._1 + "\t" + r._2))
+        userData = sc.parallelize(userDataTake)
+      }
+      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+        var savePathPart = savePathUser + "/" + partition
+        if (userSampleIDs.nonEmpty) {
+          savePathPart = savePathPart + "_" + userSampleIDs
+        }
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+      println("user.action.count=" + userData.count())
+    } else {
+      println("不处理user")
+    }
+
+    if (ifDeleteRedisUser){
+      println("user redis 删除")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      println("读取数据路径:" + savePathPart)
+      val userDataRead = sc.textFile(savePathPart)
+      val userDataRead2 = userDataRead.filter(_.split("\t").length >= 2).map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+      println("预计删除数据量:" + userDataRead2.count())
+      val userDataTakeRddRun = userDataRead2.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.delete(redisFormat.map(_._1))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.delete(redisFormat.map(_._1))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("delete redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis删除")
+    }
+
+    if (ifWriteRedisUser){
+
+      // 3-2 collect the mids with play behavior in the last midDays days
+      var midRdd = sc.emptyRDD[String]
+      MyDateUtils.getDateRange(MyDateUtils.getNumDaysBefore(date, midDays), date).foreach(d => {
+        println("-----------读取播放信息:" + d)
+        val partitionMid = "dt=" + d
+        val data = odpsOps.readTable(project = "loghubods", table = "play_action_log",
+            partition = partitionMid, transfer = func, numPartition = tablePart)
+          .map(r => {
+            if (r.isNull("machinecode")) "" else r.getString("machinecode")
+          }).filter(_.nonEmpty)
+        midRdd = midRdd.union(data).distinct()
+      })
+      println("------------mid处理完毕:" + midRdd.count() + "------------------")
+
+
+      println("user redis 写入")
+      var savePathPart = savePathUser + "/" + partition
+      if (userSampleIDs.nonEmpty) {
+        savePathPart = savePathPart + "_" + userSampleIDs
+      }
+      val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+        .sample(false, sampleRate)
+        .map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      })
+        .join(midRdd.map(r=> (r, 1))).map(r=> (r._1, r._2._1))
+
+      val userDataTakeRddRun = userDataRead.mapPartitions(row => {
+        val redisFormat = new util.HashMap[String, String]
+        val redisTemplate = env.getRedisTemplate()
+        var i = 1
+        row.foreach {
+          case (key, value) =>
+            if (key.nonEmpty) {
+              redisFormat.put(userRedisKeyPrefix + key, value)
+            }
+            if (i % 1000 == 0) {
+              redisTemplate.opsForValue.multiSet(redisFormat)
+              redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+              redisFormat.clear()
+            }
+            i = i + 1
+        }
+        redisTemplate.opsForValue.multiSet(redisFormat)
+        redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+        redisFormat.clear()
+        redisFormat.iterator
+      })
+      println("user写入成功:put in redis.count=" + userDataTakeRddRun.count())
+    } else {
+      println("不处理user的redis写入")
+    }
+
+
+
+  }
+
+  def handleUser(record: Record, schema: TableSchema): Tuple3[String, String, Int] = {
+    val userKey = "mids"
+    val mid = record.getString(userKey)
+    val reqContext: RequestContextOffline = new RequestContextOffline()
+    reqContext.putUserFeature(record)
+    // reqContext.featureMap.put("mid", mid)
+    val gson = (new GsonBuilder).serializeSpecialFloatingPointValues.create
+    val value = gson.toJson(reqContext.featureMap)
+    (mid, value, reqContext.featureMap.size())
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
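
userSampleIDs above samples users by hash bucket: a mid is kept when its (non-negative) hashCode modulo 10 appears in the comma-separated list. A REPL-style sketch with invented mids:

    // userSampleIDs such as "0,3,7" keeps only the listed buckets.
    val userSampleIDs = "0,3,7"
    val ids = userSampleIDs.split(",").filter(_.nonEmpty).map(_.toInt).toList
    val mids = Seq("mid_a", "mid_b", "mid_c")                 // hypothetical mids
    val kept = mids.filter(m => ids.contains(Math.floorMod(m.hashCode, 10)))
    println(kept.mkString(","))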

+ 167 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis_freq.scala

@@ -0,0 +1,167 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.makedata.makedata_06_originData.getFeatureFromSet
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import com.google.gson.GsonBuilder
+import examples.dataloader.RequestContextOffline
+import org.apache.commons.lang.time.DateUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.concurrent.TimeUnit
+import scala.collection.JavaConversions._
+
+
+object makedata_09_user2redis_freq {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+    MyHdfsUtils.delete_hdfs_path("/dw/recommend/model/99_zhangbo_checkpoint/")
+    sc.setCheckpointDir("/dw/recommend/model/99_zhangbo_checkpoint/")
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val date = param.getOrDefault("date", "20231220")
+    val expireDay = param.getOrDefault("expireDay", "3").toInt
+    val ifUser = param.getOrDefault("ifUser", "False").toBoolean
+    val ifWriteRedisUser = param.getOrDefault("ifWriteRedisUser", "False").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrDefault("savePathUser", "")
+    val midDays = param.getOrDefault("midDays", "7").toInt
+    val redisLimit = param.getOrDefault("redisLimit", "100000000").toLong
+
+    // 2 read from the ODPS tables
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableUser = "alg_recsys_user_info"
+    val userRedisKeyPrefix = "user_info_4video_"
+
+
+
+    if (ifUser){
+      // 3 feature processing
+      println("user特征处理")
+      val userData = odpsOps.readTable(project = project, table = tableUser, partition = partition,
+          transfer = func, numPartition = tablePart)
+        .map(record => {
+          val mid = record.getString("mids")
+          val originFeatureName = Set(
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+            "machineinfo_system", "machineinfo_wechatversion",
+            //"gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+          val resultNew = new JSONObject
+          originFeatureName.foreach(r => {
+            if (originFeatureMap.containsKey(r)) {
+              val v = originFeatureMap(r)
+              resultNew.put(r, v)
+            }
+          })
+          (mid, resultNew.toString())
+        })
+      // 3 save the raw feature file
+      if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+        val savePathPart = savePathUser + "/all/" + partition
+        MyHdfsUtils.delete_hdfs_path(savePathPart)
+        userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+      }
+    }
+
+
+    // 4 recent-user statistics
+    val dateEarly = MyDateUtils.getNumDaysBefore(date, 0)
+    val midRdd = odpsOps.readTable(project = "loghubods", table = "mid_uid",
+        partition = "dt=" + dateEarly, transfer = func, numPartition = tablePart)
+      .map(r => {
+        val mid = if (r.isNull("mid")) "" else r.getString("mid")
+        val actionTs = if (r.isNull("user_last_action_time")) "" else r.getString("user_last_action_time")
+        (mid, actionTs)
+      }).filter(r => r._1.nonEmpty && r._2.nonEmpty)
+      .reduceByKey((a, b) => Math.max(a.toLong, b.toLong).toString)
+      .filter(r => DateUtils.parseDate(date, Array[String]("yyyyMMdd")).getTime / 1000 - r._2.toLong / 1000 < 3600 * 24 * midDays)
+    println("------------mid处理完毕,近期保留的用户有:" + midRdd.count() + "------------------")
+    // 5 split users by recent activity
+    val savePathPart = savePathUser + "/all/" + partition
+    val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      }).join(midRdd).map(r => (r._1, r._2._1))
+    userDataRead.checkpoint()
+//      .leftOuterJoin(midRdd).map {
+//        case (mid, (fea, Some(_))) =>
+//          (mid, fea, true)
+//        case (mid, (fea, None)) =>
+//          (mid, fea, false)
+//      }
+    val userDataReadTrue = userDataRead.map(r => r._1 + "\t" + r._2)
+    // val userDataReadFalse = userDataRead.filter(!_._3).map(r => r._1 + "\t" + r._2)
+    if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+      val p1 = savePathUser + "/true/" + partition
+      MyHdfsUtils.delete_hdfs_path(p1)
+      userDataReadTrue.saveAsTextFile(p1, classOf[GzipCodec])
+      //val p2 = savePathUser + "/false/" + partition
+      //MyHdfsUtils.delete_hdfs_path(p2)
+      //userDataReadFalse.saveAsTextFile(p2, classOf[GzipCodec])
+    }
+
+    //6 redis
+    if (ifWriteRedisUser) {
+      println("开始处理redis写入")
+      val p1 = savePathUser + "/true/" + partition
+      val userDataRead = sc.textFile(p1).filter(_.split("\t").length >= 2)
+        .map(r => {
+          val rList = r.split("\t")
+          (rList(0), rList(1))
+        })
+      val count = userDataRead.count()
+      println("待写入数据有:" + count)
+      if (count > redisLimit) {
+        println(s"数据量超过${redisLimit},不执行写入。")
+      } else {
+        val userDataTakeRddRun = userDataRead.mapPartitions(row => {
+          val redisFormat = new util.HashMap[String, String]
+          val redisTemplate = env.getRedisTemplate()
+          var i = 1
+          row.foreach {
+            case (key, value) =>
+              if (key.nonEmpty) {
+                redisFormat.put(userRedisKeyPrefix + key, value)
+              }
+              if (i % 1000 == 0) {
+                redisTemplate.opsForValue.multiSet(redisFormat)
+                redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+                redisFormat.clear()
+              }
+              i = i + 1
+          }
+          redisTemplate.opsForValue.multiSet(redisFormat)
+          redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+          redisFormat.clear()
+          redisFormat.iterator
+        })
+        println("user写入成功:put in redis.count=" + userDataTakeRddRun.count())
+      }
+    }
+  }
+
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
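
The freshness filter above keeps a mid only when its last action time is within midDays of the partition date. A REPL-style check of that comparison with invented values (DateUtils is the commons-lang class the job already imports):

    import org.apache.commons.lang.time.DateUtils

    val date = "20240601"                                     // partition date, yyyyMMdd
    val lastActionMs = 1717000000000L                         // hypothetical user_last_action_time (ms)
    val midDays = 7
    val dateSec = DateUtils.parseDate(date, Array[String]("yyyyMMdd")).getTime / 1000
    val keep = dateSec - lastActionMs / 1000 < 3600L * 24 * midDays
    println(keep)                                             // true: the last action is only days old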

+ 244 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_10_originData_v3.scala

@@ -0,0 +1,244 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.{RankExtractorItemFeatureV2, RankExtractorUserFeatureV2}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.{HashMap, Map}
+import scala.collection.JavaConversions._
+import scala.collection.mutable
+
+/*
+   Any feature that cannot be obtained falls back to a default value of 0.
+ */
+
+object makedata_10_originData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "32").toInt
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/10_sample_data_v3/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_view_sample_v3")
+
+
+    // 2 read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 loop over the date range and produce data
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          // 1 fetch all raw features into a hashmap; if a value is null in the table, the key is absent from the map.
+          val originFeatureName = Set(
+            "apptype","mid","uid","videoid","logtimestamp","ctx_day","ctx_week","ctx_hour","clientip","ctx_region",
+            "ctx_city","pagesource","recommend_page_type","pagesource_change","abcode",
+            // ----------
+            "playtime","is_play","share_cnt_pv","is_share","share_ts_list","return_cnt_pv","return_cnt_uv","return_mid_ts_list","is_return",
+            // ----------
+
+            // ----------
+            "gender","machineinfo_brand","machineinfo_model","machineinfo_platform","machineinfo_sdkversion","machineinfo_system","machineinfo_wechatversion","gmt_create_user",
+            "u_1day_exp_cnt","u_1day_click_cnt","u_1day_share_cnt","u_1day_return_cnt",
+            "u_3day_exp_cnt","u_3day_click_cnt","u_3day_share_cnt","u_3day_return_cnt",
+            "u_7day_exp_cnt","u_7day_click_cnt","u_7day_share_cnt","u_7day_return_cnt",
+            "u_3month_exp_cnt","u_3month_click_cnt","u_3month_share_cnt","u_3month_return_cnt",
+            // ----------
+            "title","distrubute_title","gmt_create_video","tags","existence_days","total_time","play_count","play_count_total","video_recommend",
+            "i_1day_exp_cnt","i_1day_click_cnt","i_1day_share_cnt","i_1day_return_cnt",
+            "i_3day_exp_cnt","i_3day_click_cnt","i_3day_share_cnt","i_3day_return_cnt",
+            "i_7day_exp_cnt","i_7day_click_cnt","i_7day_share_cnt","i_7day_return_cnt",
+            "i_3month_exp_cnt","i_3month_click_cnt","i_3month_share_cnt","i_3month_return_cnt"
+          )
+          val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+          // 2 compute the day-level rate features.
+          val f2 = RankExtractorUserFeatureV2.getUserRateFeature(originFeatureMap)
+          val f4 = RankExtractorItemFeatureV2.getItemRateFeature(originFeatureMap)
+          // 3 compute the item realtime features: parse the serialized lists first, then compute.
+          val itemRealtimeFeatureMap = getFeatureFromSet(Set(
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day", "share_pv_list_1day",
+            "share_uv_list_1day", "return_uv_list_1day", "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+          ), record).map(r => {
+            val m = new java.util.HashMap[String, Double]()
+            r._2.split(",").foreach(r => {
+              m.put(r.split(":")(0), r.split(":")(1).toDouble)
+            })
+            (r._1, m)
+          })
+          val javaMap = new HashMap[String, Map[String, java.lang.Double]]()
+          itemRealtimeFeatureMap.foreach { case (key, value) =>
+            val javaValue = new HashMap[String, java.lang.Double]()
+            value.foreach { case (innerKey, innerValue) =>
+              javaValue.put(innerKey, innerValue.asInstanceOf[java.lang.Double])
+            }
+            javaMap.put(key, javaValue)
+          }
+          val f6 = RankExtractorItemFeatureV2.getItemRealtimeTrend(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", ""))
+          val f7 = RankExtractorItemFeatureV2.getItemRealtimeCnt(javaMap,
+            new util.HashSet[String](util.Arrays.asList(
+              "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+              "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+              "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+              "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+              // ----------
+              "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+              "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+            )),
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val f8 = RankExtractorItemFeatureV2.getItemRealtimeRate(javaMap,
+            originFeatureMap.getOrElse("ctx_day", ""),
+            originFeatureMap.getOrElse("ctx_hour", "")
+          )
+          val result = new util.HashMap[String, String]()
+          result ++= originFeatureMap
+          result ++= f2
+          result ++= f4
+          result ++= f6
+          result ++= f7
+          result ++= f8
+          val names = Set(
+            "apptype", "mid", "uid", "videoid", "logtimestamp", "ctx_day", "ctx_week", "ctx_hour", "clientip", "ctx_region",
+            "ctx_city", "pagesource", "recommend_page_type", "pagesource_change", "abcode",
+            // ----------
+            "playtime", "is_play", "share_cnt_pv", "is_share", "share_ts_list", "return_cnt_pv", "return_cnt_uv", "return_mid_ts_list", "is_return",
+            // ----------
+
+            // ----------
+            "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion", "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "title", "distrubute_title", "gmt_create_video", "tags", "existence_days", "total_time", "play_count", "play_count_total", "video_recommend",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ---------- rate
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ---------- rate
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h",
+            // ---------- rate
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val resultNew = new JSONObject
+          names.foreach(r => {
+            if (result.containsKey(r)) {
+              resultNew.put(r, result.get(r))
+            }
+          })
+
+          //4 Assemble the label info.
+          val labels = Set(
+            "pagesource", "recommend_page_type", "pagesource_change",
+            "abcode",
+            "is_play", "playtime",
+            "is_share", "share_cnt_pv", "share_ts_list",
+            "is_return", "return_cnt_pv", "return_cnt_uv", "return_mid_ts_list"
+          )
+          val labelNew = new JSONObject
+          val labelMap = getFeatureFromSet(labels, record)
+          labels.foreach(r => {
+            if (labelMap.containsKey(r)) {
+              labelNew.put(r, labelMap(r))
+            }
+          })
+          //5 Build the log key header.
+          val mid = record.getString("mid")
+          val videoid = record.getString("videoid")
+          val logtimestamp = record.getString("logtimestamp")
+          val apptype = record.getString("apptype")
+          val pagesource_change = record.getString("pagesource_change")
+          val abcode = record.getString("abcode")
+          val video_recommend = if (!record.isNull("video_recommend")) record.getString("video_recommend") else "111"
+
+          val logKey = (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend).productIterator.mkString(":")
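+          // Colon-joined 7-field log key: mid:videoid:logtimestamp:apptype:pagesource_change:abcode:video_recommend.
+          // The makedata_11/12_* jobs below destructure this field via ParamUtils.parseLogKey, so the order must stay in sync.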
+          val labelKey = labelNew.toString()
+          val featureKey = resultNew.toString()
+          //6 Concatenate the parts and save.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
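+  // Reads the requested columns from the ODPS Record as strings, falling back to getBigint for non-string
+  // columns; columns that are NULL are omitted from the returned map.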
+  def getFeatureFromSet(set: Set[String], record: Record): mutable.HashMap[String, String] = {
+    val result = mutable.HashMap[String, String]()
+    set.foreach(r =>{
+      if (!record.isNull(r)){
+        try{
+          result.put(r, record.getString(r))
+        }catch {
+          case _: Exception => result.put(r, String.valueOf(record.getBigint(r)))
+        }
+      }
+    })
+    result
+  }
+}

+ 187 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_11_strData_v3.scala

@@ -0,0 +1,187 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV2
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+object makedata_11_strData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/10_sample_data_v3/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/11_str_data_v3/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+    // 3 Produce the data partition by partition
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      //1 Sample filtering (share samples, the 012345 pages, recommendable videos, different apptypes)
+      val data1 = sc.textFile(hdfsPath).map(r => {
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend) = ParamUtils.parseLogKey(logKeyStr)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val is_share = labelJson.getString("is_share")
+        (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
+      }).filter({
+        case (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val pages = Set("2")
+          val video_status = Set("-6")
+          val apps = Set("0", "4", "5", "21", "3", "6")
+          pages.contains(pagesource_change) && video_status.contains(video_recommend) && apps.contains(apptype)
+      })
+
+      //2 Sampling (the STR model does no downsampling; all exposure samples are kept)
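+      // The STR label comes straight from is_share: every exposure is kept and labelled 1 if it produced a share, else 0.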
+      val data2 = data1.map({
+        case (logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val feaJson = JSON.parseObject(feaStr)
+          val is_share = labelJson.getString("is_share")
+          if ("0".equals(is_share)){
+            ("0", feaJson)
+          }else{
+            ("1", feaJson)
+          }
+      })
+
+      //3 Print the label ratio of the raw samples as an intermediate check
+      println("Sample ratio")
+      data2.map(r=> (r._1, 1)).reduceByKey(_+_).map(r=> r._1 + "\t" + r._2).collect().foreach(println)
+
+      //4 Discretize absolute feature values, e.g. 0.456 becomes 19
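+      // ceilLogRate/bucketCnt (ExtractorUtils) map continuous rates and counts to integer bucket ids so they can be
+      // one-hot encoded in the libsvm step below; the exact bucketing scheme is defined in ExtractorUtils, and the
+      // 0.456 -> 19 above is only an illustrative example.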
+      val data3 = data2.map({
+        case (label, feaJson) =>
+          Set(
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ----------
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          ).foreach(key =>{
+            if (feaJson.containsKey(key)){
+              val value = ExtractorUtils.ceilLogRate(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          Set(
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "total_time", "play_count", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+
+          ).foreach(key => {
+            if (feaJson.containsKey(key)) {
+              val value = ExtractorUtils.bucketCnt(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          (label, feaJson)
+      })
+      //5 Convert to libsvm format
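+      // Output line format: "label \t id:1 \t id:1 ...", where each id is the identifier that the
+      // OfflineVlogShareLRFeatureExtractorV2 feature map assigns to a name/value pair; every present feature gets weight 1.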
+      val data4 = data3.map({
+        case (label, feaJson) =>
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+      })
+
+      // 7 Save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data4.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data4.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}

+ 215 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3.scala

@@ -0,0 +1,215 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.{OfflineVlogShareLRFeatureExtractorV1, OfflineVlogShareLRFeatureExtractorV2}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import examples.extractor.ExtractorUtils
+
+object makedata_12_rosData_v3 {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/10_sample_data_v3/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/12_ros_data_v3/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+    // 3 Produce the data partition by partition
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      //1 Sample filtering (share samples, the 012345 pages, recommendable videos, different apptypes)
+      val data1 = sc.textFile(hdfsPath).map(r => {
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend) = ParamUtils.parseLogKey(logKeyStr)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val is_share = labelJson.getString("is_share")
+        (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
+      }).filter({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val pages = Set("2")
+          val video_status = Set("-6")
+          val apps = Set("0", "4", "5", "21", "3", "6")
+          "1".equals(is_share) && pages.contains(pagesource_change) && video_status.contains(video_recommend) && apps.contains(apptype)
+      })
+
+      //2 Sampling (duplicate the sample once per qualifying return, equivalent to weighting by return volume)
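+      // Each distinct non-self return mid arriving within one hour of the exposure adds one positive copy of the sample,
+      // so a share with N qualifying returns is duplicated N times, which has the same effect as weighting it by N.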
+      val data2 = data1.flatMap({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val res = ArrayBuffer[(String, JSONObject)]()
+          val feaJson = JSON.parseObject(feaStr)
+          val is_return = labelJson.getString("is_return")
+          if ("0".equals(is_return)){
+            res.add(("0", feaJson))
+          }else{
+            val return_mid_ts_list = labelJson.getString("return_mid_ts_list").split(",").map(r => {
+              val midReturn = r.split(":")(0)
+              val ts = r.split(":")(1).toLong
+              (midReturn, ts)
+            }).filter(!_._1.equals(mid)).sortBy(_._2)
+            // Necessary filter: returns coming from the sharer's own mid are dropped.
+
+            if (return_mid_ts_list.nonEmpty){
+              var flag = true
+              val midSet = scala.collection.mutable.HashSet[String]()
+              for ((midReturn, tsReturn) <- return_mid_ts_list) {
+                if (!midSet.contains(midReturn)) {
+                  midSet.add(midReturn)
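+                  // Count the return only if it happened after the exposure and within 3600s
+                  // (timestamps are assumed to be in milliseconds, hence the /1000).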
+                  if ((tsReturn / 1000 - logtimestamp / 1000) <= 3600 && tsReturn - logtimestamp > 0) {
+                    res.add(("1", feaJson))
+                    flag = false
+                  }
+                }
+              }
+              if (flag) {
+                // If no positive sample was added above, add one negative sample: no return arrived within the hour.
+                res.add(("0", feaJson))
+              }
+            }else {
+              // If nothing is left after removing self-returns, this is a negative sample.
+              res.add(("0", feaJson))
+            }
+          }
+          res.iterator
+      })
+
+      //3 Print the label ratio of the raw samples as an intermediate check
+      println("Sample ratio")
+      data2.map(r=> (r._1, 1)).reduceByKey(_+_).map(r=> r._1 + "\t" + r._2).collect().foreach(println)
+
+      //4 Discretize absolute feature values, e.g. 0.456 becomes 19
+      val data3 = data2.map({
+        case (label, feaJson) =>
+          Set(
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ----------
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          ).foreach(key =>{
+            if (feaJson.containsKey(key)){
+              val value = ExtractorUtils.ceilLogRate(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          Set(
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "total_time", "play_count", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+
+          ).foreach(key => {
+            if (feaJson.containsKey(key)) {
+              val value = ExtractorUtils.bucketCnt(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          (label, feaJson)
+      })
+      //5 Convert to libsvm format
+      val data4 = data3.map({
+        case (label, feaJson) =>
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+      })
+
+      // 7 Save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data4.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data4.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}

+ 216 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_12_rosData_v3_noweight.scala

@@ -0,0 +1,216 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.dataloader.OfflineVlogShareLRFeatureExtractorV2
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+
+object makedata_12_rosData_v3_noweight {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/10_sample_data_v3/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/12_ros_data_v3_noweight/")
+    val featureVersion =  param.getOrElse("featureVersion", "v2")
+    val ifRepart = param.getOrElse("ifRepart", "10").toInt
+    val labelVersion = param.getOrElse("labelVersion", "v1")
+
+    // 3 Produce the data partition by partition
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val partition = partitionPrefix + date
+      println("开始执行partiton:" + partition)
+      var hdfsPath = readPath + "/" + partition
+
+      //1 Sample filtering (share samples, the 012345 pages, recommendable videos, different apptypes)
+      val data1 = sc.textFile(hdfsPath).map(r => {
+        val rList = r.split("\t")
+        val logKeyStr = rList(0)
+        val (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend) = ParamUtils.parseLogKey(logKeyStr)
+        val labelStr = rList(1)
+        val feaStr = rList(2)
+        val labelJson = JSON.parseObject(labelStr)
+        val is_share = labelJson.getString("is_share")
+        (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp.toLong)
+      }).filter({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val pages = Set("2")
+          val video_status = Set("-6")
+          val apps = Set("0", "4", "5", "21", "3", "6")
+          "1".equals(is_share) && pages.contains(pagesource_change) && video_status.contains(video_recommend) && apps.contains(apptype)
+      })
+
+      //2 Sampling (at most one positive sample per share, i.e. no return-count weighting)
+      val data2 = data1.flatMap({
+        case (mid, logKeyStr, labelJson, feaStr, is_share, pagesource_change, video_recommend, apptype, logtimestamp) =>
+          val res = ArrayBuffer[(String, JSONObject)]()
+          val feaJson = JSON.parseObject(feaStr)
+          val is_return = labelJson.getString("is_return")
+          if ("0".equals(is_return)) {
+            res.add(("0", feaJson))
+          } else {
+            val return_mid_ts_list = labelJson.getString("return_mid_ts_list").split(",").map(r => {
+              val midReturn = r.split(":")(0)
+              val ts = r.split(":")(1).toLong
+              (midReturn, ts)
+            }).filter(!_._1.equals(mid)).sortBy(_._2)
+            // Necessary filter: returns coming from the sharer's own mid are dropped.
+
+            if (return_mid_ts_list.nonEmpty) {
+              var flag = true
+              val midSet = scala.collection.mutable.HashSet[String]()
+              for ((midReturn, tsReturn) <- return_mid_ts_list) {
+                if (flag && !midSet.contains(midReturn)) {
+                  // The flag ensures at most one positive sample is added, i.e. no weighting.
+                  midSet.add(midReturn)
+                  if ((tsReturn / 1000 - logtimestamp / 1000) <= 3600 && tsReturn - logtimestamp > 0) {
+                    res.add(("1", feaJson))
+                    flag = false
+                  }
+                }
+              }
+              if (flag) {
+                // If no positive sample was added above, add one negative sample: no return arrived within the hour.
+                res.add(("0", feaJson))
+              }
+            } else {
+              // If nothing is left after removing self-returns, this is a negative sample.
+              res.add(("0", feaJson))
+            }
+          }
+          res.iterator
+      })
+
+      //3 Print the label ratio of the raw samples as an intermediate check
+      println("Sample ratio")
+      data2.map(r=> (r._1, 1)).reduceByKey(_+_).map(r=> r._1 + "\t" + r._2).collect().foreach(println)
+
+      //4 Discretize absolute feature values, e.g. 0.456 becomes 19
+      val data3 = data2.map({
+        case (label, feaJson) =>
+          Set(
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "u_3month_ctr", "u_3month_str", "u_3month_rov", "u_3month_ros",
+            // ----------
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+            "i_3month_ctr", "i_3month_str", "i_3month_rov", "i_3month_ros",
+            // ----------
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          ).foreach(key =>{
+            if (feaJson.containsKey(key)){
+              val value = ExtractorUtils.ceilLogRate(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          Set(
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
+            // ----------
+            "total_time", "play_count", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            // ----------
+            "share_uv_list_1day_6_avg", "share_uv_list_1day_6_var", "share_uv_list_1day_diff_6_avg", "share_uv_list_1day_diff_6_var",
+            "return_uv_list_1day_6_avg", "return_uv_list_1day_6_var", "return_uv_list_1day_diff_6_avg", "return_uv_list_1day_diff_6_var",
+            "share_uv_list_1h_6_avg", "share_uv_list_1h_6_var", "share_uv_list_1h_diff_6_avg", "share_uv_list_1h_diff_6_var",
+            "return_uv_list_1h_6_avg", "return_uv_list_1h_6_var", "return_uv_list_1h_diff_6_avg", "return_uv_list_1h_diff_6_var",
+            // ----------
+            "view_pv_list_1day", "view_uv_list_1day", "play_pv_list_1day", "play_uv_list_1day",
+            "share_pv_list_1day", "share_uv_list_1day", "return_uv_list_1day",
+            "p_view_uv_list_1day", "p_view_pv_list_1day", "p_return_uv_list_1day",
+            "share_uv_list_2day", "share_pv_list_2day", "share_uv_list_3day", "share_pv_list_3day",
+            // ----------
+            "view_uv_list_1h", "view_pv_list_1h", "play_uv_list_1h", "play_pv_list_1h",
+            "share_uv_list_1h", "share_pv_list_1h", "return_uv_list_1h", "p_return_uv_list_1h"
+
+          ).foreach(key => {
+            if (feaJson.containsKey(key)) {
+              val value = ExtractorUtils.bucketCnt(feaJson.getString(key).toDouble)
+              feaJson.put(key, value.toString)
+            }
+          })
+          (label, feaJson)
+      })
+      //5 Convert to libsvm format
+      val data4 = data3.map({
+        case (label, feaJson) =>
+          val feaSet = Set(
+            "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
+            "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_system",
+            "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+            "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+            "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+            "total_time", "play_count_total",
+            "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
+            "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
+            "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
+            "u_1day_ctr", "u_1day_str", "u_1day_rov", "u_1day_ros",
+            "u_3day_ctr", "u_3day_str", "u_3day_rov", "u_3day_ros",
+            "u_7day_ctr", "u_7day_str", "u_7day_rov", "u_7day_ros",
+            "i_1day_ctr", "i_1day_str", "i_1day_rov", "i_1day_ros",
+            "i_3day_ctr", "i_3day_str", "i_3day_rov", "i_3day_ros",
+            "i_7day_ctr", "i_7day_str", "i_7day_rov", "i_7day_ros",
+
+            "i_1day_ctr_rt", "i_1day_str_rt", "i_1day_ros_rt", "i_1day_rov_rt",
+            "i_1h_ctr_rt", "i_1h_str_rt", "i_1h_ros_rt", "i_1h_rov_rt"
+          )
+          val feaMap = new util.HashMap[String, String]()
+          feaSet.foreach(r => {
+            if (feaJson.containsKey(r)) {
+              feaMap.put(r, feaJson.getString(r))
+            }
+          })
+          val bytesFeatureExtractor = new OfflineVlogShareLRFeatureExtractorV2()
+          bytesFeatureExtractor.makeFeature4String(feaMap)
+          val featureMap = bytesFeatureExtractor.featureMap
+          label + "\t" + featureMap.entries().map(r => r.getValue.getIdentifier + ":1").mkString("\t")
+
+      })
+
+      // 7 Save the data to HDFS
+      hdfsPath = savePath + "/" + partition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        if (ifRepart == 0){
+          data4.saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }else{
+          data4.repartition(ifRepart).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+        }
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}

+ 278 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529.scala

@@ -0,0 +1,278 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import scala.collection.JavaConversions._
+import examples.extractor.RankExtractorFeature_20240530
+import org.xm.Similarity
+import scala.collection.mutable.ArrayBuffer
+/*
+   20240608 Feature extraction
+ */
+
+object makedata_13_originData_20240529 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2023010100")
+    val endStr = param.getOrElse("endStr", "2023010123")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/13_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "XXXX")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 Read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce the data partition by partition
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+          val featureMap = new JSONObject()
+
+          // a Video features
+          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b1_feature"))
+          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b2_feature"))
+          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b3_feature"))
+          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b6_feature"))
+          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b7_feature"))
+
+          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b8_feature"))
+          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b9_feature"))
+          val b10: JSONObject = if (record.isNull("b10_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b10_feature"))
+          val b11: JSONObject = if (record.isNull("b11_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b11_feature"))
+          val b12: JSONObject = if (record.isNull("b12_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b12_feature"))
+          val b13: JSONObject = if (record.isNull("b13_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b13_feature"))
+          val b17: JSONObject = if (record.isNull("b17_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b17_feature"))
+          val b18: JSONObject = if (record.isNull("b18_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b18_feature"))
+          val b19: JSONObject = if (record.isNull("b19_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b19_feature"))
+
+
+          val origin_data = List(
+            (b1, b2, b3, "b123"), (b1, b6, b7, "b167"),
+            (b8, b9, b10, "b8910"), (b11, b12, b13, "b111213"),
+            (b17, b18, b19, "b171819")
+          )
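+          // For every (exposure, share, return) source triple and time window the same five statistics are derived:
+          // STR = share/exp, log(share), ROV = return/exp, log(return) and ROV*log(return); calDiv/calLog are assumed
+          // to be the zero-safe division and log helpers in RankExtractorFeature_20240530.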
+          for ((b_1, b_2, b_3, prefix1) <- origin_data){
+            for (prefix2 <- List(
+              "1h", "2h", "3h", "4h", "12h", "1d", "3d", "7d"
+            )){
+              val exp = if (b_1.isEmpty) 0D else b_1.getIntValue("exp_pv_" + prefix2).toDouble
+              val share = if (b_2.isEmpty) 0D else b_2.getIntValue("share_pv_" + prefix2).toDouble
+              val returns = if (b_3.isEmpty) 0D else b_3.getIntValue("return_uv_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(share, exp)
+              val f2 = RankExtractorFeature_20240530.calLog(share)
+              val f3 = RankExtractorFeature_20240530.calDiv(returns, exp)
+              val f4 = RankExtractorFeature_20240530.calLog(returns)
+              val f5 = f3 * f4
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "STR", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(share)", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(return)", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV*log(return)", f5)
+            }
+          }
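+          // 5 scopes x 8 windows x 5 statistics = 200 b*-prefixed features, matching the design note further down.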
+
+          val video_info: JSONObject = if (record.isNull("t_v_info_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("t_v_info_feature"))
+          featureMap.put("total_time", if (video_info.containsKey("total_time")) video_info.getIntValue("total_time").toDouble else 0D)
+          featureMap.put("bit_rate", if (video_info.containsKey("bit_rate")) video_info.getIntValue("bit_rate").toDouble else 0D)
+
+          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c1_feature"))
+          if (c1.nonEmpty){
+            featureMap.put("playcnt_6h", if (c1.containsKey("playcnt_6h")) c1.getIntValue("playcnt_6h").toDouble else 0D)
+            featureMap.put("playcnt_1d", if (c1.containsKey("playcnt_1d")) c1.getIntValue("playcnt_1d").toDouble else 0D)
+            featureMap.put("playcnt_3d", if (c1.containsKey("playcnt_3d")) c1.getIntValue("playcnt_3d").toDouble else 0D)
+            featureMap.put("playcnt_7d", if (c1.containsKey("playcnt_7d")) c1.getIntValue("playcnt_7d").toDouble else 0D)
+          }
+          val c2: JSONObject = if (record.isNull("c2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c2_feature"))
+          if (c2.nonEmpty){
+            featureMap.put("share_pv_12h", if (c2.containsKey("share_pv_12h")) c2.getIntValue("share_pv_12h").toDouble else 0D)
+            featureMap.put("share_pv_1d", if (c2.containsKey("share_pv_1d")) c2.getIntValue("share_pv_1d").toDouble else 0D)
+            featureMap.put("share_pv_3d", if (c2.containsKey("share_pv_3d")) c2.getIntValue("share_pv_3d").toDouble else 0D)
+            featureMap.put("share_pv_7d", if (c2.containsKey("share_pv_7d")) c2.getIntValue("share_pv_7d").toDouble else 0D)
+            featureMap.put("return_uv_12h", if (c2.containsKey("return_uv_12h")) c2.getIntValue("return_uv_12h").toDouble else 0D)
+            featureMap.put("return_uv_1d", if (c2.containsKey("return_uv_1d")) c2.getIntValue("return_uv_1d").toDouble else 0D)
+            featureMap.put("return_uv_3d", if (c2.containsKey("return_uv_3d")) c2.getIntValue("return_uv_3d").toDouble else 0D)
+            featureMap.put("return_uv_7d", if (c2.containsKey("return_uv_7d")) c2.getIntValue("return_uv_7d").toDouble else 0D)
+          }
+
+          val title = if (video_info.containsKey("title")) video_info.getString("title") else ""
+          if (!title.equals("")){
+            for (key_feature <- List("c3_feature", "c4_feature", "c5_feature", "c6_feature", "c7_feature")){
+              val c34567: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_time <- List("tags_1d", "tags_3d", "tags_7d")) {
+                val tags = if (c34567.containsKey(key_time)) c34567.getString(key_time) else ""
+                if (!tags.equals("")){
+                  val (f1, f2, f3, f4) = funcC34567ForTags(tags, title)
+                  featureMap.put(key_feature + "_" + key_time + "_matchnum", f1)
+                  featureMap.put(key_feature + "_" + key_time + "_maxscore", f3)
+                  featureMap.put(key_feature + "_" + key_time + "_avgscore", f4)
+                }
+              }
+            }
+          }
+
+          val vid = if (record.isNull("vid")) "" else record.getString("vid")
+          if (!vid.equals("")){
+            for (key_feature <- List("c8_feature", "c9_feature")){
+              val c89: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_action <- List("share", "return")){
+                  val cfListStr = if (c89.containsKey(key_action)) c89.getString(key_action) else ""
+                  if (!cfListStr.equals("")){
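+                    // Each entry is expected to look like "vid:score:num:rank"; the rank is stored as its reciprocal below.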
+                    val cfMap = cfListStr.split(",").map(r =>{
+                      val rList = r.split(":")
+                      (rList(0), (rList(1), rList(2), rList(3)))
+                    }).toMap
+                    if (cfMap.contains(vid)){
+                      val (score, num, rank) = cfMap(vid)
+                      featureMap.put(key_feature + "_" + key_action + "_score", score.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_num", num.toDouble)
+                      featureMap.put(key_feature + "_" + key_action + "_rank", 1.0 / rank.toDouble)
+                    }
+                  }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d1_feature"))
+          if (d1.nonEmpty){
+            featureMap.put("d1_exp", if (d1.containsKey("exp")) d1.getString("exp").toDouble else 0D)
+            featureMap.put("d1_return_n", if (d1.containsKey("return_n")) d1.getString("return_n").toDouble else 0D)
+            featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
+          }
+
+
+          /*
+
+
+          Video:
+          exposure uses pv, share uses pv, return uses uv --> 1h 2h 3h 4h 12h 1d 3d 7d
+          STR log(share) ROV log(return) ROV*log(return)
+          40 feature combinations per scope
+          scopes: overall, overall exposure-matched, recommendation non-cold-start root, recommendation cold-start root, per-province root
+          200 feature values in total
+
+          Video:
+          duration, bit rate
+
+          User:
+          play count --> 6h 1d 3d 7d --> 4 features
+          share pv / return uv brought back --> 12h 1d 3d 7d --> 8 features
+          User + vid-title:
+          play / return / share / cumulative share / cumulative return tags --> 1d 3d 7d --> match count, max semantic similarity, avg semantic similarity --> 45 features
+          User + vid-cf:
+          based on share behaviour / return behaviour --> "share cf" + "return-click cf": similarity score, similar count, reciprocal of similarity rank --> 12 features
+
+          Head videos:
+          exposure, return, ROVn --> 3 features
+
+          Context:
+          hour, weekday, apptype, city, province, pagesource, device model
+           */
+
+
+
+          //4 Assemble the label info.
+          val labels = new JSONObject
+          for (labelKey <- List(
+            "is_play", "is_share", "is_return", "noself_is_return", "return_uv", "noself_return_uv", "total_return_uv",
+            "share_pv", "total_share_uv"
+          )){
+            if (!record.isNull(labelKey)){
+              labels.put(labelKey, record.getString(labelKey))
+            }
+          }
+          //5 Build the log key header.
+          val apptype = record.getString("apptype")
+          val pagesource = record.getString("pagesource")
+          val mid = record.getString("mid")
+          // vid was already extracted above
+          val ts = record.getString("ts")
+          val abcode = record.getString("abcode")
+          val level = if (record.isNull("level")) "0" else record.getString("level")
+          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Concatenate the parts and save.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+      // 4 Save the data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched terms, max semantic similarity score, average semantic similarity score
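+    // Note: d2 (the matched tag list) is returned but the caller above only consumes the count and the two similarity scores.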
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList){
+      if (title.contains(tag)){
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}

+ 256 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529_check.scala

@@ -0,0 +1,256 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   20240608 Feature extraction
+ */
+
+object makedata_13_originData_20240529_check {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2023010100")
+    val endStr = param.getOrElse("endStr", "2023010123")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/13_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "XXXX")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 Read the ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Produce the data partition by partition
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record_ => {
+
+
+          val record = if (record_.isNull("metafeaturemap")) new JSONObject() else
+            JSON.parseObject(record_.getString("metafeaturemap"))
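+          // This _check variant reads the same feature groups from a single pre-assembled "metafeaturemap" JSON column
+          // (keys like alg_vid_feature_all_exp) instead of separate columns, presumably to cross-check the two sources.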
+
+          val featureMap = new JSONObject()
+
+          // a Video features
+          val b1: JSONObject = if (!record.containsKey("alg_vid_feature_all_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_all_exp"))
+          val b2: JSONObject = if (!record.containsKey("alg_vid_feature_all_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_all_share"))
+          val b3: JSONObject = if (!record.containsKey("alg_vid_feature_all_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_all_return"))
+          val b6: JSONObject = if (!record.containsKey("alg_vid_feature_exp2share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_exp2share"))
+          val b7: JSONObject = if (!record.containsKey("alg_vid_feature_share2return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_share2return"))
+
+          val b8: JSONObject = if (!record.containsKey("alg_vid_feature_feed_noflow_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_noflow_exp"))
+          val b9: JSONObject = if (!record.containsKey("alg_vid_feature_feed_noflow_root_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_noflow_root_share"))
+          val b10: JSONObject = if (!record.containsKey("alg_vid_feature_feed_noflow_root_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_noflow_root_return"))
+          val b11: JSONObject = if (!record.containsKey("alg_vid_feature_feed_flow_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_flow_exp"))
+          val b12: JSONObject = if (!record.containsKey("alg_vid_feature_feed_flow_root_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_flow_root_share"))
+          val b13: JSONObject = if (!record.containsKey("alg_vid_feature_feed_flow_root_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_flow_root_return"))
+          val b17: JSONObject = if (!record.containsKey("alg_vid_feature_feed_province_exp")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_province_exp"))
+          val b18: JSONObject = if (!record.containsKey("alg_vid_feature_feed_province_root_share")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_province_root_share"))
+          val b19: JSONObject = if (!record.containsKey("alg_vid_feature_feed_province_root_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_feed_province_root_return"))
+
+
+          val origin_data = List(
+            (b1, b2, b3, "b123"), (b1, b6, b7, "b167"),
+            (b8, b9, b10, "b8910"), (b11, b12, b13, "b111213"),
+            (b17, b18, b19, "b171819")
+          )
+          for ((b_1, b_2, b_3, prefix1) <- origin_data) {
+            for (prefix2 <- List(
+              "1h", "2h", "3h", "4h", "12h", "1d", "3d", "7d"
+            )) {
+              val exp = if (b_1.isEmpty) 0D else b_1.getIntValue("exp_pv_" + prefix2).toDouble
+              val share = if (b_2.isEmpty) 0D else b_2.getIntValue("share_pv_" + prefix2).toDouble
+              val returns = if (b_3.isEmpty) 0D else b_3.getIntValue("return_uv_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(share, exp)
+              val f2 = RankExtractorFeature_20240530.calLog(share)
+              val f3 = RankExtractorFeature_20240530.calDiv(returns, exp)
+              val f4 = RankExtractorFeature_20240530.calLog(returns)
+              val f5 = f3 * f4
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "STR", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(share)", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(return)", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV*log(return)", f5)
+            }
+          }
+
+          val video_info: JSONObject = if (!record.containsKey("alg_vid_feature_basic_info")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_vid_feature_basic_info"))
+          featureMap.put("total_time", if (video_info.containsKey("total_time")) video_info.getIntValue("total_time").toDouble else 0D)
+          featureMap.put("bit_rate", if (video_info.containsKey("bit_rate")) video_info.getIntValue("bit_rate").toDouble else 0D)
+
+          val c1: JSONObject = if (!record.containsKey("alg_mid_feature_play")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_mid_feature_play"))
+          if (c1.nonEmpty) {
+            featureMap.put("playcnt_6h", if (c1.containsKey("playcnt_6h")) c1.getIntValue("playcnt_6h").toDouble else 0D)
+            featureMap.put("playcnt_1d", if (c1.containsKey("playcnt_1d")) c1.getIntValue("playcnt_1d").toDouble else 0D)
+            featureMap.put("playcnt_3d", if (c1.containsKey("playcnt_3d")) c1.getIntValue("playcnt_3d").toDouble else 0D)
+            featureMap.put("playcnt_7d", if (c1.containsKey("playcnt_7d")) c1.getIntValue("playcnt_7d").toDouble else 0D)
+          }
+          val c2: JSONObject = if (!record.containsKey("alg_mid_feature_share_and_return")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_mid_feature_share_and_return"))
+          if (c2.nonEmpty) {
+            featureMap.put("share_pv_12h", if (c2.containsKey("share_pv_12h")) c2.getIntValue("share_pv_12h").toDouble else 0D)
+            featureMap.put("share_pv_1d", if (c2.containsKey("share_pv_1d")) c2.getIntValue("share_pv_1d").toDouble else 0D)
+            featureMap.put("share_pv_3d", if (c2.containsKey("share_pv_3d")) c2.getIntValue("share_pv_3d").toDouble else 0D)
+            featureMap.put("share_pv_7d", if (c2.containsKey("share_pv_7d")) c2.getIntValue("share_pv_7d").toDouble else 0D)
+            featureMap.put("return_uv_12h", if (c2.containsKey("return_uv_12h")) c2.getIntValue("return_uv_12h").toDouble else 0D)
+            featureMap.put("return_uv_1d", if (c2.containsKey("return_uv_1d")) c2.getIntValue("return_uv_1d").toDouble else 0D)
+            featureMap.put("return_uv_3d", if (c2.containsKey("return_uv_3d")) c2.getIntValue("return_uv_3d").toDouble else 0D)
+            featureMap.put("return_uv_7d", if (c2.containsKey("return_uv_7d")) c2.getIntValue("return_uv_7d").toDouble else 0D)
+          }
+
+          val title = if (video_info.containsKey("title")) video_info.getString("title") else ""
+          if (!title.equals("")) {
+            for (key_feature <- List(("c3_feature", "alg_mid_feature_play_tags"),
+              ("c4_feature", "alg_mid_feature_play_tags"),
+              ("c5_feature", "alg_mid_feature_play_tags"),
+              ("c6_feature", "alg_mid_feature_play_tags"),
+              ("c7_feature", "alg_mid_feature_play_tags"))) {
+              val c34567: JSONObject = if (!record.containsKey(key_feature._2)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature._2))
+              for (key_time <- List("tags_1d", "tags_3d", "tags_7d")) {
+                val tags = if (c34567.containsKey(key_time)) c34567.getString(key_time) else ""
+                if (!tags.equals("")) {
+                  val (f1, f2, f3, f4) = funcC34567ForTags(tags, title)
+                  featureMap.put(key_feature._1 + "_" + key_time + "_matchnum", f1)
+                  featureMap.put(key_feature._1 + "_" + key_time + "_maxscore", f3)
+                  featureMap.put(key_feature._1 + "_" + key_time + "_avgscore", f4)
+                }
+              }
+            }
+          }
+
+          val vid = if (record_.isNull("vid")) "" else record_.getString("vid")
+          if (!vid.equals("")) {
+            for (key_feature <- List(("c8_feature", "alg_mid_feature_sharecf"), ("c9_feature", "alg_mid_feature_returncf"))) {
+              val c89: JSONObject = if (!record.containsKey(key_feature._2)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature._2))
+              for (key_action <- List("share", "return")) {
+                val cfListStr = if (c89.containsKey(key_action)) c89.getString(key_action) else ""
+                if (!cfListStr.equals("")) {
+                  val cfMap = cfListStr.split(",").map(r => {
+                    val rList = r.split(":")
+                    (rList(0), (rList(1), rList(2), rList(3)))
+                  }).toMap
+                  if (cfMap.contains(vid)) {
+                    val (score, num, rank) = cfMap(vid)
+                    featureMap.put(key_feature._1 + "_" + key_action + "_score", score.toDouble)
+                    featureMap.put(key_feature._1 + "_" + key_action + "_num", num.toDouble)
+                    featureMap.put(key_feature._1 + "_" + key_action + "_rank", 1.0 / rank.toDouble)
+                  }
+                }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (!record.containsKey("alg_recsys_feature_cf_i2i_new")) new JSONObject() else
+            JSON.parseObject(record.getString("alg_recsys_feature_cf_i2i_new"))
+          if (d1.nonEmpty) {
+            featureMap.put("d1_exp", if (d1.containsKey("exp")) d1.getString("exp").toDouble else 0D)
+            featureMap.put("d1_return_n", if (d1.containsKey("return_n")) d1.getString("return_n").toDouble else 0D)
+            featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
+          }
+
+
+          //4 Build the label info.
+          val labels = new JSONObject
+          for (labelKey <- List(
+            "is_play", "is_share", "is_return", "noself_is_return", "return_uv", "noself_return_uv", "total_return_uv",
+            "share_pv", "total_share_uv"
+          )){
+            if (!record_.isNull(labelKey)){
+              labels.put(labelKey, record_.getString(labelKey))
+            }
+          }
+          //5 Build the log-key header.
+          val apptype = record_.getString("apptype")
+          val pagesource = record_.getString("pagesource")
+          val mid = record_.getString("mid")
+          // vid was already extracted above
+          val ts = record_.getString("ts")
+          val abcode = record_.getString("abcode")
+          val level = if (record_.isNull("level")) "0" else record_.getString("level")
+          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Concatenate the three parts and emit one line.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+      // 4 Save the data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, mean semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList){
+      if (title.contains(tag)){
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
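For reference, a minimal self-contained sketch of the ratio features built in the window loop above. calDiv and calLog live in RankExtractorFeature_20240530, which is not part of this diff, so the two local helpers below are assumed stand-ins (zero-safe division and log(x + 1)):

    // Sketch only: localCalDiv / localCalLog are assumed stand-ins for
    // RankExtractorFeature_20240530.calDiv / calLog, which are not shown in this diff.
    object RatioFeatureSketch {
      def localCalDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b
      def localCalLog(a: Double): Double = math.log(a + 1.0)

      def main(args: Array[String]): Unit = {
        val featureMap = scala.collection.mutable.Map[String, Double]()
        val (exp, share, returns) = (1000.0, 40.0, 12.0) // hypothetical counts for one window
        val prefix = "b123_1h"
        featureMap.put(prefix + "_STR", localCalDiv(share, exp))          // share-through rate
        featureMap.put(prefix + "_log(share)", localCalLog(share))
        featureMap.put(prefix + "_ROV", localCalDiv(returns, exp))        // return-over-view
        featureMap.put(prefix + "_log(return)", localCalLog(returns))
        featureMap.put(prefix + "_ROV*log(return)", localCalDiv(returns, exp) * localCalLog(returns))
        featureMap.foreach(println)
      }
    }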

+ 92 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_14_valueData_20240608.scala

@@ -0,0 +1,92 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_14: flatten each sample's feature JSON into a dense, comma-separated value vector ordered by 20240608_feature_name.txt.
+ */
+
+object makedata_14_valueData_20240608 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_bc = sc.broadcast(contentList)
+
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/13_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/14_feature_data/")
+    val repartition = param.getOrElse("repartition", "200").toInt
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val data = sc.textFile(readPath + "/" + date + "*")
+      val data1 = data.map(r => {
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val featureKey = rList(2)
+        (logKey, labelKey, featureKey)
+      }).filter(r =>
+        r._1.split(",")(6).equals("0")
+      ).mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_bc.value
+        row.foreach {
+          case (logKey, labelKey, featureKey) =>
+            val featureJson = JSON.parseObject(featureKey)
+
+            val featureValues = contentList.map(key => {
+              if (featureJson.containsKey(key)) {
+                featureJson.getDouble(key)
+              } else {
+                0.0
+              }
+            })
+            result.add(logKey + "\t" + labelKey + "\t" + featureValues.mkString(","))
+        }
+        result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data1.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+  }
+}
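A minimal sketch of what this job does per row: look each name from 20240608_feature_name.txt up in the row's feature JSON and emit a fixed-order dense vector, defaulting missing features to 0.0 (a plain Scala Map stands in for fastjson here):

    // Sketch: fixed-order dense vector from a sparse feature map.
    object DenseVectorSketch {
      def toDense(featureNames: List[String], features: Map[String, Double]): String =
        featureNames.map(name => features.getOrElse(name, 0.0)).mkString(",")

      def main(args: Array[String]): Unit = {
        val names = List("b123_1h_STR", "total_time", "playcnt_1d") // hypothetical slice of the feature-name file
        val row = Map("total_time" -> 58.0, "playcnt_1d" -> 3.0)
        println(toDense(names, row)) // 0.0,58.0,3.0
      }
    }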

+ 92 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_15_bucket_20240608.scala

@@ -0,0 +1,92 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_15: compute quantile-style bucket boundaries for each feature in 20240608_feature_name.txt from a sample of the dense value vectors.
+ */
+
+object makedata_15_bucket_20240608 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/20240607")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/15_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240607_200")
+    val sampleRate = param.getOrElse("sampleRate", "0.1").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "200").toInt
+
+    val data = sc.textFile(readPath)
+    val data1 = data.map(r => {
+      val rList = r.split("\t")
+      val doubles = rList(2).split(",").map(_.toDouble)
+      doubles
+    }).sample(false, sampleRate ).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    for (i <- contentList.indices){
+      println("特征:" + contentList(i))
+      val data2 = data1.map(r => r(i)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // make sure every bucket gets at least one element
+      val buffers = new ArrayBuffer[Double]()
+
+      var lastBucketValue = data2(0) // previous bucket boundary
+      for (j <- 0 until len by oneBucketNum) {
+        val d = data2(j)
+        if (j > 0 && d != lastBucketValue) {
+          // keep the current value as a boundary only if it differs from the previous one
+          buffers += d
+        }
+        lastBucketValue = d // update the previous boundary
+      }
+
+      // the last bucket should end at the final element of the array
+      if (!buffers.contains(data2.last)) {
+        buffers += data2.last
+      }
+      result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save the data to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + hdfsPath)
+    }
+  }
+}
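The boundary selection above can be read in isolation: sort the non-zero sampled values, walk them with a fixed stride, and keep each distinct value as a bucket edge, always ending at the maximum. A self-contained sketch without the Spark/HDFS plumbing:

    // Sketch of the quantile-style bucket boundaries computed above.
    object BucketBoundarySketch {
      def boundaries(sortedValues: Array[Double], bucketNum: Int): Array[Double] = {
        val len = sortedValues.length
        val stride = (len - 1) / (bucketNum - 1) + 1 // at least one element per bucket
        val buffer = scala.collection.mutable.ArrayBuffer[Double]()
        var last = sortedValues(0)
        for (j <- 0 until len by stride) {
          val d = sortedValues(j)
          if (j > 0 && d != last) buffer += d
          last = d
        }
        if (!buffer.contains(sortedValues.last)) buffer += sortedValues.last
        buffer.toArray
      }

      def main(args: Array[String]): Unit = {
        val values = (1 to 100).map(_.toDouble).toArray    // already sorted, non-zero
        println(boundaries(values, 10).mkString(","))      // roughly every 12th value plus the maximum
      }
    }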

+ 127 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609.scala

@@ -0,0 +1,127 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import com.alibaba.fastjson.JSON
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_16: map each dense feature vector to normalized bucket values using 20240609_bucket_274.txt and emit "label \t name:value ..." training rows.
+ */
+
+object makedata_16_bucketData_20240609 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_br = sc.broadcast(contentList)
+
+    val resourceUrlBucket = loader.getResource("20240609_bucket_274.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240606")
+    val endStr = param.getOrElse("endStr", "20240607")
+    val repartition = param.getOrElse("repartition", "200").toInt
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + date).map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val features = rList(2).split(",").map(_.toDouble)
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            Set("0", "4", "5", "21", "3", "6").contains(apptype) && pagesource.endsWith("recommend")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_br.value
+        val bucketsMap = bucketsMap_br.value
+        row.foreach{
+          case (label, features) =>
+            val featuresBucket = contentList.indices.map(i =>{
+              val featureName = contentList(i)
+              val score = features(i)
+              if (score > 1E-8){
+                val (bucketNum, buckets) = bucketsMap(featureName)
+                val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                featureName + ":" + scoreNew.toString
+              }else{
+                ""
+              }
+            }).filter(_.nonEmpty)
+            result.add(label + "\t" + featuresBucket.mkString("\t"))
+        }
+        result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
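A sketch of the per-feature bucketisation above. ExtractorUtils.findInsertPosition is not shown in this diff; findPos below assumes it behaves like a binary-search insert position over the sorted boundaries, which may differ from the real implementation in tie handling:

    // Sketch of the score -> normalized bucket mapping above; findPos is an assumed
    // stand-in for ExtractorUtils.findInsertPosition.
    object BucketizeSketch {
      def findPos(buckets: Array[Double], score: Double): Int = {
        var lo = 0
        var hi = buckets.length
        while (lo < hi) {
          val mid = (lo + hi) / 2
          if (buckets(mid) < score) lo = mid + 1 else hi = mid
        }
        lo // number of boundaries strictly below score
      }

      def main(args: Array[String]): Unit = {
        val buckets = Array(0.1, 0.3, 0.7)   // hypothetical boundaries for one feature
        val bucketNum = 4.0
        val score = 0.5
        val scoreNew = 1.0 / bucketNum * (findPos(buckets, score) + 1.0)
        println(f"featureX:$scoreNew%.2f")   // featureX:0.75
      }
    }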

+ 132 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609_check.scala

@@ -0,0 +1,132 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_16_check: same bucketing as makedata_16, with configurable apptype/abcode/level filters for spot checks.
+ */
+
+object makedata_16_bucketData_20240609_check {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240608_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+    val contentList_br = sc.broadcast(contentList)
+
+    val resourceUrlBucket = loader.getResource("20240609_bucket_274.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240606")
+    val endStr = param.getOrElse("endStr", "20240607")
+    val repartition = param.getOrElse("repartition", "200").toInt
+    val APPSETS = param.getOrElse("APPSETS", "3").split(",").filter(_.nonEmpty).toSet
+    val ABSETS = param.getOrElse("ABSETS", "ab0,ab1,ab2,ab3").split(",").filter(_.startsWith("ab")).toSet
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + date).map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val features = rList(2).split(",").map(_.toDouble)
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            val abcode = logKeyList(5)
+            val level = logKeyList(6)
+            APPSETS.contains(apptype) && pagesource.endsWith("recommend") &&
+              ABSETS.contains(abcode) && level.equals("0")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_br.value
+        val bucketsMap = bucketsMap_br.value
+        row.foreach{
+          case (label, features) =>
+            val featuresBucket = contentList.indices.map(i =>{
+              val featureName = contentList(i)
+              val score = features(i)
+              if (score > 1E-8){
+                val (bucketNum, buckets) = bucketsMap(featureName)
+                val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                featureName + ":" + scoreNew.toString
+              }else{
+                ""
+              }
+            }).filter(_.nonEmpty)
+            result.add(label + "\t" + featuresBucket.mkString("\t"))
+        }
+        result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
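The _check variant differs from makedata_16 mainly in its logKey filter; a minimal sketch of that filter over the comma-separated logKey (apptype, pagesource, mid, vid, ts, abcode, level):

    // Sketch of the logKey filter used above.
    object LogKeyFilterSketch {
      def keep(logKey: String, appSet: Set[String], abSet: Set[String]): Boolean = {
        val parts = logKey.split(",")
        val (apptype, pagesource, abcode, level) = (parts(0), parts(1), parts(5), parts(6))
        appSet.contains(apptype) && pagesource.endsWith("recommend") &&
          abSet.contains(abcode) && level.equals("0")
      }

      def main(args: Array[String]): Unit = {
        val logKey = "3,vlog-recommend,mid123,vid456,1718600000,ab1,0" // hypothetical row
        println(keep(logKey, Set("3"), Set("ab0", "ab1")))             // true
      }
    }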

File diff suppressed because it is too large
+ 300 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_17_bucketDataPrint_20240617.scala


+ 43 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_18_mergehour2day_20240617.scala

@@ -0,0 +1,43 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   makedata_18: merge one day's hourly print-data outputs into a single daily path (20240617).
+ */
+
+object makedata_18_mergehour2day_20240617 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/16_train_data_print_online/20240615*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/16_train_data_print_online_merge/20240615/")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    val data = sc.textFile(readPath)
+
+    // 4 Save the data to HDFS
+    val hdfsPath = savePath
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+      println("删除路径并开始数据写入:" + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    }else{
+      println("路径不合法,无法写入:" + hdfsPath)
+    }
+  }
+}

+ 388 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_31_originData_20240620.scala

@@ -0,0 +1,388 @@
+package com.aliyun.odps.spark.examples.makedata_ad
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   makedata_ad_31: extract ad sample features from alg_recsys_ad_sample_all (20240620).
+ */
+
+object makedata_ad_31_originData_20240620 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2024062008")
+    val endStr = param.getOrElse("endStr", "2024062023")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/31_ad_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "alg_recsys_ad_sample_all")
+    val repartition = param.getOrElse("repartition", "100").toInt
+
+    // 2 读取odps+表信息
+    val odpsOps = env.getODPS(sc)
+
+    // 3 循环执行数据生产
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("开始执行partiton:" + partition)
+      val odpsData = odpsOps.readTable(project = project,
+        table = table,
+        partition = partition,
+        transfer = func,
+        numPartition = tablePart)
+        .map(record => {
+
+
+          val ts = record.getString("ts").toInt
+          val cid = record.getString("cid")
+
+
+          val featureMap = new JSONObject()
+
+          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b1_feature"))
+          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b2_feature"))
+          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b3_feature"))
+          val b4: JSONObject = if (record.isNull("b4_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b4_feature"))
+          val b5: JSONObject = if (record.isNull("b5_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b5_feature"))
+          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b6_feature"))
+          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b7_feature"))
+          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b8_feature"))
+          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b9_feature"))
+
+
+          featureMap.put("cid_" + cid, 1.0)
+          if (b1.containsKey("adid") && b1.getString("adid").nonEmpty) {
+            featureMap.put("adid_" + b1.getString("adid"), 1.0)
+          }
+          if (b1.containsKey("adverid") && b1.getString("adverid").nonEmpty) {
+            featureMap.put("adverid_" + b1.getString("adverid"), 1.0)
+          }
+          if (b1.containsKey("targeting_conversion") && b1.getString("targeting_conversion").nonEmpty) {
+            featureMap.put("targeting_conversion_" + b1.getString("targeting_conversion"), 1.0)
+          }
+
+
+          if (b1.containsKey("cpa")) {
+            featureMap.put("cpa", b1.getString("cpa").toDouble)
+          }
+
+          for ((bn, prefix1) <- List(
+            (b2, "b2"), (b3, "b3"),(b4, "b4"),(b5, "b5"),(b8, "b8")
+          )){
+            for (prefix2 <- List(
+              "3h", "6h", "12h", "1d", "3d", "7d"
+            )){
+              val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+              val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+              val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+              val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+              val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+              val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+              val f4 = conver
+              val f5 = RankExtractorFeature_20240530.calDiv(income*1000, view)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+            }
+          }
+
+          for ((bn, prefix1) <- List(
+            (b6, "b6"), (b7, "b7")
+          )) {
+            for (prefix2 <- List(
+              "7d", "14d"
+            )) {
+              val view = if (bn.isEmpty) 0D else bn.getIntValue("ad_view_" + prefix2).toDouble
+              val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
+              val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
+              val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+              val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+              val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+              val f4 = conver
+              val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
+            }
+          }
+
+          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c1_feature"))
+
+          val midActionList = if (c1.containsKey("action") && c1.getString("action").nonEmpty){
+            c1.getString("action").split(",").map(r=>{
+              val rList = r.split(":")
+              (rList(0), (rList(1).toInt, rList(2).toInt, rList(3).toInt, rList(4).toInt, rList(5)))
+            }).sortBy(-_._2._1).toList
+          }else {
+            new ArrayBuffer[(String, (Int, Int, Int, Int, String))]().toList
+          }
+          // user-level (u) features
+          val viewAll = midActionList.size.toDouble
+          val clickAll = midActionList.map(_._2._2).sum.toDouble
+          val converAll = midActionList.map(_._2._3).sum.toDouble
+          val incomeAll = midActionList.map(_._2._4).sum.toDouble
+          featureMap.put("viewAll", viewAll)
+          featureMap.put("clickAll", clickAll)
+          featureMap.put("converAll", converAll)
+          featureMap.put("incomeAll", incomeAll)
+          featureMap.put("ctr_all", RankExtractorFeature_20240530.calDiv(clickAll, viewAll))
+          featureMap.put("ctcvr_all", RankExtractorFeature_20240530.calDiv(converAll, viewAll))
+          featureMap.put("cvr_all", RankExtractorFeature_20240530.calDiv(clickAll, converAll))
+          featureMap.put("ecpm_all", RankExtractorFeature_20240530.calDiv(incomeAll * 1000, viewAll))
+
+          // user-to-cid (ui) features
+          val midTimeDiff = scala.collection.mutable.Map[String, Double]()
+          midActionList.foreach{
+            case (cid, (ts_history, click, conver, income, title)) =>
+              if (!midTimeDiff.contains("timediff_view_" + cid)){
+                midTimeDiff.put("timediff_view_" + cid, 1.0 / ((ts - ts_history).toDouble/3600.0/24.0))
+              }
+              if (!midTimeDiff.contains("timediff_click_" + cid) && click > 0) {
+                midTimeDiff.put("timediff_click_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+              }
+              if (!midTimeDiff.contains("timediff_conver_" + cid) && conver > 0) {
+                midTimeDiff.put("timediff_conver_" + cid, 1.0 / ((ts - ts_history).toDouble / 3600.0 / 24.0))
+              }
+          }
+
+          val midActionStatic = scala.collection.mutable.Map[String, Double]()
+          midActionList.foreach {
+            case (cid, (ts_history, click, conver, income, title)) =>
+              midActionStatic.put("actionstatic_view_" + cid, 1.0 + midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+              midActionStatic.put("actionstatic_click_" + cid, click + midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+              midActionStatic.put("actionstatic_conver_" + cid, conver + midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+              midActionStatic.put("actionstatic_income_" + cid, income + midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+          }
+
+          if (midTimeDiff.contains("timediff_view_" + cid)){
+            featureMap.put("timediff_view", midTimeDiff.getOrDefault("timediff_view_" + cid, 0.0))
+          }
+          if (midTimeDiff.contains("timediff_click_" + cid)) {
+            featureMap.put("timediff_click", midTimeDiff.getOrDefault("timediff_click_" + cid, 0.0))
+          }
+          if (midTimeDiff.contains("timediff_conver_" + cid)) {
+            featureMap.put("timediff_conver", midTimeDiff.getOrDefault("timediff_conver_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_view_" + cid)) {
+            featureMap.put("actionstatic_view", midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_click_" + cid)) {
+            featureMap.put("actionstatic_click", midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_conver_" + cid)) {
+            featureMap.put("actionstatic_conver", midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_income_" + cid)) {
+            featureMap.put("actionstatic_income", midActionStatic.getOrDefault("actionstatic_income_" + cid, 0.0))
+          }
+          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+            featureMap.put("actionstatic_ctr", RankExtractorFeature_20240530.calDiv(
+              midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+              midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+            ))
+          }
+          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("timediff_conver_" + cid)) {
+            featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
+              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0),
+              midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
+            ))
+          }
+          if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
+            featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
+              midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
+              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0)
+            ))
+          }
+
+          val e1: JSONObject = if (record.isNull("e1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("e1_feature"))
+          val e2: JSONObject = if (record.isNull("e2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("e2_feature"))
+          val title = b1.getOrDefault("cidtitle", "").toString
+          if (title.nonEmpty){
+            for ((en, prefix1) <- List((e1, "e1"), (e2, "e2"))){
+              for (prefix2 <- List("tags_3d", "tags_7d", "tags_14d")){
+                if (en.nonEmpty && en.containsKey(prefix2) && en.getString(prefix2).nonEmpty) {
+                  val (f1, f2, f3, f4) = funcC34567ForTags(en.getString(prefix2), title)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_matchnum", f1)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_maxscore", f3)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_avgscore", f4)
+
+                }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d1_feature"))
+          val d2: JSONObject = if (record.isNull("d2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d2_feature"))
+
+          if (d1.nonEmpty){
+            for (prefix <- List("3h", "6h", "12h", "1d", "3d", "7d")) {
+              val view = if (!d1.containsKey("ad_view_" + prefix)) 0D else d1.getIntValue("ad_view_" + prefix).toDouble
+              val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
+              val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
+              val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(click, view)
+              val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
+              val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+              val f4 = conver
+              val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "conver", f4)
+              featureMap.put("d1_feature" + "_" + prefix + "_" + "ecpm", f5)
+            }
+          }
+
+          val vidRankMaps = scala.collection.mutable.Map[String, scala.collection.immutable.Map[String, Double]]()
+          if (d2.nonEmpty){
+            d2.foreach(r => {
+              val key = r._1
+              val value = d2.getString(key).split(",").map(r=> {
+                val rList = r.split(":")
+                (rList(0), rList(2).toDouble)
+              }).toMap
+              vidRankMaps.put(key, value)
+            })
+          }
+          for (prefix1 <- List("ctr", "ctcvr", "ecpm")) {
+            for (prefix2 <- List("1d", "3d", "7d", "14d")) {
+              if (vidRankMaps.contains(prefix1 + "_" + prefix2)){
+                val rank = vidRankMaps(prefix1 + "_" + prefix2).getOrDefault(cid, 0.0)
+                if (rank >= 1.0){
+                  featureMap.put("vid_rank_" + prefix1 + "_" + prefix2, 1.0 / rank)
+                }
+              }
+            }
+          }
+
+
+          /*
+          Ad
+            sparse: cid adid adverid targeting_conversion
+
+            cpa --> 1 feature
+            per adverid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr conver ecpm --> 30 features
+            per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            region // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            app // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            phone brand // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            os: no data
+            week // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+            hour // per cid, 7d 14d x ctr ctcvr cvr ecpm conver --> 10 features
+
+          User
+            user's historical clicked/converted title tags; 3d 7d 14d; matched against the cid title; count / max score / mean score --> 18 features
+            user's 14d views / clicks / conversions / income; ctr cvr ctcvr ecpm --> 8 features
+
+            user-to-cid (ui) features --> 10 features
+              1 / time since the user last viewed this cid
+              1 / time since the user last clicked this cid
+              1 / time since the user last converted on this cid
+              how many times the user has viewed this cid
+              how many times the user has clicked this cid
+              how many times the user has converted on this cid
+              how much the user has spent on this cid
+              the user's ctr ctcvr cvr on this cid
+
+          Video
+            sim-score-1/-2 between the title and the cid: no data
+            vid // per cid, 3h 6h 12h 1d 3d 7d x ctr ctcvr cvr ecpm conver --> 30 features
+            vid // per cid, 1d 3d 7d 14d x reciprocal rank of ctr ctcvr ecpm --> 12 features
+
+           */
+
+
+
+          //4 Build the label info.
+          val labels = new JSONObject
+          for (labelKey <- List("ad_is_click", "ad_is_conversion")){
+            if (!record.isNull(labelKey)){
+              labels.put(labelKey, record.getString(labelKey))
+            }
+          }
+          //5 Build the log-key header.
+          val apptype = record.getString("apptype")
+          val mid = record.getString("mid")
+          val headvideoid = record.getString("headvideoid")
+          val logKey = (apptype, mid, cid, ts, headvideoid).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Concatenate the three parts and emit one line.
+          logKey + "\t" + labelKey + "\t" + featureKey
+        })
+
+      // 4 Save the data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      }else{
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, mean semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList) {
+      if (title.contains(tag)) {
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
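A condensed sketch of the user-to-cid (ui) features derived above from the c1 action history: the reciprocal of the days since the last view of this cid, plus per-cid counts and a ctr. calDiv is again an assumed zero-safe stand-in for RankExtractorFeature_20240530.calDiv, and the history values are made up:

    // Sketch of the ui features; history entries are (cid, (ts, click, conver, income, title)).
    object AdUiFeatureSketch {
      def calDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b // assumed stand-in

      def main(args: Array[String]): Unit = {
        val ts = 1718900000
        val cid = "c42"
        val history = List(
          ("c42", (1718700000, 1, 0, 12, "title-a")), // viewed and clicked about 2.3 days ago
          ("c42", (1718500000, 0, 1, 30, "title-b")), // converted earlier
          ("c99", (1718800000, 0, 0, 0, "title-c"))
        )
        val mine = history.filter(_._1 == cid)
        val lastView = mine.map(_._2._1).max
        val timediffView = 1.0 / ((ts - lastView).toDouble / 3600.0 / 24.0) // reciprocal of days since last view
        val views = mine.size.toDouble
        val clicks = mine.map(_._2._2).sum.toDouble
        val convers = mine.map(_._2._3).sum.toDouble
        println(f"timediff_view=$timediffView%.3f views=$views clicks=$clicks convers=$convers ctr=${calDiv(clicks, views)}")
      }
    }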

+ 103 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_32_bucket_20240622.scala

@@ -0,0 +1,103 @@
+package com.aliyun.odps.spark.examples.makedata_ad
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_ad_32: compute quantile-style bucket boundaries for the ad features listed in 20240622_ad_feature_name.txt.
+ */
+
+object makedata_ad_32_bucket_20240622 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240622_ad_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/20240620*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/32_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240620_100")
+    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "100").toInt
+
+    val data = sc.textFile(readPath)
+    println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
+    val data1 = data.map(r => {
+      val rList = r.split("\t")
+      val jsons = JSON.parseObject(rList(2))
+      val doubles = scala.collection.mutable.Map[String, Double]()
+      jsons.foreach(r =>{
+        doubles.put(r._1, jsons.getDoubleValue(r._1))
+      })
+      doubles
+    }).sample(false, sampleRate ).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    for (i <- contentList.indices){
+      println("特征:" + contentList(i))
+      val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      if (len == 0){
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
+      }else{
+        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // make sure every bucket gets at least one element
+        val buffers = new ArrayBuffer[Double]()
+
+        var lastBucketValue = data2(0) // previous bucket boundary
+        for (j <- 0 until len by oneBucketNum) {
+          val d = data2(j)
+          if (j > 0 && d != lastBucketValue) {
+            // keep the current value as a boundary only if it differs from the previous one
+            buffers += d
+          }
+          lastBucketValue = d // update the previous boundary
+        }
+
+        // the last bucket should end at the final element of the array
+        if (!buffers.contains(data2.last)) {
+          buffers += data2.last
+        }
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+      }
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save the data to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + hdfsPath)
+    }
+  }
+}
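Compared with makedata_15, this job reads sparse feature JSON rather than dense vectors, so a feature column can be entirely absent and is then written out with a single "0" boundary. A small sketch of that per-column handling, with plain maps standing in for fastjson:

    // Sketch: per-feature column extraction from sparse rows, with the empty-column guard above.
    object AdBucketColumnSketch {
      def columnValues(rows: Seq[Map[String, Double]], feature: String): Array[Double] =
        rows.map(_.getOrElse(feature, 0.0)).filter(_ > 1e-8).toArray.sorted

      def main(args: Array[String]): Unit = {
        val rows = Seq(Map("cpa" -> 2.5, "viewAll" -> 10.0), Map("viewAll" -> 3.0))
        val cpa = columnValues(rows, "cpa")
        val missing = columnValues(rows, "ecpm_all")
        println(if (missing.isEmpty) "ecpm_all\t100\t0" else missing.mkString(",")) // empty column -> "0" line
        println(cpa.mkString(","))                                                  // 2.5
      }
    }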

+ 118 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala

@@ -0,0 +1,118 @@
+package com.aliyun.odps.spark.examples.makedata_ad
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+   makedata_ad_33: bucketize ad sample features with 20240622_ad_bucket_249.txt and emit "label \t name:value ..." training rows.
+ */
+
+object makedata_ad_33_bucketData_20240622 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource("20240622_ad_bucket_249.txt")
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/33_ad_train_data/")
+    val beginStr = param.getOrElse("beginStr", "20240620")
+    val endStr = param.getOrElse("endStr", "20240620")
+    val repartition = param.getOrElse("repartition", "200").toInt
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("开始执行:" + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            !Set("12").contains(apptype)
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("ad_is_conversion", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  if (score > 1E-8) {
+                    if (bucketsMap.contains(name)){
+                      val (_, buckets) = bucketsMap(name)
+                      val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                      name + ":" + scoreNew.toString
+                    }else{
+                      name + ":" + score.toString
+                    }
+                  } else {
+                    ""
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+      })
+
+      // 4 Save the data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
+    }
+
+
+
+  }
+}
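The encoding above differs from makedata_16 in two details: a feature with no learned boundaries keeps its raw score, and the normaliser is buckets.length + 1 rather than the nominal bucket count. A sketch of that branch; findPos is again an assumed stand-in for ExtractorUtils.findInsertPosition:

    // Sketch of the bucket-or-raw encoding above.
    object AdBucketEncodeSketch {
      def findPos(buckets: Array[Double], score: Double): Int =
        buckets.count(_ < score) // assumed equivalent for sorted boundaries

      def encode(name: String, score: Double, bucketsMap: Map[String, Array[Double]]): String =
        if (score <= 1e-8) ""
        else bucketsMap.get(name) match {
          case Some(buckets) =>
            val scoreNew = 1.0 / (buckets.length + 1) * (findPos(buckets, score) + 1.0)
            s"$name:$scoreNew"
          case None => s"$name:$score" // no boundaries learned for this feature -> keep the raw value
        }

      def main(args: Array[String]): Unit = {
        val bucketsMap = Map("cpa" -> Array(1.0, 2.0, 4.0))
        println(encode("cpa", 2.5, bucketsMap))      // cpa:0.75
        println(encode("cid_123", 1.0, bucketsMap))  // cid_123:1.0
      }
    }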

+ 246 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyDateUtils.scala

@@ -0,0 +1,246 @@
+package com.aliyun.odps.spark.examples.myUtils
+import java.text.SimpleDateFormat
+import java.util.{Calendar, Date}
+
+import org.apache.commons.lang.time.DateUtils
+import org.apache.commons.lang3.time.DateUtils.addDays
+
+import scala.collection.mutable.ArrayBuffer
+
+object MyDateUtils {
+
+  val date_sdf = getYesterday()
+  val date_sdf_ = getYesterday_()
+  val date_sdf_full = ""
+
+
+
+
+  // today's date (yyyyMMdd)
+  def getNowDate(): String = {
+    var now: Date = new Date()
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
+    var hehe = dateFormat.format(now)
+    hehe
+  }
+  def getNowDate_(): String = {
+    var now: Date = new Date()
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    var hehe = dateFormat.format(now)
+    hehe
+  }
+
+  // yesterday's date (yyyyMMdd)
+  def getYesterday(): String = {
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
+    var cal: Calendar = Calendar.getInstance()
+    cal.add(Calendar.DATE, -1)
+    var yesterday = dateFormat.format(cal.getTime())
+    yesterday
+  }
+  def getYesterday_(): String = {
+    var dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    var cal: Calendar = Calendar.getInstance()
+    cal.add(Calendar.DATE, -1)
+    var yesterday = dateFormat.format(cal.getTime())
+    yesterday
+  }
+
+  // first day (Monday) of the current week
+  def getNowWeekStart(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance()
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    cal.set(Calendar.DAY_OF_WEEK, Calendar.MONDAY)
+    // date of this week's Monday
+    period = df.format(cal.getTime())
+    period
+  }
+
+  // last day (Sunday) of the current week
+  def getNowWeekEnd(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance();
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    cal.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY); // this gives last week's Sunday, since Sunday is treated as the first day of the week
+    cal.add(Calendar.WEEK_OF_YEAR, 1) // add one week to get this week's Sunday
+    period = df.format(cal.getTime())
+    period
+  }
+
+  // first day of the current month
+  def getNowMonthStart(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance();
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    cal.set(Calendar.DATE, 1)
+    period = df.format(cal.getTime()) // first day of the month
+    period
+  }
+
+  // last day of the current month
+  def getNowMonthEnd(): String = {
+    var period: String = ""
+    var cal: Calendar = Calendar.getInstance();
+    var df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    cal.set(Calendar.DATE, 1)
+    cal.roll(Calendar.DATE, -1)
+    period = df.format(cal.getTime()) // last day of the month
+    period
+  }
+
+  // "秒"时间戳 转 日期
+  def DateFormat(time:String):String={
+    var sdf:SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+    var date:String = sdf.format(new Date((time.toLong*1000l)))
+    date
+  }
+
+  // "秒"时间戳 转 日期
+  def DateFormat_yyyyMMdd(time:String):String={
+    var sdf:SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
+    var date:String = sdf.format(new Date((time.toLong*1000l)))
+    date
+  }
+
+  // "秒"时间戳 转 当天时间
+  def timeFormat(time:String):String={
+    var sdf:SimpleDateFormat = new SimpleDateFormat("HH:mm:ss")
+    var date:String = sdf.format(new Date((time.toLong*1000l)))
+    date
+  }
+
+  // "yyyy-MM-dd HH:mm:ss" string -> epoch seconds
+  def tranTimeToLong(tm:String) :Long={
+    val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+    val dt = fm.parse(tm)
+    val aa = fm.format(dt)
+    val tim: Long = dt.getTime()
+    tim / 1000
+  }
+
+  // "yyyyMMdd" date string -> epoch seconds
+  def tranTimeString_yyyyMMdd_ToLong(tm:String) :Long={
+    val fm = new SimpleDateFormat("yyyyMMdd")
+    val dt = fm.parse(tm)
+    val aa = fm.format(dt)
+    val tim: Long = dt.getTime()
+    tim / 1000
+  }
+
+  // epoch millis -> Date
+  def formatDateMillToMut(mill:Long)= {
+    val date = new Date(mill)
+    date
+  }
+
+  // date shifting
+  def getNumDaysBefore(dt:String,num:Int, pattern:String = "yyyyMMdd"): String ={
+    val sdf = new SimpleDateFormat(pattern)
+    val enddate= sdf.parse(dt)
+    val rightNow = Calendar.getInstance()
+    rightNow.setTime(enddate)
+    rightNow.add(Calendar.DAY_OF_YEAR,-num);// go back num days
+    val begindate =rightNow.getTime()
+    val time_begin = sdf.format(begindate)
+    time_begin
+  }
+
+  def getNumDaysAfter(dt:String,num:Int, pattern:String = "yyyyMMdd"): String ={
+    val sdf = new SimpleDateFormat(pattern)
+    val enddate= sdf.parse(dt)
+    val rightNow = Calendar.getInstance()
+    rightNow.setTime(enddate)
+    rightNow.add(Calendar.DAY_OF_YEAR, num) // go forward num days
+    val begindate =rightNow.getTime()
+    val time_begin = sdf.format(begindate)
+    time_begin
+  }
+
+  // "20190101" -> "2019-01-01"
+  def dt2Dt(dt:String) : String={
+    dt.substring(0, 4) + "-" + dt.substring(4, 6) +"-" +dt.substring(6, 8)
+  }
+
+  // Date-range generation 1: every "yyyyMMdd" day from beginStr to endStr, inclusive
+  def fromBeginDate2EndDate(beginStr:String, endStr:String): Array[String] ={
+    val date_format = new SimpleDateFormat("yyyyMMdd")
+    var from = DateUtils.parseDate(beginStr, Array[String]("yyyyMMdd"))
+    val to = DateUtils.parseDate(endStr, Array[String]("yyyyMMdd"))
+    var result = new ArrayBuffer[String]()
+    while (from.compareTo(to) <= 0) {
+      val dateStr = date_format.format(from)
+      result.append(dateStr)
+      from = DateUtils.addDays(from, 1)
+    }
+    result.toArray
+  }
+  // Date-range generation 2: same idea, with a configurable pattern
+  def getDateRange(beginStr: String, endStr: String, format: String = "yyyyMMdd"): ArrayBuffer[String] = {
+    val ranges = ArrayBuffer[String]()
+    val sdf = new SimpleDateFormat(format)
+    var dateBegin = sdf.parse(beginStr)
+    var dateEnd = sdf.parse(endStr)
+    while (dateBegin.compareTo(dateEnd) <= 0) {
+      ranges += sdf.format(dateBegin)
+      dateBegin = addDays(dateBegin, 1)
+    }
+    ranges
+  }
+
+  // Date+hour range generation, stepping one hour at a time
+  def getDateHourRange(beginStr: String, endStr: String, format: String = "yyyyMMddHH"): ArrayBuffer[String] = {
+    val ranges = ArrayBuffer[String]()
+    val sdf = new SimpleDateFormat(format)
+    var dateBegin = sdf.parse(beginStr)
+    val dateEnd = sdf.parse(endStr)
+
+    while (dateBegin.compareTo(dateEnd) <= 0) {
+      ranges += sdf.format(dateBegin)
+      // advance the cursor by one hour
+      dateBegin = addHours(dateBegin, 1)
+    }
+    ranges
+  }
+
+  import java.util.Date
+
+  // Helper: add a number of hours to a given Date
+  def addHours(date: Date, hours: Int): Date = {
+    val cal = Calendar.getInstance()
+    cal.setTime(date)
+    cal.add(java.util.Calendar.HOUR_OF_DAY, hours)
+    cal.getTime
+  }
+
+  import java.time.LocalDate
+  import java.time.temporal.ChronoUnit
+  def calculateDateDifference(startDate: String, endDate: String): Long = {
+    val start = LocalDate.parse(startDate, java.time.format.DateTimeFormatter.BASIC_ISO_DATE)
+    val end = LocalDate.parse(endDate, java.time.format.DateTimeFormatter.BASIC_ISO_DATE)
+    val days = ChronoUnit.DAYS.between(start, end)
+    days
+  }
+
+
+  def main(args: Array[String]): Unit = {
+//    var from = DateUtils.parseDate("2019-09-01", Array[String]("yyyy-MM-dd"))
+//    var to = DateUtils.parseDate("2019-09-10", Array[String]("yyyy-MM-dd"))
+//
+//    val a = from.getTime / 3600
+//    val b = to.getTime / 3600
+//    println(b-a)
+
+    var from = getDateHourRange("2024050123", "2024050203")
+    from.foreach(println)
+
+    val partitionPrefix = "dt=%s,hh=%s"
+    println(partitionPrefix.format("XX", "YY"))
+
+    val stdxx = "2024050116"
+    val dt = stdxx.substring(0, 8)
+    val hh = stdxx.substring(8, 10)
+    println(dt)
+    println(hh)
+  }
+}
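A minimal usage sketch (illustrative only, not part of the diff above), assuming MyDateUtils sits in the com.aliyun.odps.spark.examples.myUtils package like the other helpers in this commit: it enumerates hourly values with getDateHourRange and turns each "yyyyMMddHH" string into the dt/hh partition spec that main() experiments with.

    import com.aliyun.odps.spark.examples.myUtils.MyDateUtils

    object PartitionSpecSketch {
      def main(args: Array[String]): Unit = {
        // one entry per hour: "2024050123", "2024050200", ..., "2024050203"
        val hours = MyDateUtils.getDateHourRange("2024050123", "2024050203")
        // split each value into its date and hour parts and build a partition spec
        val specs = hours.map(s => "dt=%s,hh=%s".format(s.substring(0, 8), s.substring(8, 10)))
        specs.foreach(println) // dt=20240501,hh=23 ... dt=20240502,hh=03
      }
    }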

+ 148 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyHdfsUtils.scala

@@ -0,0 +1,148 @@
+package com.aliyun.odps.spark.examples.myUtils
+
+/**
+ * Author: zhangbo58
+ * Description:
+ *
+ */
+import org.apache.commons.lang.time.DateUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.permission.FsPermission
+import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
+
+import scala.collection.mutable.ArrayBuffer
+
+object MyHdfsUtils {
+  def main(args: Array[String]): Unit = {
+    val path = "zhangbo58/"
+    // exercise the helpers on a sample path
+    println("first-level files and directories under the path:")
+    getFilesAndDirs(path).foreach(println)
+    println("first-level files under the path:")
+    getFiles(path).foreach(println)
+    println("first-level directories under the path:")
+    getDirs(path).foreach(println)
+    println("all files under the path, recursively:")
+    getAllFiles(path).foreach(println)
+  }
+
+  def getHdfs(path: String): FileSystem = {
+    val conf = new Configuration()
+    //    FileSystem.get(URI.create(path), conf)
+    val fs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+    fs
+  }
+
+  // first-level files and directories under a path
+  def getFilesAndDirs(path: String): Array[Path] = {
+    val fs = getHdfs(path).listStatus(new Path(path))
+    FileUtil.stat2Paths(fs)
+  }
+  // first-level files under a path
+  def getFiles(path: String): Array[String] = {
+    getFilesAndDirs(path).filter(getHdfs(path).getFileStatus(_).isFile())
+      .map(_.toString)
+  }
+  // first-level directories under a path
+  def getDirs(path: String): Array[String] = {
+    getFilesAndDirs(path).filter(getHdfs(path).getFileStatus(_).isDirectory)
+      .map(_.toString)
+  }
+  // all files under a path, recursively
+  def getAllFiles(path: String): ArrayBuffer[String] = {
+    val arr = ArrayBuffer[String]()
+    val hdfs = getHdfs(path)
+    val getPath = getFilesAndDirs(path)
+    getPath.foreach(patha => {
+      if (hdfs.getFileStatus(patha).isFile())
+        arr += patha.toString
+      else {
+        arr ++= getAllFiles(patha.toString())
+      }
+    })
+    arr
+  }
+  def ifHDFSHasData(path: String): Boolean = {
+    val hdfs_path = new org.apache.hadoop.fs.Path(path.toString)
+    val hdfs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+
+    var rst = false
+    if (hdfs.exists(hdfs_path)) {
+      // the path exists; check that it actually holds non-empty data files
+      val statusList = hdfs.listStatus(hdfs_path)
+      for (status <- statusList if !rst && (status.getPath.toString.contains("part-") || status.getPath.toString.contains("_SUCCESS"))) {
+        if (status.getLen > 0) {
+          rst = true
+        }
+      }
+    }
+    rst
+  }
+
+  def delete_hdfs_path(path: String): Unit = {
+    val hdfs_path = new org.apache.hadoop.fs.Path(path.toString)
+    val hdfs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+
+    if (hdfs.exists(hdfs_path)) {
+      hdfs.delete(hdfs_path, true)
+    }
+  }
+
+  def hdfs_exits(path:String): Boolean = {
+    val hdfs_path = new org.apache.hadoop.fs.Path(path.toString)
+    val hdfs = org.apache.hadoop.fs.FileSystem.get(new org.apache.hadoop.conf.Configuration())
+
+    hdfs.exists(hdfs_path)
+  }
+
+
+  /* Delete all data under a path whose date falls outside the kept window.
+   * Example: keepDays=2, dateStr=20191015 => keep the 20191015 and 20191014 partitions and delete the rest.
+   */
+  def hdfs_delete_not_keep_days(
+                                 path:String,
+                                 keepDays:Int,
+                                 dateStr:String,
+                                 pattern:String = "yyyyMMdd"
+                               ): Unit ={
+    val file_list = this.getFiles(path)
+    println("hdfs_delete_not_keep_days-file_list")
+    file_list.foreach(println)
+
+    for (file <- file_list){
+      var flag = true
+      val date_early = MyDateUtils.getNumDaysBefore(dateStr, keepDays, pattern)
+      try{
+        val file_split_strs = file.split("/")
+        val len = file_split_strs.length
+        var file_date = file_split_strs(len-1)
+        if (file_date.equals("")){
+          file_date = file_split_strs(len-2)
+        }
+        var date1 = DateUtils.parseDate(file_date, Array[String](pattern)) // date parsed from the file path
+        var date2 = DateUtils.parseDate(date_early, Array[String](pattern)) // cutoff: dateStr minus keepDays
+        if (date1.compareTo(date2) >= 0){ // inside the kept window: do not delete
+          flag = false
+        }
+      }catch {
+        case e:Exception =>
+          flag = false
+      }
+
+      if (flag){
+        MyHdfsUtils.delete_hdfs_path(file.toString)
+      }
+    }
+  }
+
+  /**
+   * @Author: zhangbo
+   * @Description: grant open (777) permissions on an HDFS path
+   *
+   */
+
+  def give_hdfs_permission(path:String): Unit ={
+    getHdfs(path).setPermission(new Path(path), new FsPermission("777"))
+  }
+}
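A small illustrative sketch (not part of the diff) tying these helpers to the daily jobs recorded later in this commit: check that the input partition has data, clear the output path so the write is idempotent, and trim partitions older than the retention window. The concrete paths and the seven-day retention are assumptions of the example.

    import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils}

    object HdfsHousekeepingSketch {
      def main(args: Array[String]): Unit = {
        val dt = "20240615" // in the real jobs this comes in via key:value arguments
        val readPath = "/dw/recommend/model/13_sample_data/" + dt
        val savePath = "/dw/recommend/model/14_feature_data/" + dt
        if (MyHdfsUtils.ifHDFSHasData(readPath)) {
          MyHdfsUtils.delete_hdfs_path(savePath) // remove any partial output from a previous run
          // ... produce and write the new partition here ...
        }
        // keep only the most recent 7 daily partitions under the output root
        MyHdfsUtils.hdfs_delete_not_keep_days("/dw/recommend/model/14_feature_data/", 7, dt)
      }
    }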
+

+ 40 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/ParamUtils.scala

@@ -0,0 +1,40 @@
+package com.aliyun.odps.spark.examples.myUtils
+
+import scala.collection.mutable
+object ParamUtils {
+  def parseArgs(args: Array[String]): mutable.HashMap[String, String] = {
+    println("args size:" + args.size)
+
+    val rst = new mutable.HashMap[String, String]() {
+      override def default(key: String) = "无参数传入"
+    }
+    for (a <- args) {
+      val key_val = a.split(":")
+      if (key_val.length >= 2) {
+        // if the same key appears more than once (e.g. glob-expanded hdfs paths), merge the values with a comma
+        if (rst.contains(key_val(0))) {
+          val value = rst.get(key_val(0)).get
+          val newValue = value + "," + key_val.splitAt(1)._2.mkString(":")
+          rst += (key_val(0) -> newValue)
+          println(key_val(0) + ":" + newValue)
+        } else {
+          rst += (key_val(0) -> key_val.splitAt(1)._2.mkString(":"))
+          println(key_val(0) + ":" + key_val.splitAt(1)._2.mkString(":"))
+        }
+      }
+    }
+    rst
+  }
+
+  def parseLogKey(logKey: String): Tuple7[String, String, String, String, String, String, String] = {
+    val l = logKey.split(":")
+    val mid = l(0)
+    val videoid = l(1)
+    val logtimestamp = l(2)
+    val apptype = l(3)
+    val pagesource_change = l(4)
+    val abcode = l(5)
+    val video_recommend = l(6)
+    (mid, videoid, logtimestamp, apptype, pagesource_change, abcode, video_recommend)
+  }
+}
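An illustrative sketch (not part of the diff) of how parseArgs consumes the "key:value" tokens that the spark-submit commands later in this commit pass on the command line; values that themselves contain ':' stay intact because everything after the first token is re-joined.

    import com.aliyun.odps.spark.examples.myUtils.ParamUtils

    object ParamUtilsSketch {
      def main(argv: Array[String]): Unit = {
        val args = Array("beginStr:20240615", "endStr:20240615", "savePath:/dw/recommend/model/16_train_data/")
        val param = ParamUtils.parseArgs(args)
        println(param.getOrElse("beginStr", "20240615")) // 20240615
        println(param.getOrElse("savePath", ""))         // /dw/recommend/model/16_train_data/
        println(param("missingKey"))                     // falls back to the map's default value

        // logKey layout: mid:videoid:logtimestamp:apptype:pagesource_change:abcode:video_recommend
        val (mid, vid, ts, _, _, _, _) =
          ParamUtils.parseLogKey("someMid:12345:1718000000:0:recommend:ab0:1")
        println(mid + " " + vid + " " + ts)
      }
    }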

+ 39 - 0
src/main/scala/com/aliyun/odps/spark/examples/myUtils/env.scala

@@ -0,0 +1,39 @@
+package com.aliyun.odps.spark.examples.myUtils
+
+import org.apache.spark.SparkContext
+import org.apache.spark.aliyun.odps.OdpsOps
+import org.springframework.data.redis.connection.RedisStandaloneConfiguration
+import org.springframework.data.redis.connection.jedis.JedisConnectionFactory
+import org.springframework.data.redis.core.RedisTemplate
+import org.springframework.data.redis.serializer.StringRedisSerializer
+import examples.dataloader.redisBuilderMyself
+
+object env {
+  def getODPS(sparkContext: SparkContext): OdpsOps = {
+    val accessKeyId = "LTAIWYUujJAm7CbH"
+    val accessKeySecret = "RfSjdiWwED1sGFlsjXv0DlfTnZTG1P"
+    val odpsUrl = "http://service.odps.aliyun.com/api"
+    val tunnelUrl = "http://dt.cn-hangzhou.maxcompute.aliyun-inc.com"
+
+    OdpsOps(sparkContext, accessKeyId, accessKeySecret, odpsUrl, tunnelUrl)
+  }
+
+  def getRedisTemplate(): RedisTemplate[String, String] = {
+    // shared Redis template
+    val redisSC = new RedisStandaloneConfiguration
+    redisSC.setPort(6379)
+    redisSC.setPassword("Wqsd@2019")
+    redisSC.setHostName("r-bp1pi8wyv6lzvgjy5z.redis.rds.aliyuncs.com")
+    val jedisCF = new JedisConnectionFactory(redisSC)
+    jedisCF.afterPropertiesSet()
+    val redisTemplate = new RedisTemplate[String, String]
+    redisTemplate.setDefaultSerializer(new StringRedisSerializer)
+    redisTemplate.setConnectionFactory(jedisCF)
+    redisTemplate.afterPropertiesSet()
+    redisTemplate
+  }
+
+  def getRedisTemplatev2(): RedisTemplate[String, String] = {
+    redisBuilderMyself.redisTemplate(redisBuilderMyself.redisConnectionFactory())
+  }
+}
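A short illustrative sketch (not part of the diff) of how a job obtains these handles; the SparkSession bootstrap and the sample key are assumptions of the example.

    import org.apache.spark.sql.SparkSession
    import com.aliyun.odps.spark.examples.myUtils.env

    object EnvSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("EnvSketch").getOrCreate()
        val odpsOps = env.getODPS(spark.sparkContext) // handle for reading/writing MaxCompute tables
        val redisTemplate = env.getRedisTemplate()    // shared String/String template from above
        redisTemplate.opsForValue().set("demo:key", "demo:value") // illustrative write only
        spark.stop()
      }
    }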

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala

@@ -55,7 +55,7 @@ object SparkSQL {
 
     // 写 普通表
     df.write.insertInto(tableName) // insertInto语义
-    df.writeTo(tableName).overwritePartitions() // insertOverwrite use datasourceV2
+//    df.writeTo(tableName).overwritePartitions() // insertOverwrite use datasourceV2
 
     // 写 分区表
     // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区

+ 161 - 0
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本

@@ -0,0 +1,161 @@
+
+[New upstream samples]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_10_originData_v3 \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 64 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 savePath:/dw/recommend/model/10_sample_data_v3/ beginStr:20240227 endStr:20240227 > p10_.log 2>&1 &
+
+[ROS sample generation]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_12_rosData_v3 \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/12_ros_data_v3/ beginStr:20240228 endStr:20240228 ifRepart:10 \
+> p12_1.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_12_rosData_v3_noweight \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/12_ros_data_v3_noweight/ beginStr:20240222 endStr:20240226 ifRepart:10 \
+> p12_2.log 2>&1 &
+
+[STR sample generation]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_11_strData_v3 \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 64 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/11_str_data_v3/ beginStr:20240227 endStr:20240227 ifRepart:100 \
+> p11.log 2>&1 &
+
+
+[Write user features to Redis]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_09_user2redis_freq \
+--name makedata_09_user2redis_freq \
+--master yarn --driver-memory 1G --executor-memory 4G --executor-cores 1 --num-executors 32 \
+--conf spark.yarn.executor.memoryoverhead=1024 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+date:20240302 tablePart:96 expireDay:3 ifWriteRedisUser:True ifUser:True midDays:14 redisLimit:80000000 \
+savePathUser:/dw/recommend/model/09_feature/user/ > p09.log 2>&1 &
+
+
+
+--------------
+[Old STR upstream samples]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_06_originData \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 32 \
+--conf spark.yarn.executor.memoryoverhead=1024 \
+--conf spark.shuffle.service.enabled=true \
+--conf spark.shuffle.service.port=7337 \
+--conf spark.shuffle.consolidateFiles=true \
+--conf spark.shuffle.manager=sort \
+--conf spark.storage.memoryFraction=0.4 \
+--conf spark.shuffle.memoryFraction=0.5 \
+--conf spark.default.parallelism=200 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 savePath:/dw/recommend/model/00_sample_data/ beginStr:20240311 endStr:20240312 > p6.log 2>&1 &
+[Old STR training data]
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_07_strData \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+savePath:/dw/recommend/model/04_str_data/ beginStr:20240311 endStr:20240312 featureVersion:v4 ifRepart:100 \
+> p7.log 2>&1 &
+
+---
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_13_originData_20240529 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024061600 endStr:2024061623 \
+savePath:/dw/recommend/model/13_sample_data/ \
+table:alg_recsys_sample_all \
+> p13_2024061600.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_14_valueData_20240608 \
+--master yarn --driver-memory 1G --executor-memory 3G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/13_sample_data/ \
+savePath:/dw/recommend/model/14_feature_data/ \
+beginStr:20240615 endStr:20240615 repartition:1000 \
+> p14_data_check.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_15_bucket_20240608 \
+--master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+--conf spark.driver.maxResultSize=16G \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/14_feature_data/20240606/ fileName:20240606_200_v3 \
+bucketNum:200 sampleRate:0.1 \
+> p15_data2.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_16_bucketData_20240609 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:20240615 endStr:20240615 repartition:1000 \
+> p16_data.log 2>&1 &
+
+
+/dw/recommend/model/13_sample_data/
+/dw/recommend/model/14_feature_data/
+/dw/recommend/model/16_train_data/
+
+-----
+Run only this one when using the features printed online:
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_13_originData_20240529_check \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024061500 endStr:2024061523 \
+savePath:/dw/recommend/model/13_sample_data_check_print/ \
+table:alg_recsys_sample_all_new \
+> p13_2024061500_check.log 2>&1 &
+
+Run both of these to filter out unwanted samples:
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_16_bucketData_20240609_check \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/14_feature_data_check_print/ \
+savePath:/dw/recommend/model/16_train_data_check_print/ \
+beginStr:20240615 endStr:20240615 repartition:1000 \
+> p16_data_check.log 2>&1 &
+
+/dw/recommend/model/13_sample_data_check/
+/dw/recommend/model/13_sample_data_check_print/
+/dw/recommend/model/14_feature_data_check/
+/dw/recommend/model/14_feature_data_check_print/
+/dw/recommend/model/16_train_data_check/
+/dw/recommend/model/16_train_data_check_print/
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_17_bucketDataPrint_20240617 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+> p17_data_check.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_18_mergehour2day_20240617 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+> p18_data_check.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_17_bucketDataPrint_20240617 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:2024061800 endStr:2024061814 \
+readDate:20240618 \
+> p17_data_check.log 2>&1 &

+ 34 - 0
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -0,0 +1,34 @@
+
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_31_originData_20240620 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024062009 endStr:2024062023 \
+savePath:/dw/recommend/model/31_ad_sample_data/ \
+table:alg_recsys_ad_sample_all \
+> p31_2024062008.log 2>&1 &
+
+
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_32_bucket_20240622 \
+--master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+--conf spark.driver.maxResultSize=16G \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+> p32_data.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_33_bucketData_20240622 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:20240620 endStr:20240620 repartition:400 \
+> p33_data.log 2>&1 &
+
+
+/dw/recommend/model/31_ad_sample_data/
+/dw/recommend/model/33_ad_train_data/

+ 8 - 0
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本【分析】

@@ -0,0 +1,8 @@
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.ana.ana_01_cidvidpk \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:5 \
+beginStr:2024060211 endStr:2024060211 \
+vidSelect:21006075 cidsSelect:1155,1902 apptype:0 \
+> p01_ana.log 2>&1 &

+ 29 - 0
zhangbo/00_copy.sh

@@ -0,0 +1,29 @@
+#!/bin/sh
+
+#MVN_PACKAGE="mvn clean install  -T 2C -Dmaven.test.skip=true -Dmaven.compile.fork=true"
+JAVA_PATH="/usr/bin/java"
+PYTHON_PATH="/usr/bin/python"
+UPLOAD_PY_PATH="/root/algo/upload.py"
+JAR_PATH="/root/algo/recommend-server/recommend-server-service/target/recommend-server-service.jar"
+FM_PATH="/root/algo/alphaFM/bin"
+MODEL_PATH="/root/algo/LR_MODEL/"
+YESTERDAY="$(date -d '2 days ago' +%Y%m%d)"
+LAST30DAY="$(date -d '2 days ago' +%Y%m%d)"
+MAIN_CLASS="com.tzld.piaoquan.recommend.server.dataloader.OfflineShareSamplesLoader"
+TABLE_NAME="loghubods.alg_recsys_view_sample"
+LABEL="share_ornot"
+#OSSPATH=""
+
+
+# Train
+#mkdir -p ${MODEL_PATH}/${YESTERDAY}
+#${JAVA_PATH} -jar ${JAR_PATH} ${TABLE_NAME} ${LAST30DAY} ${YESTERDAY} ${LABEL} | ${FM_PATH}/fm_train -m ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt -dim 0,1,0 -core 8
+
+#cat ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt | awk -F " " '{print $1,"\t",$2}' > ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt
+
+# Upload
+#${UPLOAD_PY_PATH} ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt ${OSSPATH}
+
+# Predict
+java -jar ${JAR_PATH} $TABLE_NAME 20231211 20231211 ${LABEL}| ${FM_PATH}/fm_predict -m ${MODEL_PATH}/20231210/model_20231210.txt  -dim 0 -core 8 -out ${MODEL_PATH}/predict_1211.txt
+

+ 16 - 0
zhangbo/01_train.sh

@@ -0,0 +1,16 @@
+#!/bin/sh
+set -e
+set -x
+
+day=$1
+train_path=$2
+model_name=$3
+bias=$4 # 0,1,0 1,1,0
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+$HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_train -m model/${model_name}_${day}.txt -dim ${bias} -core 8
+# -v_l1 ${v_l1} -v_l2 ${v_l2}
+
+# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka8 1,1,8 >p1_model_aka8.log 2>&1 &
+# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka0 1,1,0 >p1_model_aka0.log 2>&1 &
+# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka4 1,1,4 >p1_model_aka4.log 2>&1 &

+ 25 - 0
zhangbo/02_train_go.sh

@@ -0,0 +1,25 @@
+#!/bin/sh
+set -ex
+
+start_date=$1
+end_date=$2
+model_name=$3
+MODEL_PATH="./model/"
+SAMPLE_PATH=$4
+bias=$5
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+FM_TRAIN="/root/sunmingze/alphaFM/bin/fm_train"
+
+current_date="$start_date"
+
+while [[ "$current_date" != "$end_date" ]]; do
+    echo -------"$current_date"----------
+
+    yesterday=$(date -d "$current_date - 1 day" +%Y%m%d)
+    echo model-day-$yesterday
+    echo data-day-$current_date
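+    # stream the day's samples from HDFS into alphaFM, warm-starting from yesterday's model via -im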
+    $HADOOP fs -text ${SAMPLE_PATH}/$current_date/* | ${FM_TRAIN} -m $MODEL_PATH/${model_name}_$current_date.txt -dim ${bias} -core 8 -im $MODEL_PATH/${model_name}_$yesterday.txt
+    current_date=$(date -d "$current_date + 1 day" +%Y%m%d)
+done
+
+# nohup sh 02_train_go.sh 20240615 20240616 model_aka8 /dw/recommend/model/16_train_data/ 1,1,8 >p2_model_aka8.log 2>&1 &

+ 33 - 0
zhangbo/03_predict.sh

@@ -0,0 +1,33 @@
+#!/bin/sh
+set -e
+set -x
+
+day=$1
+train_path=$2
+model_name=$3
+output_file=$4
+bias=$5
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+$HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_predict -m model/$model_name -dim ${bias} -core 8 -out predict/${output_file}_$day.txt
+cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
+
+
+# nohup sh 03_predict.sh 20240611 /dw/recommend/model/16_train_data/ model_aka0_20240610.txt model_aka0_20240610 0 >p3_model_aka0.log 2>&1 &
+# nohup sh 03_predict.sh 20240611 /dw/recommend/model/16_train_data/ model_aka4_20240610.txt model_aka4_20240610 4 >p3_model_aka4.log 2>&1 &
+# nohup sh 03_predict.sh 20240613 /dw/recommend/model/16_train_data/ model_aka8_20240612.txt model_aka8_20240612 8 >p3_model_aka8_12.log 2>&1 &
+
+
+# nohup sh 03_predict.sh 20240615 /dw/recommend/model/16_train_data_print_online_merge/ model_aka8_20240608.txt model_aka8_20240608 8 >p3_model_aka8_on.log 2>&1 &
+
+
+
+
+# cat tmpfile | /root/sunmingze/alphaFM/bin/fm_predict -m model/model_aka8_20240608.txt -dim 8 -core 1 -out tmpfile_out.txt
+
+
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v1/ model_aka8_20240608.txt v1 8 >v1.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v2/ model_aka8_20240608.txt v2 8 >v2.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v3/ model_aka8_20240608.txt v3 8 >v3.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v4/ model_aka8_20240608.txt v4 8 >v4.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v5/ model_aka8_20240608.txt v4 8 >v5.log 2>&1 &
+# nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v6/ model_aka8_20240608.txt v4 8 >v6.log 2>&1 &

+ 25 - 0
zhangbo/04_upload.sh

@@ -0,0 +1,25 @@
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240313.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240313_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240313_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/video_str_model/model_str_mid.txt
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka0_20240608.txt | awk -F " " '{print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka0_20240608_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka0_20240608_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_aka0.txt
+
+
+
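+# keep at most the first 10 space-separated fields of each alphaFM model line (feature name, weight, leading factor values), tab-separated, before uploading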
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608.txt |
+awk -F " " '{
+    if (NR == 1) {
+        print $1"\t"$2
+    } else {
+        split($0, fields, " ");
+        OFS="\t";
+        line=""
+        for (i = 1; i <= 10 && i <= length(fields); i++) {
+            line = (line ? line "\t" : "") fields[i];
+        }
+        print line
+    }
+}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt
+
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_aka8.txt

+ 151 - 0
zhangbo/05_update_everyday_2model.sh

@@ -0,0 +1,151 @@
+#!/bin/sh
+set -ex
+# 0 Global variables / parameters
+samplePath=/dw/recommend/model/10_sample_data_v3/
+savePath=/dw/recommend/model/12_ros_data_v3/
+model_name=model_jerry
+today="$(date +%Y%m%d)"
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+yesterday="$(date -d '1 days ago' +%Y%m%d)"
+
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+FM_TRAIN="/root/sunmingze/alphaFM/bin/fm_train"
+MODEL_PATH="/root/zhangbo/recommend-emr-dataprocess/zhangbo/model/"
+OSS_PATH="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/"
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# 0 Wait for the upstream table to finish, at most until 11:00
+source /root/anaconda3/bin/activate py37
+max_hour=11
+max_minute=00
+while true; do
+  python_return_code=$(python utils.py --excute_program check_hive --partition ${today_early_1} --project loghubods --table alg_recsys_view_sample_v3)
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+#conda deactivate
+
+# 1 Generate data
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_10_originData_v3 \
+--name every_day_origindata_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:32 savePath:${samplePath} beginStr:${today_early_1} endStr:${today_early_1}
+if [ $? -eq 1 ]; then
+    echo "Spark原始样本生产任务执行失败"
+    exit 1
+else
+    echo "spark原始样本生产执行成功"
+fi
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_12_rosData_v3 \
+--name makedata_12_rosData_v3_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:${samplePath} savePath:${savePath} beginStr:${today_early_1} endStr:${today_early_1} ifRepart:10
+if [ $? -eq 1 ]; then
+    echo "Spark训练样本-生产任务执行失败-ros"
+    exit 1
+else
+    echo "spark训练样本-生产执行成功-ros"
+fi
+
+# 2 Load the previous model, train on this round's data, save this round's model
+end_date=${today}
+loop_date=${yesterday}
+while [[ "$loop_date" != "$end_date" ]]; do
+    echo -------train ${loop_date}----------
+    loop_date_model=$(date -d "$loop_date - 1 day" +%Y%m%d)
+    $HADOOP fs -text ${savePath}/dt=${loop_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${loop_date}.txt \
+-dim 0,1,0 -core 8 -im ${MODEL_PATH}/${model_name}_${loop_date_model}.txt
+    if [ $? -eq 1 ]; then
+        echo "训练失败"
+        exit 1
+    fi
+    echo -------save ${MODEL_PATH}/${model_name}_${loop_date}.txt----------
+    loop_date=$(date -d "$loop_date + 1 day" +%Y%m%d)
+done
+
+# 3 Convert this round's model format
+cat ${MODEL_PATH}/${model_name}_${today_early_1}.txt \
+| sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' \
+> ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt
+
+# 4 Upload the converted model to OSS
+online_model_path=${OSS_PATH}/${model_name}.txt
+$HADOOP fs -test -e ${online_model_path}
+if [ $? -eq 0 ]; then
+    echo "数据存在, 先删除。"
+    $HADOOP fs -rm -r ${online_model_path}
+else
+    echo "数据不存在"
+fi
+$HADOOP fs -put ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt ${online_model_path}
+
+
+# 5 Generate STR data
+savePath=/dw/recommend/model/11_str_data_v3/
+model_name=model_tom
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_11_strData_v3 \
+--name makedata_11_strData_v3_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 64 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:${samplePath} savePath:${savePath} beginStr:${today_early_1} endStr:${today_early_1} ifRepart:100
+if [ $? -eq 1 ]; then
+    echo "Spark训练样本-生产任务执行失败-str"
+    exit 1
+else
+    echo "spark训练样本-生产执行成功-str"
+fi
+# 6 Load the previous model, train on this round's data, save this round's model
+end_date=${today}
+loop_date=${yesterday}
+while [[ "$loop_date" != "$end_date" ]]; do
+    echo -------train ${loop_date}----------
+    loop_date_model=$(date -d "$loop_date - 1 day" +%Y%m%d)
+    $HADOOP fs -text ${savePath}/dt=${loop_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${loop_date}.txt \
+-dim 0,1,0 -core 8 -im ${MODEL_PATH}/${model_name}_${loop_date_model}.txt
+    if [ $? -eq 1 ]; then
+        echo "训练失败"
+        exit 1
+    fi
+    echo -------save ${MODEL_PATH}/${model_name}_${loop_date}.txt----------
+    loop_date=$(date -d "$loop_date + 1 day" +%Y%m%d)
+done
+
+# 7 Convert this round's model format
+cat ${MODEL_PATH}/${model_name}_${today_early_1}.txt \
+| sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' \
+> ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt
+
+# 8 Upload the converted model to OSS
+online_model_path=${OSS_PATH}/${model_name}.txt
+$HADOOP fs -test -e ${online_model_path}
+if [ $? -eq 0 ]; then
+    echo "数据存在, 先删除。"
+    $HADOOP fs -rm -r ${online_model_path}
+else
+    echo "数据不存在"
+fi
+$HADOOP fs -put ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt ${online_model_path}
+
+
+# nohup sh 05_update_everyday_2model.sh > p5.log 2>&1 &

+ 107 - 0
zhangbo/05_update_everyday_str.sh

@@ -0,0 +1,107 @@
+#!/bin/sh
+set -ex
+# 0 Global variables / parameters
+samplePath=/dw/recommend/model/00_sample_data/
+savePath=/dw/recommend/model/04_str_data/
+model_name=model_str_mid
+today="$(date +%Y%m%d)"
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+yesterday="$(date -d '1 days ago' +%Y%m%d)"
+
+#today=20240129
+#today_early_1=20240128
+#yesterday=20240128
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+FM_TRAIN="/root/sunmingze/alphaFM/bin/fm_train"
+MODEL_PATH="/root/zhangbo/recommend-emr-dataprocess/zhangbo/model/"
+OSS_PATH="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/video_str_model/"
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+# 0 Wait for the upstream table to finish, at most until 11:00 (max_hour below)
+source /root/anaconda3/bin/activate py37
+max_hour=11
+max_minute=00
+while true; do
+  python_return_code=$(python utils.py --excute_program check_origin_hive --partition ${today_early_1})
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+#conda deactivate
+
+# 1 Generate data
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_06_originData \
+--name every_day_origindata_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:32 savePath:${samplePath} beginStr:${today_early_1} endStr:${today_early_1}
+if [ $? -eq 1 ]; then
+    echo "Spark原始样本生产任务执行失败"
+    exit 1
+else
+    echo "spark原始样本生产执行成功"
+fi
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_07_strData \
+--name every_day_strdata_${model_name}_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:${samplePath} savePath:${savePath} beginStr:${today_early_1} endStr:${today_early_1} featureVersion:v4 ifRepart:100
+if [ $? -eq 1 ]; then
+    echo "Spark训练样本生产任务执行失败"
+    exit 1
+else
+    echo "spark训练样本生产执行成功"
+fi
+
+
+
+# 2 Load the previous model, train on this round's data, save this round's model
+end_date=${today}
+loop_date=${yesterday}
+while [[ "$loop_date" != "$end_date" ]]; do
+    echo -------train ${loop_date}----------
+    loop_date_model=$(date -d "$loop_date - 1 day" +%Y%m%d)
+    $HADOOP fs -text ${savePath}/dt=${loop_date}/* | ${FM_TRAIN} -m ${MODEL_PATH}/${model_name}_${loop_date}.txt \
+-dim 1,1,0 -core 8 -im ${MODEL_PATH}/${model_name}_${loop_date_model}.txt
+    if [ $? -eq 1 ]; then
+        echo "训练失败"
+        exit 1
+    fi
+    echo -------save ${MODEL_PATH}/${model_name}_${loop_date}.txt----------
+    loop_date=$(date -d "$loop_date + 1 day" +%Y%m%d)
+done
+
+# 3 Convert this round's model format
+cat ${MODEL_PATH}/${model_name}_${today_early_1}.txt \
+| sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' \
+> ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt
+
+# 4 Upload the converted model to OSS
+online_model_path=${OSS_PATH}/${model_name}.txt
+$HADOOP fs -test -e ${online_model_path}
+if [ $? -eq 0 ]; then
+    echo "数据存在, 先删除。"
+    $HADOOP fs -rm -r ${online_model_path}
+else
+    echo "数据不存在"
+fi
+$HADOOP fs -put ${MODEL_PATH}/${model_name}_${today_early_1}_change.txt ${online_model_path}
+
+
+#nohup sh 05_update_everyday_str.sh > p.log 2>&1 &

+ 124 - 0
zhangbo/06_update_everyday_feature.sh

@@ -0,0 +1,124 @@
+#!/bin/sh
+set -ex
+# 0 Global variables / parameters
+today="$(date +%Y%m%d)"
+today_early_1="$(date -d '1 days ago' +%Y%m%d)"
+
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+max_hour=11
+max_minute=00
+
+# 0 Wait for the upstream table to finish, at most until 11:00 (max_hour above)
+source /root/anaconda3/bin/activate py37
+while true; do
+  python_return_code=$(python utils.py --excute_program check_item_hive --partition ${today_early_1})
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+conda deactivate
+# 1 Generate item data
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_08_item2redis \
+--name makedata_08_item2redis_${today} \
+--master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+date:${today_early_1} tablePart:32 expireDay:4 ifDebug:False \
+ifVideo:True ifWriteRedis:True savePathVideo:/dw/recommend/model/09_feature/video
+
+if [ $? -eq 1 ]; then
+    echo "---------item写入redis执行失败---------"
+    exit 1
+else
+    echo "---------item写入redis执行成功---------"
+fi
+
+# 2 Check the user upstream table
+source /root/anaconda3/bin/activate py37
+while true; do
+  python_return_code=$(python utils.py --excute_program check_user_hive --partition ${today_early_1})
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+
+# 3 Check the mid_uid upstream table
+while true; do
+  python_return_code=$(python utils.py --excute_program check_hive --partition ${today_early_1} --project loghubods --table mid_uid)
+  if [ $python_return_code -eq 0 ]; then
+    echo "Python程序返回0,退出循环。上游表loghubods.mid_uid=${today_early_1} 已生产完毕"
+    break
+  fi
+  echo "Python程序返回非0值,等待五分钟后再次调用。上游表loghubods.mid_uid=${today_early_1} 未完成"
+  sleep 300
+  current_hour=$(date +%H)
+  current_minute=$(date +%M)
+  if (( current_hour > max_hour || (current_hour == max_hour && current_minute >= max_minute) )); then
+    echo "最长等待时间已到,失败:${current_hour}-${current_minute}"
+    exit 1
+  fi
+done
+
+conda deactivate
+# 4 Generate user data
+#/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+#--class com.aliyun.odps.spark.examples.makedata.makedata_09_user2redis \
+#--name makedata_09_user2redis_${today} \
+#--master yarn --driver-memory 1G --executor-memory 4G --executor-cores 1 --num-executors 32 \
+#--conf spark.yarn.executor.memoryoverhead=1024 \
+#/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+#date:${today_early_1} tablePart:32 expireDay:3 ifDebug:False \
+#ifUser:True ifDeleteRedisUser:False ifWriteRedisUser:True sampleRate:1.0 midDays:7 \
+#savePathUser:/dw/recommend/model/feature/user/
+
+/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_09_user2redis_freq \
+--name makedata_09_user2redis_freq \
+--master yarn --driver-memory 1G --executor-memory 4G --executor-cores 1 --num-executors 32 \
+--conf spark.yarn.executor.memoryoverhead=2024 \
+--conf spark.shuffle.service.enabled=true \
+--conf spark.shuffle.service.port=7337 \
+--conf spark.shuffle.consolidateFiles=true \
+--conf spark.shuffle.manager=sort \
+--conf spark.storage.memoryFraction=0.4 \
+--conf spark.shuffle.memoryFraction=0.5 \
+--conf spark.default.parallelism=400	\
+--conf spark.speculation=true \
+--conf spark.speculation.multiplier=10 \
+--conf spark.speculation.quantile=0.75	\
+--conf spark.network.timeout=120 \
+/root/zhangbo/recommend-emr-dataprocess/target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+date:${today_early_1} tablePart:64 expireDay:3 ifWriteRedisUser:True ifUser:True midDays:14 redisLimit:100000000 \
+savePathUser:/dw/recommend/model/09_feature/user/
+
+if [ $? -eq 1 ]; then
+    echo "---------user写入redis执行失败---------"
+    exit 1
+else
+    echo "---------user写入redis执行成功---------"
+fi
+
+#nohup sh 06_update_everyday_feature.sh > p6.log 2>&1 &

+ 67 - 0
zhangbo/50_delete_hdfs.sh

@@ -0,0 +1,67 @@
+#!/bin/sh
+
+export SPARK_HOME=/opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8
+export PATH=$SPARK_HOME/bin:$PATH
+export HADOOP_CONF_DIR=/etc/taihao-apps/hadoop-conf
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0
+
+DATE="$(date -d '9 days ago' +%Y%m%d)"
+HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
+
+path="/dw/recommend/model/feature/user/dt=${DATE}"
+$HADOOP fs -test -e ${path}
+if [ $? -eq 0 ]; then
+    echo "${path} 数据存在, 删除。"
+    $HADOOP fs -rm -r -skipTrash ${path}
+else
+    echo "${path} 数据不存在"
+fi
+
+path="/dw/recommend/model/feature/video/dt=${DATE}"
+$HADOOP fs -test -e ${path}
+if [ $? -eq 0 ]; then
+    echo "${path} 数据存在, 删除。"
+    $HADOOP fs -rm -r -skipTrash ${path}
+else
+    echo "${path} 数据不存在"
+fi
+
+path="/dw/recommend/model/00_sample_data/dt=${DATE}"
+$HADOOP fs -test -e ${path}
+if [ $? -eq 0 ]; then
+    echo "${path} 数据存在, 删除。"
+    $HADOOP fs -rm -r -skipTrash ${path}
+else
+    echo "${path} 数据不存在"
+fi
+
+
+
+function delete_path() {
+    if [ "$#" -ne 2 ]; then
+        echo "Usage: delete_path <early> <path>"
+        return 1
+    fi
+    early=$1
+    path=$2
+    date="$(date -d "${early} days ago" +%Y%m%d)"
+    path_delete=${path}${date}
+    $HADOOP fs -test -e ${path_delete}
+    if [ $? -eq 0 ]; then
+        echo "${path_delete} 数据存在, 删除。"
+        if $HADOOP fs -rm -r -skipTrash "${path_delete}"; then
+            echo "删除成功。"
+        else
+            echo "删除失败。"
+        fi
+    else
+        echo "${path_delete} 数据不存在"
+    fi
+}
+
+delete_path 7 /dw/recommend/model/11_str_data_v3/dt=
+delete_path 7 /dw/recommend/model/12_ros_data_v3/dt=
+delete_path 7 /dw/recommend/model/10_sample_data_v3/dt=
+delete_path 3 /dw/recommend/model/09_feature/user/all/dt=
+delete_path 3 /dw/recommend/model/09_feature/user/true/dt=
+delete_path 3 /dw/recommend/model/09_feature/video/dt=

+ 28 - 0
zhangbo/train.sh

@@ -0,0 +1,28 @@
+#!/bin/sh
+
+#MVN_PACKAGE="mvn clean install  -T 2C -Dmaven.test.skip=true -Dmaven.compile.fork=true"
+JAVA_PATH="/usr/bin/java"
+PYTHON_PATH="/usr/bin/python"
+UPLOAD_PY_PATH="/root/algo/upload.py"
+JAR_PATH="/root/algo/recommend-server/recommend-server-service/target/recommend-server-service.jar"
+FM_PATH="/root/algo/alphaFM/bin"
+MODEL_PATH="/root/algo/LR_MODEL/"
+YESTERDAY="$(date -d '2 days ago' +%Y%m%d)"
+LAST30DAY="$(date -d '2 days ago' +%Y%m%d)"
+MAIN_CLASS="com.tzld.piaoquan.recommend.server.dataloader.OfflineShareSamplesLoader"
+TABLE_NAME="loghubods.alg_recsys_view_sample"
+LABEL="share_ornot"
+#OSSPATH=""
+
+
+# Train
+#mkdir -p ${MODEL_PATH}/${YESTERDAY}
+#${JAVA_PATH} -jar ${JAR_PATH} ${TABLE_NAME} ${LAST30DAY} ${YESTERDAY} ${LABEL} | ${FM_PATH}/fm_train -m ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt -dim 0,1,0 -core 8
+
+#cat ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}.txt | awk -F " " '{print $1,"\t",$2}' > ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt
+
+# Upload
+#${UPLOAD_PY_PATH} ${MODEL_PATH}/${YESTERDAY}/model_${YESTERDAY}_new.txt ${OSSPATH}
+
+# Predict
+java -jar ${JAR_PATH} $TABLE_NAME 20231211 20231211 ${LABEL}| ${FM_PATH}/fm_predict -m ${MODEL_PATH}/20231210/model_20231210.txt  -dim 0 -core 8 -out ${MODEL_PATH}/predict_1211.txt

+ 14 - 0
zhangbo/up.sh

@@ -0,0 +1,14 @@
+#!/bin/sh
+
+day=$1
+root_path="/root/spark-data"
+oss_hdfs_path="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/"
+model_path="$root_path/model"
+model_online="$model_path/online"
+
+
+cat $model_path/model_ctr_$day.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > $model_online/model_ad_ctr.txt
+
+hdfs dfs -rmr ${oss_hdfs_path}/ad_ctr_model/model_ad_ctr.txt
+
+hdfs dfs -put $model_online/model_ad_ctr.txt ${oss_hdfs_path}/ad_ctr_model/

+ 10 - 0
zhangbo/up2.sh

@@ -0,0 +1,10 @@
+root_path="/root/spark-data"
+oss_hdfs_path="oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/"
+model_path=$root_path/model
+day=$1
+
+cat /root/spark-data/model/model_share_20231216.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > /root/spark-data/model/model_share_now.txt
+
+dfs -put /root/spark-data/model/model_share_now.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/video_model
+
+hdfs dfs -put $model_path/model_share_$day.txt ${oss_hdfs_path}/video_str_model

+ 99 - 0
zhangbo/utils.py

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+from odps import ODPS
+import argparse
+
+ODPS_CONFIG = {
+        'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
+        'ACCESSID': 'LTAIWYUujJAm7CbH',
+        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
+}
+
+def check_data(project, table, partition) -> int:
+    """Check whether the partition is ready; return its row count."""
+    odps = ODPS(
+        access_id=ODPS_CONFIG['ACCESSID'],
+        secret_access_key=ODPS_CONFIG['ACCESSKEY'],
+        project=project,
+        endpoint=ODPS_CONFIG['ENDPOINT'],
+        connect_timeout=3000,
+        read_timeout=500000,
+        pool_maxsize=1000,
+        pool_connections=1000
+    )
+    try:
+        t = odps.get_table(name=table)
+        check_res = t.exist_partition(partition_spec=f'dt={partition}')
+        if check_res:
+            sql = f'select * from {project}.{table} where dt = {partition}'
+            with odps.execute_sql(sql=sql).open_reader() as reader:
+                data_count = reader.count
+        else:
+            data_count = 0
+    except Exception as e:
+        print("error:" + str(e))
+        data_count = 0
+    return data_count
+
+
+def check_origin_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_view_sample_v2"
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+
+def check_item_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_video_info"
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+def check_user_hive(args):
+    project = "loghubods"
+    table = "alg_recsys_user_info"
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+def check_hive(args):
+    project = args.project
+    table = args.table
+    partition = args.partition
+    count = check_data(project, table, partition)
+    if count == 0:
+        print("1")
+        exit(1)
+    else:
+        print("0")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='脚本utils')
+    parser.add_argument('--excute_program', type=str, help='执行程序')
+    parser.add_argument('--partition', type=str, help='表分区')
+    parser.add_argument('--project', type=str, help='表空间')
+    parser.add_argument('--table', type=str, help='表名')
+    args = parser.parse_args()
+    if args.excute_program == "check_origin_hive":
+        check_origin_hive(args)
+    elif args.excute_program == "check_item_hive":
+        check_item_hive(args)
+    elif args.excute_program == "check_user_hive":
+        check_user_hive(args)
+    elif args.excute_program == "check_hive":
+        check_hive(args)
+    else:
+        print("无合法参数,验证失败。")
+        exit(999)
+

Some files were not shown because too many files changed in this diff