|
@@ -3,7 +3,8 @@ package com.aliyun.odps.spark.examples.makedata_recsys.v20250218
|
|
|
import com.alibaba.fastjson.JSON
|
|
|
import com.aliyun.odps.TableSchema
|
|
|
import com.aliyun.odps.data.Record
|
|
|
-import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils, env}
|
|
|
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
|
|
|
+import examples.utils.StatisticsUtil
|
|
|
import org.apache.hadoop.io.compress.GzipCodec
|
|
|
import org.apache.spark.sql.SparkSession
|
|
|
import org.xm.Similarity
|
|
@@ -31,8 +32,17 @@ object makedata_recsys_41_data_fu_sample_20250218 {
|
|
|
val savePath = param.getOrElse("savePath", "/dw/recommend/model/41_recsys_sample_data/20250221")
|
|
|
val fuSampleRate = param.getOrElse("fuSampleRate", "0.05").toDouble
|
|
|
val whatLabel = param.getOrElse("whatLabel", "is_share")
|
|
|
+ val whatApps = param.getOrElse("whatApps", "0,4,2,32,17,18,21,22,24,25,26,27,28,29,3,30,31,33,34,35,36").split(",").filter(r => r.nonEmpty).toList
|
|
|
|
|
|
val data = sc.textFile(readPath)
|
|
|
+ .filter(line => {
|
|
|
+ val rLine = line.split("\t")
|
|
|
+ val logJson = JSON.parseObject(rLine(0))
|
|
|
+ val page = logJson.getString("page")
|
|
|
+ val recommendPageType = logJson.getString("recommendpagetype")
|
|
|
+
|
|
|
+ whatApps.contains(logJson.getString("apptype")) && StatisticsUtil.isRecommendScene(page, recommendPageType)
|
|
|
+ })
|
|
|
.filter {
|
|
|
line => {
|
|
|
val rLine = line.split("\t")
|