zhangbo, 1 year ago
parent commit 0f0014992d

+ 156 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_09_user2redis_freq.scala

@@ -0,0 +1,156 @@
+package com.aliyun.odps.spark.examples.makedata
+
+import com.alibaba.fastjson.JSONObject
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.makedata.makedata_06_originData.getFeatureFromSet
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import org.apache.commons.lang.time.DateUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import java.util.concurrent.TimeUnit
+import scala.collection.JavaConversions._
+
+
+object makedata_09_user2redis_freq {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1. Read job parameters
+    val param = ParamUtils.parseArgs(args)
+    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val date = param.getOrElse("date", "20231220")
+    val expireDay = param.getOrElse("expireDay", "3").toInt
+    val ifWriteRedisUser = param.getOrElse("ifWriteRedisUser", "False").toBoolean
+    val partition = partitionPrefix + date
+    val savePathUser = param.getOrElse("savePathUser", "")
+    val midDays = param.getOrElse("midDays", "7").toInt
+
+    // 2. ODPS source tables
+    val odpsOps = env.getODPS(sc)
+    val project = "loghubods"
+    val tableUser = "alg_recsys_user_info"
+    val userRedisKeyPrefix = "user_info_4video_"
+
+
+    // 3. Feature processing
+    println("processing user features")
+    val userData = odpsOps.readTable(project = project, table = tableUser, partition = partition,
+        transfer = func, numPartition = tablePart)
+      .map(record => {
+        val mid = record.getString("mids")
+        val originFeatureName = Set(
+          "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
+          "machineinfo_system", "machineinfo_wechatversion",
+          //"gmt_create_user",
+          "u_1day_exp_cnt", "u_1day_click_cnt", "u_1day_share_cnt", "u_1day_return_cnt",
+          "u_3day_exp_cnt", "u_3day_click_cnt", "u_3day_share_cnt", "u_3day_return_cnt",
+          "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
+          "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt"
+        )
+        val originFeatureMap = getFeatureFromSet(originFeatureName, record)
+        val resultNew = new JSONObject
+        originFeatureName.foreach(r => {
+          if (originFeatureMap.containsKey(r)) {
+            val v = originFeatureMap(r)
+            resultNew.put(r, v)
+          }
+        })
+        (mid, resultNew.toString())
+      })
+    // 4. Save the raw user features to HDFS (under /all/)
+    if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+      val savePathPart = savePathUser + "/all/" + partition
+      MyHdfsUtils.delete_hdfs_path(savePathPart)
+      userData.map(r => r._1 + "\t" + r._2).saveAsTextFile(savePathPart, classOf[GzipCodec])
+    }
+    // 5. Recent-user statistics: keep mids whose last action is within midDays days
+    val dateEarly = MyDateUtils.getNumDaysBefore(date, 0)
+    val midRdd = odpsOps.readTable(project = "loghubods", table = "mid_uid",
+        partition = "dt=" + dateEarly, transfer = func, numPartition = tablePart)
+      .map(r => {
+        val mid = if (r.isNull("mid")) "" else r.getString("mid")
+        val actionTs = if (r.isNull("user_last_action_time")) "" else r.getString("user_last_action_time")
+        (mid, actionTs)
+      }).filter(r => r._1.nonEmpty && r._2.nonEmpty)
+      // Freshness window in epoch seconds: getTime is in ms, and user_last_action_time
+      // is also in ms, so both sides are divided down to seconds before comparing.
+      .filter(r => DateUtils.parseDate(date, Array[String]("yyyyMMdd")).getTime / 1000 - r._2.toLong / 1000 < 3600 * 24 * midDays)
+    println("------------ mid processing done; recently active users kept: " + midRdd.count() + " ------------------")
+
+    // 6. Split users by recent activity
+    val savePathPart = savePathUser + "/all/" + partition // read back the output of step 4
+    val userDataRead = sc.textFile(savePathPart).filter(_.split("\t").length >= 2)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), rList(1))
+      }).leftOuterJoin(midRdd).map {
+        case (mid, (fea, Some(_))) =>
+          (mid, fea, true)
+        case (mid, (fea, None)) =>
+          (mid, fea, false)
+      }
+    val userDataReadTrue = userDataRead.filter(_._3).map(r => r._1 + "\t" + r._2)
+    val userDataReadFalse = userDataRead.filter(!_._3).map(r => r._1 + "\t" + r._2)
+    if (savePathUser.nonEmpty && savePathUser.startsWith("/dw/recommend/model/")) {
+      val p1 = savePathUser + "/true/" + partition
+      MyHdfsUtils.delete_hdfs_path(p1)
+      userDataReadTrue.saveAsTextFile(p1, classOf[GzipCodec])
+      val p2 = savePathUser + "/false/" + partition
+      MyHdfsUtils.delete_hdfs_path(p2)
+      userDataReadFalse.saveAsTextFile(p2, classOf[GzipCodec])
+    }
+
+
+    // 7. Optionally write the recently-active users' features to Redis
+    if (ifWriteRedisUser) {
+      println("start writing user features to redis")
+      val p1 = savePathUser + "/true/" + partition
+      val userDataRead = sc.textFile(p1).filter(_.split("\t").length >= 2)
+        .map(r => {
+          val rList = r.split("\t")
+          (rList(0), rList(1))
+        })
+      val count = userDataRead.count()
+      println("records to write: " + count)
+      if (count > 200000000) {
+        println("more than 200 million records; skipping the write.")
+      } else {
+        val writtenCounts = userDataRead.mapPartitions(rows => {
+          val redisFormat = new util.HashMap[String, String]
+          val redisTemplate = env.getRedisTemplate()
+          var written = 0L
+          // Flush one batch: multiSet, then put a TTL on every key just written.
+          def flush(): Unit = {
+            if (!redisFormat.isEmpty) {
+              redisTemplate.opsForValue.multiSet(redisFormat)
+              redisFormat.keySet.foreach(r => redisTemplate.expire(r, 24 * expireDay, TimeUnit.HOURS))
+              written += redisFormat.size()
+              redisFormat.clear()
+            }
+          }
+          rows.foreach {
+            case (key, value) =>
+              if (key.nonEmpty) {
+                redisFormat.put(userRedisKeyPrefix + key, value)
+              }
+              if (redisFormat.size() >= 1000) {
+                flush()
+              }
+          }
+          flush() // final partial batch
+          // Return the per-partition count; the previous version returned the iterator
+          // of an already-cleared map, so the reported count was always 0.
+          Iterator.single(written)
+        })
+        println("user write to redis finished: put in redis.count=" + writtenCounts.sum().toLong)
+      }
+    }
+  }
+
+
+  // Identity transfer for odpsOps.readTable: hand each ODPS record through unchanged.
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+}
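
Note: ParamUtils.parseArgs is not part of this commit. The getOrElse calls above assume a simple key=value argument map; a hypothetical sketch, for orientation only:

    // Hypothetical sketch; the real ParamUtils may differ.
    object ParamUtilsSketch {
      def parseArgs(args: Array[String]): Map[String, String] =
        args.iterator
          .map(_.split("=", 2))                    // "date=20240225" -> Array("date", "20240225")
          .collect { case Array(k, v) => k -> v }  // keep only well-formed pairs
          .toMap
    }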

+ 8 - 8
src/main/scala/com/aliyun/odps/spark/examples/myUtils/MyDateUtils.scala

@@ -199,13 +199,13 @@ object MyDateUtils {
 
 
   def main(args: Array[String]): Unit = {
-    var from = DateUtils.parseDate("2019-09-01", Array[String]("yyyy-MM-dd"))
-    var to = DateUtils.parseDate("2019-09-10", Array[String]("yyyy-MM-dd"))
-
-    val a = from.getTime / 3600
-    val b = to.getTime / 3600
-    println(b-a)
-    //    val date = "2019-05-01"
-    //    println(dt2Dt("20190101"))
+//    var from = DateUtils.parseDate("2019-09-01", Array[String]("yyyy-MM-dd"))
+//    var to = DateUtils.parseDate("2019-09-10", Array[String]("yyyy-MM-dd"))
+//
+//    val a = from.getTime / 3600
+//    val b = to.getTime / 3600
+//    println(b-a)
+
+    var from = DateUtils.parseDate("20240228", Array[String]("yyyyMMdd")).getTime / 1000
   }
 }
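
The same yyyyMMdd-to-epoch-seconds conversion backs the freshness filter in makedata_09_user2redis_freq. A minimal sketch of that test, assuming user_last_action_time is epoch milliseconds:

    // True when the user's last action falls within the midDays-day window.
    def isRecent(dayStr: String, lastActionMs: Long, midDays: Int): Boolean = {
      val daySec = DateUtils.parseDate(dayStr, Array[String]("yyyyMMdd")).getTime / 1000
      daySec - lastActionMs / 1000 < 3600L * 24 * midDays // e.g. 7 days = 604800 s
    }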

+ 2 - 2
zhangbo/01_train.sh

@@ -11,6 +11,6 @@ HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
 $HADOOP fs -text ${train_path}/dt=$day/* | /root/sunmingze/alphaFM/bin/fm_train -m model/${model_name}_${day}.txt -dim ${bias} -core 8
 # -v_l1 ${v_l1} -v_l2 ${v_l2}
 
-# nohup sh 01_train.sh 20240222 /dw/recommend/model/11_str_data_v3 model_tom 0,1,0 >p1_model_tom.log 2>&1 &
-# nohup sh 01_train.sh 20240222 /dw/recommend/model/12_ros_data_v3 model_jerry 0,1,0 >p1_model_jerry.log 2>&1 &
+# nohup sh 01_train.sh 20240222 /dw/recommend/model/11_str_data_v3 model_tom112 1,1,2 >p1_model_tom112.log 2>&1 &
+# nohup sh 01_train.sh 20240222 /dw/recommend/model/12_ros_data_v3_noweight model_jerry_noweight 0,1,0 >p1_model_jerry_noweight.log 2>&1 &
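
Note on the -dim argument: alphaFM reads it as the k0,k1,k2 triple (bias on/off, linear terms on/off, dimension of the pairwise factors), so 0,1,0 trains a plain logistic regression while 1,1,2 adds a bias term and 2-dimensional factor vectors.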
 

+ 2 - 2
zhangbo/02_train_go.sh

@@ -22,5 +22,5 @@ while [[ "$current_date" != "$end_date" ]]; do
     current_date=$(date -d "$current_date + 1 day" +%Y%m%d)
 done
 
-# nohup sh 02_train_go.sh 20240223 20240226 model_tom /dw/recommend/model/11_str_data_v3/ 0,1,0 >p2_model_tom.log 2>&1 &
-# nohup sh 02_train_go.sh 20240223 20240226 model_jerry /dw/recommend/model/12_ros_data_v3/ 0,1,0 >p2_model_jerry.log 2>&1 &
+# nohup sh 02_train_go.sh 20240223 20240226 model_tom112 /dw/recommend/model/11_str_data_v3/ 1,1,2 >p2_model_tom112.log 2>&1 &
+# nohup sh 02_train_go.sh 20240223 20240226 model_jerry_noweight /dw/recommend/model/12_ros_data_v3_noweight/ 0,1,0 >p2_model_jerry_noweight.log 2>&1 &

+ 2 - 1
zhangbo/03_predict.sh

@@ -11,5 +11,6 @@ HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
 $HADOOP fs -text ${train_path}/dt=$day/* | /root/sunmingze/alphaFM/bin/fm_predict -m model/$model_name -dim ${bias} -core 8 -out predict/${output_file}_$day.txt
 cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
 
+# nohup sh 03_predict.sh 20240226 /dw/recommend/model/11_str_data_v3/ model_tom112_20240225.txt model_tom112_20240225 1,1,2 >p3_model_tom112_20240225.log 2>&1 &
 # nohup sh 03_predict.sh 20240226 /dw/recommend/model/12_ros_data_v3/ model_jerry_20240222.txt model_jerry_20240222 0,1,0 >p3_model_jerry_20240222.log 2>&1 &
-# nohup sh 03_predict.sh 20240226 /dw/recommend/model/11_str_data_v3/ model_tom_20240222.txt model_tom_20240222 0,1,0 >p3_model_tom_20240222.log 2>&1 &
+# nohup sh 03_predict.sh 20240226 /dw/recommend/model/12_ros_data_v3/ model_jerry_noweight_20240225.txt model_jerry_noweight_20240225 0,1,0 >p3_model_jerry_noweight_20240225.log 2>&1 &

+ 6 - 0
zhangbo/04_upload.sh

@@ -19,3 +19,9 @@ dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_ros_v2_2024
 cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240112.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240112_change.txt
 dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_str_mid_20240112_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/video_str_model/model_str_mid.txt
 
+
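+# Drop the model header line, keep only non-zero weights, and convert "feature weight" to "feature<TAB>weight" before uploading to OSS.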
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_tom_20240225.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_tom_20240225_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_tom_20240225_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_tom.txt
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_jerry_20240225.txt | sed '1d' | awk -F " " '{if($2!="0") print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_jerry_20240225_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_jerry_20240225_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_jerry.txt