|
@@ -17,7 +17,7 @@ object video_dssm_sampler {
|
|
|
|
|
|
// Configuration parameters, stored as strings; e.g. "shuffle.partitions" is read via CONFIG("shuffle.partitions").toInt and passed as numPartition.
|
|
|
private val CONFIG = Map(
|
|
|
- "shuffle.partitions" -> "200",
|
|
|
+ "shuffle.partitions" -> "400",
|
|
|
"memory.fraction" -> "0.8",
|
|
|
"default.parallelism" -> "200"
|
|
|
)
|
|
@@ -129,10 +129,10 @@ object video_dssm_sampler {
|
|
|
transfer = funcPositive,
|
|
|
numPartition = CONFIG("shuffle.partitions").toInt
|
|
|
).sample(false, 0.001) // 随机抽样千分之一的数据
|
|
|
- .persist(StorageLevel.MEMORY_AND_DISK)
|
|
|
+ .persist(StorageLevel.MEMORY_AND_DISK_SER)
|
|
|
println("开始执行partiton:" + partition)
|
|
|
|
|
|
- val positivePairs = spark.createDataFrame(rdd, schema).persist(StorageLevel.MEMORY_AND_DISK)
|
|
|
+ val positivePairs = spark.createDataFrame(rdd, schema).persist(StorageLevel.MEMORY_AND_DISK_SER)
|
|
|
stats.positiveSamplesCount = positivePairs.count()
|
|
|
logger.info(s"start read vid list for date $dt")
|
|
|
|
|
@@ -186,7 +186,7 @@ object video_dssm_sampler {
|
|
|
negativeSamplesDF
|
|
|
.withColumn("label", lit(0))
|
|
|
.withColumn("logid", concat(lit("neg_"), monotonically_increasing_id()))
|
|
|
- ).persist(StorageLevel.MEMORY_AND_DISK)
|
|
|
+ ).persist(StorageLevel.MEMORY_AND_DISK_SER)
|
|
|
|
|
|
// 6. Fetch the left-side features
|
|
|
// Read the L1 category statistics features
|
|
@@ -301,7 +301,7 @@ object video_dssm_sampler {
|
|
|
col("vid_left_cate_l1_feature"),
|
|
|
col("vid_left_cate_l2_feature")
|
|
|
)
|
|
|
- .persist(StorageLevel.MEMORY_AND_DISK)
|
|
|
+ .persist(StorageLevel.MEMORY_AND_DISK_SER)
|
|
|
|
|
|
|
|
|
|
|
@@ -365,9 +365,7 @@ object video_dssm_sampler {
|
|
|
col("vid_right_cate_l1_feature"),
|
|
|
col("vid_right_cate_l2_feature")
|
|
|
)
|
|
|
- .persist(StorageLevel.MEMORY_AND_DISK)
|
|
|
-
|
|
|
-
|
|
|
+ .persist(StorageLevel.MEMORY_AND_DISK_SER)
|
|
|
|
|
|
|
|
|
|