Browse Source

增加hive表字段校验

xueyiming 20 hours ago
parent
commit
9982698936

+ 24 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataFromOriginToHive_20250228.scala

@@ -69,6 +69,7 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
       }).toMap
     val bucketsMap_br = sc.broadcast(bucketsMap)
     val denseFeatureNames = bucketsMap.keySet
+    val lowerCaseDenseFeatureNames = bucketsMap.keySet.map(_.toLowerCase)
     val sparseFeatureNames = Set(
       "cid", "adid", "adverid", "targeting_conversion",
       "region", "city", "brand",
@@ -90,6 +91,29 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
 
     // 2 读取odps+表信息
     val odpsOps = env.getODPS(sc)
+
+    val tableSchema = odpsOps.getTableSchema(project, outputTable, isPartition = false)
+
+    // 检查所有字段,收集非法字段
+    val invalidFields = tableSchema.flatMap { case (fieldName, _) =>
+      // 跳过 has_click 和 has_conversion 列
+      if (fieldName != "has_click" && fieldName != "has_conversion") {
+        if (!lowerCaseDenseFeatureNames.contains(fieldName) && !sparseFeatureNames.contains(fieldName)) {
+          Some(fieldName) // 收集缺少字段
+        } else {
+          None
+        }
+      } else {
+        None
+      }
+    }.toList
+
+    // 如果存在非法字段,抛出标准异常
+    if (invalidFields.nonEmpty) {
+      throw new IllegalArgumentException(s"缺少字段: ${invalidFields.mkString(", ")}")
+    }
+
+
     // 3 循环执行数据生产
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (dt <- dateRange) {

+ 23 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataFromOriginToHive_20250522.scala

@@ -67,6 +67,7 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
       }).toMap
     val bucketsMap_br = sc.broadcast(bucketsMap)
     val denseFeatureNames = bucketsMap.keySet
+    val lowerCaseDenseFeatureNames = bucketsMap.keySet.map(_.toLowerCase)
     val sparseFeatureNames = Set(
       "cid", "adid", "adverid", "targeting_conversion",
       "region", "city", "brand",
@@ -88,6 +89,28 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
 
     // 2 读取odps+表信息
     val odpsOps = env.getODPS(sc)
+
+    val tableSchema = odpsOps.getTableSchema(project, outputTable, isPartition = false)
+
+    // 检查所有字段,收集非法字段
+    val invalidFields = tableSchema.flatMap { case (fieldName, _) =>
+      // 跳过 has_click 和 has_conversion 列
+      if (fieldName != "has_click" && fieldName != "has_conversion") {
+        if (!lowerCaseDenseFeatureNames.contains(fieldName) && !sparseFeatureNames.contains(fieldName)) {
+          Some(fieldName) // 收集缺少字段
+        } else {
+          None
+        }
+      } else {
+        None
+      }
+    }.toList
+
+    // 如果存在非法字段,抛出标准异常
+    if (invalidFields.nonEmpty) {
+      throw new IllegalArgumentException(s"缺少字段: ${invalidFields.mkString(", ")}")
+    }
+
     // 3 循环执行数据生产
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (dt <- dateRange) {