
feat: add evaluation result analysis script

zhaohaipeng 6 months ago
parent
commit
a54d3fe9cc
1 changed file with 38 additions and 9 deletions
  1. ad/model_predict_analyse.py (+38 -9)

+ 38 - 9
ad/model_predict_analyse.py

@@ -1,17 +1,48 @@
-import argparse
-import sys
 import gzip
+import pandas as pd
 
-from pyspark.sql import SparkSession
+from hdfs import InsecureClient
 
+client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
 
-def read_predict(hdfs_path: str):
-    df = spark.read.text(hdfs_path)
-    df.show(truncate=False)
+
+def read_predict(hdfs_path: str) -> list:
+    result = []
+    for file in client.list(hdfs_path):
+        with client.read(hdfs_path + file) as reader:
+            with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
+                for line in gz_file.read().decode("utf-8").split("\n"):
+                    split = line.split("\t")
+                    if len(split) != 4:
+                        continue
+                    cid = split[3].split("_")[0]
+                    label = split[0]
+                    score = float(split[2].replace("[", "").replace("]", "").split(",")[1])  # second entry of the "[p0,p1]" pair, as float so it can be averaged
+
+                    result.append({
+                        "cid": cid,
+                        "label": label,
+                        "score": score
+                    })
+
+    return result
 
 
 def _main():
-    read_predict("/dw/recommend/model/34_ad_predict_data/20241004_351_0927_1003_1000/*")
+    model1_result = read_predict("/dw/recommend/model/34_ad_predict_data/20241004_351_0927_1003_1000/")
+    model2_result = read_predict("/dw/recommend/model/34_ad_predict_data/20241004_351_0927_1003_1000/")
+
+    m1 = pd.DataFrame(model1_result)
+    g1 = m1.groupby("cid").agg(count=('cid', 'size'), average_value=('score', 'mean'))
+    # Get the ten most frequent cids
+    most_common_cid1 = g1.nlargest(10, 'count')
+    print(most_common_cid1)
+
+    m2 = pd.DataFrame(model2_result)
+    g2 = m2.groupby("cid").agg(count=('cid', 'size'), average_value=('score', 'mean'))
+    # Get the ten most frequent cids
+    most_common_cid2 = g2.nlargest(10, 'count')
+    print(most_common_cid2)
 
 
 if __name__ == '__main__':
@@ -23,6 +54,4 @@ if __name__ == '__main__':
    # # Check that the arguments are valid
     # if len(predict_path_list) != 2:
     #     sys.exit(1)
-    spark = SparkSession.builder.appName("model_predict_analyse").getOrCreate()
     _main()
-    spark.stop()
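
For reference, a minimal, self-contained sketch of the line format read_predict parses: four tab-separated columns, with the label in column 0, a bracketed "[p0,p1]" score pair in column 2, and a "cid_..."-style identifier in column 3. The sample line and all of its values below are made up for illustration; real lines come from the gzip part files on HDFS.

sample_line = "1\tfeatures\t[0.9143,0.0857]\t1024_20241004"

split = sample_line.split("\t")
assert len(split) == 4  # lines with any other shape are skipped

cid = split[3].split("_")[0]  # "1024"
label = split[0]              # "1"
# second entry of the "[p0,p1]" pair
score = float(split[2].replace("[", "").replace("]", "").split(",")[1])

print({"cid": cid, "label": label, "score": score})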
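
And a small sketch of the pandas aggregation _main runs on the parsed records; the toy records here are invented, but the groupby/agg/nlargest chain is the one the script uses.

import pandas as pd

# Toy records shaped like read_predict's output, with scores as floats.
records = [
    {"cid": "1024", "label": "1", "score": 0.91},
    {"cid": "1024", "label": "0", "score": 0.13},
    {"cid": "2048", "label": "1", "score": 0.77},
]

df = pd.DataFrame(records)
# count = number of predictions per cid, average_value = mean score per cid
g = df.groupby("cid").agg(count=("cid", "size"), average_value=("score", "mean"))
print(g.nlargest(10, "count"))  # the ten cids with the most predictions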