| 
														
															@@ -9,6 +9,7 @@ from utils.my_hdfs_client import MyHDFSClient 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 # 引用 paddle inference 推理库 
														 | 
														
														 | 
														
															 # 引用 paddle inference 推理库 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import paddle.inference as paddle_infer 
														 | 
														
														 | 
														
															 import paddle.inference as paddle_infer 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import json 
														 | 
														
														 | 
														
															 import json 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from concurrent.futures import ThreadPoolExecutor 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 hadoop_home = "/app/env/hadoop-3.2.4"  # Hadoop 安装目录 
														 | 
														
														 | 
														
															 hadoop_home = "/app/env/hadoop-3.2.4"  # Hadoop 安装目录 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 configs = { 
														 | 
														
														 | 
														
															 configs = { 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -18,10 +19,7 @@ configs = { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 hdfs_client = MyHDFSClient(hadoop_home, configs) 
														 | 
														
														 | 
														
															 hdfs_client = MyHDFSClient(hadoop_home, configs) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 def main(): 
														 | 
														
														 | 
														
															 def main(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     init_model_path = "/app/output_model_dssm" 
														 | 
														
														 | 
														
															     init_model_path = "/app/output_model_dssm" 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     client = HangZhouOSSClient("art-recommend") 
														 | 
														
														 | 
														
															     client = HangZhouOSSClient("art-recommend") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     oss_object_name = "dyp/dssm.tar.gz" 
														 | 
														
														 | 
														
															     oss_object_name = "dyp/dssm.tar.gz" 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -29,11 +27,30 @@ def main(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     compress.uncompress_tar("model.tar.gz", init_model_path) 
														 | 
														
														 | 
														
															     compress.uncompress_tar("model.tar.gz", init_model_path) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     assert os.path.exists(init_model_path) 
														 | 
														
														 | 
														
															     assert os.path.exists(init_model_path) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    model_file=os.path.join(init_model_path, "dssm.pdmodel") 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    params_file=os.path.join(init_model_path, "dssm.pdiparams") 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    self.model_file=os.path.join(init_model_path, "dssm.pdmodel") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    self.params_file=os.path.join(init_model_path, "dssm.pdiparams") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    max_workers=2 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    split_file_list=[ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    ['/dw/recommend/model/56_dssm_i2i_itempredData/20241206/part-00017.gz'], 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    ['/dw/recommend/model/56_dssm_i2i_itempredData/20241206/part-00017.gz'] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    ] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    future_list=[] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    with ThreadPoolExecutor(max_workers=max_workers) as executor: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        for i, file_list in enumerate(split_file_list): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            future_list.append(executor.submit(thread_task, "thread" + str(i), file_list)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    # 等待所有任务完成 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    for future in future_list: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        future.result() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    print("Main program ending") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+def thread_task(name, file_list): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    print(f"Thread {name}: starting file_list:{file_list}") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # 创建 config 
														 | 
														
														 | 
														
															     # 创建 config 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    config = paddle_infer.Config(model_file, params_file) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    config = paddle_infer.Config(self.model_file, self.params_file) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # 根据 config 创建 predictor 
														 | 
														
														 | 
														
															     # 根据 config 创建 predictor 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     predictor = paddle_infer.create_predictor(config) 
														 | 
														
														 | 
														
															     predictor = paddle_infer.create_predictor(config) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -43,39 +60,46 @@ def main(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     output_names = predictor.get_output_names() 
														 | 
														
														 | 
														
															     output_names = predictor.get_output_names() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     output_handle = predictor.get_output_handle(output_names[0]) 
														 | 
														
														 | 
														
															     output_handle = predictor.get_output_handle(output_names[0]) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    fi=0 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    file_len = len(file_list) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    for flie in file_list: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        ret, out = hdfs_client._run_cmd(f"text {flie}") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        input_data = {} 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        for line in out: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            sample_values = line.rstrip('\n').split('\t') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            vid, left_features_str = sample_values 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            left_features = [float(x) for x in left_features_str.split(',')] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            input_data[vid] = left_features 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+             
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        # 设置输入 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        result = [] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        i=0 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        fi=fi+1 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        count =  len(input_data) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        print(f"Thread {name}: current handle {fi}/{file_len} file {flie} count {count}") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        for k,v in input_data.items(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            v2 = np.array([v], dtype=np.float32) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            input_handle.copy_from_cpu(v2) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            # 运行predictor 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            predictor.run() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            # 获取输出 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            output_data = output_handle.copy_to_cpu() # numpy.ndarray类型 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            result.append(k + "\t" + str(output_data.tolist()[0])) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            i=i+1 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            if i % 1000 == 0: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                print(f"Thread {name}: write batch {i}/{count}") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        json_data = json.dumps(result, indent=4)  # indent参数用于美化输出,使其更易读 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        # 写入文件 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        with open('/app/data_' + os.path.basename(flie) + '.json', 'w') as json_file: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+            for s in result: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                json_file.write(s + "\n") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    print(f"Thread {name}: finishing") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    ret, out = hdfs_client._run_cmd("text /dw/recommend/model/56_dssm_i2i_itempredData/20241206/part-00016.gz") 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    input_data = {} 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    for line in out: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        sample_values = line.rstrip('\n').split('\t') 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        vid, left_features_str = sample_values 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        left_features = [float(x) for x in left_features_str.split(',')] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        input_data[vid] = left_features 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-         
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    # 设置输入 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    result = [] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    i=0 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    count =  len(input_data) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    for k,v in input_data.items(): 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        v2 = np.array([v], dtype=np.float32) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        input_handle.copy_from_cpu(v2) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        # 运行predictor 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        predictor.run() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        # 获取输出 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        output_data = output_handle.copy_to_cpu() # numpy.ndarray类型 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        result.append(k + "\t" + str(output_data.tolist()[0])) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        i=i+1 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        if i % 1000 == 0: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            print("write batch {}/{}".format(i, count)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    json_data = json.dumps(result, indent=4)  # indent参数用于美化输出,使其更易读 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    # 写入文件 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    with open('/app/data.json', 'w') as json_file: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        for s in result: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            json_file.write(s + "\n") 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 if __name__ == "__main__": 
														 | 
														
														 | 
														
															 if __name__ == "__main__": 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     main() 
														 | 
														
														 | 
														
															     main() 
														 |