丁云鹏 5 mesi fa
parent
commit
bd33b8bf18

+ 16 - 17
recommend-model-produce/src/main/python/tools/static_ps_infer_v2.py

@@ -111,8 +111,11 @@ class Main(object):
 
     def init_fleet_with_gloo(use_gloo=True):
         if use_gloo:
-            os.environ["PADDLE_WITH_GLOO"] = "1"
-            role = role_maker.PaddleCloudRoleMaker()
+            os.environ["PADDLE_WITH_GLOO"] = "0"
+            role = role_maker.PaddleCloudRoleMaker(
+                is_collective=False,
+                init_gloo=False
+            )
             fleet.init(role)
         else:
             fleet.init()
@@ -193,7 +196,7 @@ class Main(object):
                 self.dataset_train_loop(epoch)
 
             epoch_time = time.time() - epoch_start_time
-            epoch_speed = self.example_nums / epoch_time
+
             if use_auc is True:
                 global_auc = get_global_auc(paddle.static.global_scope(),
                                             self.model.stat_pos.name,
@@ -208,15 +211,13 @@ class Main(object):
                 set_zero(self.model.batch_stat_neg.name,
                          paddle.static.global_scope())
                 logger.info(
-                    "Epoch: {}, using time: {} second, ips: {} {}/sec. auc: {}".
-                    format(epoch, epoch_time, epoch_speed, self.count_method,
+                    "Epoch: {}, using time: {} second, ips: {}/sec. auc: {}".
+                    format(epoch, epoch_time, self.count_method,
                            global_auc))
             else:
                 logger.info(
-                    "Epoch: {}, using time {} second, ips {} {}/sec.".format(
-                        epoch, epoch_time, epoch_speed, self.count_method))
-
-            self.train_result_dict["speed"].append(epoch_speed)
+                    "Epoch: {}, using time {} second, ips  {}/sec.".format(
+                        epoch, epoch_time, self.count_method))
 
             model_dir = "{}/{}".format(save_model_path, epoch)
 
@@ -232,22 +233,17 @@ class Main(object):
         self.example_nums = 0
         self.count_method = self.config.get("runner.example_count_method",
                                             "example")
-        if self.count_method == "example":
-            self.example_nums = get_example_num(self.file_list)
-        elif self.count_method == "word":
-            self.example_nums = get_word_num(self.file_list)
-        else:
-            raise ValueError(
-                "Set static_benchmark.example_count_method for example / word for example count."
-            )
 
     def dataset_train_loop(self, epoch):
         logger.info("Epoch: {}, Running Dataset Begin.".format(epoch))
+
         fetch_info = [
             "Epoch {} Var {}".format(epoch, var_name)
             for var_name in self.metrics
         ]
+
         fetch_vars = [var for _, var in self.metrics.items()]
+
         print_step = int(config.get("runner.print_interval"))
 
         debug = config.get("runner.dataset_debug", False)
@@ -268,6 +264,7 @@ class Main(object):
             print_period=print_step,
             debug=debug)
 
+
     def heter_train_loop(self, epoch):
         logger.info(
             "Epoch: {}, Running Begin. Check running metrics at heter_log".
@@ -318,7 +315,9 @@ class Main(object):
 
 if __name__ == "__main__":
     paddle.enable_static()
+
     config = parse_args()
     os.environ["CPU_NUM"] = str(config.get("runner.thread_num"))
     benchmark_main = Main(config)
+
     benchmark_main.run()

+ 1 - 5
recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

@@ -102,9 +102,7 @@ class Main(object):
         self.pure_bf16 = self.config['pure_bf16']
 
     def run(self):
-        logger.info("Begin 11111111") 
         self.init_fleet_with_gloo()
-        logger.info("Begin 22222222") 
         self.network()
         if fleet.is_server():
             self.run_server()
@@ -117,12 +115,10 @@ class Main(object):
     def init_fleet_with_gloo(use_gloo=True):
         if use_gloo:
             os.environ["PADDLE_WITH_GLOO"] = "0"
-            logger.info("Begin 11111111222222") 
             role = role_maker.PaddleCloudRoleMaker(
                 is_collective=False,
                 init_gloo=False
-            ) 
-            logger.info("Begin 11111111333333") 
+            )
             fleet.init(role)
             #logger.info("worker_index: %s", fleet.worker_index())
             #logger.info("is_first_worker: %s", fleet.is_first_worker())