
Merge branch 'feature_modelService' of algorithm/recommend-model into master

dingyunpeng, 5 months ago
commit aae05c4846

+ 7 - 1
recommend-model-produce/src/main/python/Dockerfile

@@ -1,5 +1,11 @@
 FROM registry.baidubce.com/paddlepaddle/paddle:2.6.1
+RUN pip freeze > requirements.txt
 ENV PYTHONUNBUFFERED=1
+ENV GLOG_v=1
+ENV JAVA_HOME=/app/env/java8
+ENV HADOOP_HOME=/app/env/hadoop-3.2.4
+ENV PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
 WORKDIR /app
 COPY . /app
-RUN pip3 install -r requirements.txt
+RUN pip3 install -i https://mirrors.aliyun.com/pypi/simple/ --no-cache-dir -r requirements.txt
+CMD python tools/static_ps_trainer.py -m models/dssm/config_ps.yaml

+ 57 - 0
recommend-model-produce/src/main/python/models/dssm/bq_reader_train_ps.py

@@ -0,0 +1,57 @@
+from paddle.distributed.fleet.data_generator import MultiSlotDataGenerator
+import sys
+
+class DSSMReader(MultiSlotDataGenerator):
+    def __init__(self):
+        super(DSSMReader, self).__init__()
+        self.feature_dim = 5  # expected feature dimension
+
+    def init(self, config=None):
+        pass
+
+    def line_process(self, line):
+        try:
+            # Split the sample fields on tabs
+            sample_id, label, left_features, right_features = line.rstrip('\n').split('\t')
+            
+            # Convert the label to an integer
+            label = int(label)
+            
+            # Parse the left/right video features
+            left_features = [float(x) for x in left_features.split(',')]
+            right_features = [float(x) for x in right_features.split(',')]
+            
+            # Validate the feature dimensions
+            if len(left_features) != self.feature_dim or len(right_features) != self.feature_dim:
+                return None
+            
+            # Build the output slot list
+            output = []
+            #output.append(("sample_id", [sample_id]))  # sample ID
+            output.append(("label", [label]))          # label
+            output.append(("left_features", left_features))   # left video features
+            output.append(("right_features", right_features)) # right video features
+            
+            return output
+
+        except Exception as e:
+            sys.stderr.write(f"Error processing line: {str(e)}\n")
+            return None
+
+    def generate_sample(self, line):
+        def reader():
+            try:
+                result = self.line_process(line)
+                if result is not None:
+                    yield result
+            except Exception as e:
+                sys.stderr.write(f"Error in generate_sample: {str(e)}\n")
+        return reader
+
+if __name__ == "__main__":
+    reader = DSSMReader()
+    reader.init()
+    try:
+        reader.run_from_stdin()
+    except Exception as e:
+        sys.stderr.write(f"Error in main: {str(e)}\n")

+ 46 - 0
recommend-model-produce/src/main/python/models/dssm/config_ps.yaml

@@ -0,0 +1,46 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 8
+  model_save_path: "output_model_dssm"
+
+  reader_type: "QueueDataset"  # DataLoader / QueueDataset / RecDataset
+  pipe_command: "python bq_reader_train_ps.py"
+  thread_num: 1
+  sync_mode: "async"
+
+  use_gpu: False
+  epochs: 10
+  print_interval: 1
+  
+  test_data_dir: "data/test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

+ 48 - 0
recommend-model-produce/src/main/python/models/dssm/config_ps_hdfs.yaml

@@ -0,0 +1,48 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "/dyp"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 8
+  model_save_path: "output_model_dssm"
+  split_file_list: true
+
+  reader_type: "QueueDataset"  # DataLoader / QueueDataset / RecDataset
+  pipe_command: "python bq_reader_train_ps.py"
+  thread_num: 1
+  sync_mode: "async"
+
+
+  use_gpu: False
+  epochs: 1
+  print_interval: 1
+  
+  test_data_dir: "data/test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

File diff suppressed because it is too large
+ 0 - 0
recommend-model-produce/src/main/python/models/dssm/data/test/test.txt


File diff suppressed because it is too large
+ 0 - 0
recommend-model-produce/src/main/python/models/dssm/data/train/train.txt


+ 116 - 71
recommend-model-produce/src/main/python/models/dssm/net.py

@@ -1,101 +1,146 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
 import numpy as np
 
-
 class DSSMLayer(nn.Layer):
-    def __init__(self, trigram_d, neg_num, slice_end, hidden_layers,
-                 hidden_acts):
+    def __init__(self, feature_nums=[5,5,5,5,5], embedding_dim=8, output_dim=16, 
+                 hidden_layers=[40, 32], hidden_acts=["relu", "relu"]):
         super(DSSMLayer, self).__init__()
-
-        self.hidden_layers = [trigram_d] + hidden_layers
+        
+        self.feature_num = len(feature_nums)
+        self.embedding_dim = embedding_dim
+        self.output_dim = output_dim
+        # The first layer's input is the concatenation of all feature embeddings
+        self.hidden_layers = [self.feature_num * embedding_dim] + hidden_layers + [output_dim]
         self.hidden_acts = hidden_acts
-        self.slice_end = slice_end
+
+        # Create an embedding layer of matching size for each feature
+        self.left_embeddings = nn.LayerList([
+            nn.Embedding(
+                num_embeddings=feature_nums[i],
+                embedding_dim=embedding_dim,
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal()
+                )
+            ) for i in range(self.feature_num)
+        ])
+
+        self.right_embeddings = nn.LayerList([
+            nn.Embedding(
+                num_embeddings=feature_nums[i], 
+                embedding_dim=embedding_dim,
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal()
+                )
+            ) for i in range(self.feature_num)
+        ])
 
-        self._query_layers = []
+        # Left video tower
+        self._left_tower = []
         for i in range(len(self.hidden_layers) - 1):
             linear = paddle.nn.Linear(
                 in_features=self.hidden_layers[i],
                 out_features=self.hidden_layers[i + 1],
                 weight_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])),
+                    initializer=paddle.nn.initializer.XavierNormal()
+                ),
                 bias_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])))
-            self.add_sublayer('query_linear_%d' % i, linear)
-            self._query_layers.append(linear)
-            if self.hidden_acts[i] == "relu":
+                    initializer=paddle.nn.initializer.Constant(value=0.0)
+                )
+            )
+            self.add_sublayer('left_linear_%d' % i, linear)
+            self._left_tower.append(linear)
+            
+            if i < len(hidden_acts) and self.hidden_acts[i] == "relu":
                 act = paddle.nn.ReLU()
-                self.add_sublayer('query_act_%d' % i, act)
-                self._query_layers.append(act)
+                self.add_sublayer('left_act_%d' % i, act)
+                self._left_tower.append(act)
 
-        self._doc_layers = []
+        # Right video tower
+        self._right_tower = []
         for i in range(len(self.hidden_layers) - 1):
             linear = paddle.nn.Linear(
                 in_features=self.hidden_layers[i],
                 out_features=self.hidden_layers[i + 1],
                 weight_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])),
+                    initializer=paddle.nn.initializer.XavierNormal()
+                ),
                 bias_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])))
-            self.add_sublayer('pos_linear_%d' % i, linear)
-            self._doc_layers.append(linear)
-            if self.hidden_acts[i] == "relu":
+                    initializer=paddle.nn.initializer.Constant(value=0.0)
+                )
+            )
+            self.add_sublayer('right_linear_%d' % i, linear)
+            self._right_tower.append(linear)
+            
+            if i < len(hidden_acts) and self.hidden_acts[i] == "relu":
                 act = paddle.nn.ReLU()
-                self.add_sublayer('pos_act_%d' % i, act)
-                self._doc_layers.append(act)
+                self.add_sublayer('right_act_%d' % i, act)
+                self._right_tower.append(act)
+
+    def _process_features(self, features, embeddings):
+        # Look up each feature's embedding
+        embedded_features = []
+        for i in range(self.feature_num):
+            feature = paddle.slice(
+                features, 
+                axes=[1], 
+                starts=[i], 
+                ends=[i+1]
+            )
+            feature = paddle.cast(feature, dtype='int64')
+            embedded = embeddings[i](feature)
+
+            embedded_features.append(embedded)
+
+        # Concatenate all the embeddings
+        return paddle.concat(embedded_features, axis=1)
+
+    def forward(self, left_features, right_features):
+        # Get the feature representations of the two videos
+        left_vec, right_vec = self.get_vectors(left_features, right_features)
+
+        # Compute the cosine similarity
+        sim_score = F.cosine_similarity(
+            left_vec, 
+            right_vec, 
+            axis=1
+        ).reshape([-1, 1])
+
+        return sim_score, left_vec, right_vec
 
-    def forward(self, input_data, is_infer):
-        query_fc = input_data[0]
-        for n_layer in self._query_layers:
-            query_fc = n_layer(query_fc)
-        self.query_fc = query_fc
+    def get_vectors(self, left_features, right_features):
+        """获取两个视频的16维特征向量"""
+        # 处理左视频特征
+        
+        left_embedded = self._process_features(left_features, self.left_embeddings)
+        
+        # left_vec = left_embedded
+        left_vec = paddle.reshape(left_embedded, [-1, self.feature_num * self.embedding_dim])
+              
+        
 
-        doc_pos_fc = input_data[1]
-        for n_layer in self._doc_layers:
-            doc_pos_fc = n_layer(doc_pos_fc)
-        self.doc_pos_fc = doc_pos_fc
+        for i, layer in enumerate(self._left_tower):
+            left_vec = layer(left_vec)
 
-        self.params = [self._query_layers[-2].bias]
+
+        # Process the right video features
+        right_embedded = self._process_features(right_features, self.right_embeddings)
+        right_vec = paddle.reshape(right_embedded, [-1, self.feature_num * self.embedding_dim])
 
-        R_Q_D_p = F.cosine_similarity(
-            query_fc, doc_pos_fc, axis=1).reshape([-1, 1])
+        for layer in self._right_tower:
+            right_vec = layer(right_vec)
 
-        if is_infer:
-            return R_Q_D_p, paddle.ones(shape=[self.slice_end, 1])
+
+        # Ensure the outputs are L2-normalized
+        left_vec = F.normalize(left_vec, p=2, axis=1)
+        right_vec = F.normalize(right_vec, p=2, axis=1)
+        
+        return left_vec, right_vec
 
-        R_Q_D_ns = []
-        for i in range(len(input_data) - 2):
-            doc_neg_fc_i = input_data[i + 2]
-            for n_layer in self._doc_layers:
-                doc_neg_fc_i = n_layer(doc_neg_fc_i)
-            R_Q_D_n = F.cosine_similarity(
-                query_fc, doc_neg_fc_i, axis=1).reshape([-1, 1])
-            R_Q_D_ns.append(R_Q_D_n)
-        concat_Rs = paddle.concat(x=[R_Q_D_p] + R_Q_D_ns, axis=1)
-        prob = F.softmax(concat_Rs, axis=1)
-        hit_prob = paddle.slice(
-            prob, axes=[0, 1], starts=[0, 0], ends=[self.slice_end, 1])
-        return R_Q_D_p, hit_prob
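
As a hedged smoke test of the rewritten layer (run in dynamic-graph mode; the batch size and ids are illustrative), the defaults give each tower a [40, 40, 32, 16] Linear stack, and the reader delivers categorical ids as float32, which _process_features casts back to int64 before the embedding lookup:

    import paddle
    from net import DSSMLayer

    model = DSSMLayer(feature_nums=[5, 5, 5, 5, 5], embedding_dim=8, output_dim=16,
                      hidden_layers=[40, 32], hidden_acts=["relu", "relu"])
    left = paddle.randint(0, 5, shape=[4, 5]).astype('float32')
    right = paddle.randint(0, 5, shape=[4, 5]).astype('float32')
    sim, left_vec, right_vec = model(left, right)
    print(sim.shape, left_vec.shape, right_vec.shape)  # [4, 1] [4, 16] [4, 16]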

+ 81 - 54
recommend-model-produce/src/main/python/models/dssm/static_model.py

@@ -1,22 +1,7 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import math
 import paddle
 from net import DSSMLayer
 
-
 class StaticModel():
     def __init__(self, config):
         self.cost = None
@@ -24,59 +9,101 @@ class StaticModel():
         self._init_hyper_parameters()
 
     def _init_hyper_parameters(self):
-        self.trigram_d = self.config.get("hyper_parameters.trigram_d")
-        self.neg_num = self.config.get("hyper_parameters.neg_num")
-        self.hidden_layers = self.config.get("hyper_parameters.fc_sizes")
-        self.hidden_acts = self.config.get("hyper_parameters.fc_acts")
-        self.learning_rate = self.config.get("hyper_parameters.learning_rate")
-        self.slice_end = self.config.get("hyper_parameters.slice_end")
-        self.learning_rate = self.config.get(
-            "hyper_parameters.optimizer.learning_rate")
+        # Revised hyper-parameter initialization
+        self.feature_nums = self.config.get("hyper_parameters.feature_nums", [5,5,5,5,5])
+        self.embedding_dim = self.config.get("hyper_parameters.embedding_dim", 8)
+        self.output_dim = self.config.get("hyper_parameters.output_dim", 16)
+        self.hidden_layers = self.config.get("hyper_parameters.hidden_layers", [40, 32])
+        self.hidden_acts = self.config.get("hyper_parameters.hidden_acts", ["relu", "relu"])
+        self.learning_rate = self.config.get("hyper_parameters.optimizer.learning_rate", 0.001)
+        self.margin = self.config.get("hyper_parameters.margin", 0.3)  # margin used in the loss
+        self.feature_num = len(self.feature_nums)
 
     def create_feeds(self, is_infer=False):
-        query = paddle.static.data(
-            name="query", shape=[-1, self.trigram_d], dtype='float32')
-        self.prune_feed_vars = [query]
+        # Define the input placeholders
+        # sample_id = paddle.static.data(
+        #    name="sample_id", shape=[-1, 1], dtype='int64')
+        feeds_list = []
+        if not is_infer:
+            label = paddle.static.data(
+                name="label", shape=[-1, 1], dtype='float32')
+            feeds_list.append(label)
+        left_features = paddle.static.data(
+            name="left_features", shape=[-1, self.feature_num], dtype='float32')
+        feeds_list.append(left_features)
+        right_features = paddle.static.data(
+            name="right_features", shape=[-1, self.feature_num], dtype='float32')
+        feeds_list.append(right_features)
 
-        doc_pos = paddle.static.data(
-            name="doc_pos", shape=[-1, self.trigram_d], dtype='float32')
 
-        if is_infer:
-            return [query, doc_pos]
-
-        doc_negs = [
-            paddle.static.data(
-                name="doc_neg_" + str(i),
-                shape=[-1, self.trigram_d],
-                dtype="float32") for i in range(self.neg_num)
-        ]
-        feeds_list = [query, doc_pos] + doc_negs
         return feeds_list
 
     def net(self, input, is_infer=False):
-        dssm_model = DSSMLayer(self.trigram_d, self.neg_num, self.slice_end,
-                               self.hidden_layers, self.hidden_acts)
-        R_Q_D_p, hit_prob = dssm_model.forward(input, is_infer)
-
-        self.inference_target_var = R_Q_D_p
-        self.prune_target_var = dssm_model.query_fc
-        self.train_dump_fields = [dssm_model.query_fc, R_Q_D_p]
-        self.train_dump_params = dssm_model.params
-        self.infer_dump_fields = [dssm_model.doc_pos_fc]
+        # Instantiate the model
+        dssm_model = DSSMLayer(
+            feature_nums=self.feature_nums,
+            embedding_dim=self.embedding_dim,
+            output_dim=self.output_dim,
+            hidden_layers=self.hidden_layers,
+            hidden_acts=self.hidden_acts
+        )
+
         if is_infer:
-            fetch_dict = {'query_doc_sim': R_Q_D_p}
+            left_features, right_features = input
+        else:
+            label, left_features, right_features = input
+
+        # Get the similarity score and the tower feature vectors
+        sim_score, left_vec, right_vec = dssm_model(left_features, right_features)
+
+        self.inference_target_var = sim_score
+        self.left_vector = left_vec
+        self.right_vector = right_vec
+
+        if is_infer:
+            fetch_dict = {
+                'similarity': sim_score,
+                'left_vector': left_vec,
+                'right_vector': right_vec
+            }
             return fetch_dict
-        loss = -paddle.sum(paddle.log(hit_prob), axis=-1)
-        avg_cost = paddle.mean(x=loss)
-        # print(avg_cost)
+
+        # Compute the loss:
+        # a binary cross-entropy with a margin on the negative side
+        pos_mask = paddle.cast(label > 0.5, 'float32')
+        neg_mask = 1.0 - pos_mask
+        
+        positive_loss = -pos_mask * paddle.log(paddle.clip(sim_score, 1e-8, 1.0))
+        negative_loss = -neg_mask * paddle.log(paddle.clip(1 - sim_score + self.margin, 1e-8, 1.0))
+        
+        loss = positive_loss + negative_loss
+        avg_cost = paddle.mean(loss)
+        
         self._cost = avg_cost
-        fetch_dict = {'Loss': avg_cost}
+
+        # Compute accuracy
+        predictions = paddle.cast(sim_score > 0.5, 'float32')
+        accuracy = paddle.mean(paddle.cast(paddle.equal(predictions, label), 'float32'))
+
+        fetch_dict = {
+            'loss': avg_cost,
+            'accuracy': accuracy,
+            #'similarity': sim_score,
+            #'left_vector': left_vec,
+            #'right_vector': right_vec
+        }
         return fetch_dict
 
     def create_optimizer(self, strategy=None):
         optimizer = paddle.optimizer.Adam(
-            learning_rate=self.learning_rate, lazy_mode=True)
-        if strategy != None:
+            learning_rate=self.learning_rate)
+        if strategy is not None:
             import paddle.distributed.fleet as fleet
             optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(self._cost)
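
The replacement loss is, in effect, a binary cross-entropy with a margin of slack on the negative side: a negative pair is only penalized once its similarity exceeds the margin. A standalone numpy rendering of the same arithmetic (toy values, not from the repo):

    import numpy as np

    def margin_bce(sim, label, margin=0.3, eps=1e-8):
        pos_mask = (label > 0.5).astype('float32')
        neg_mask = 1.0 - pos_mask
        positive_loss = -pos_mask * np.log(np.clip(sim, eps, 1.0))
        negative_loss = -neg_mask * np.log(np.clip(1.0 - sim + margin, eps, 1.0))
        return float((positive_loss + negative_loss).mean())

    # A confident positive and a confident negative both incur little loss
    print(margin_bce(np.array([0.9, 0.1]), np.array([1.0, 0.0])))  # ~0.053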

+ 51 - 39
recommend-model-produce/src/main/python/models/wide_and_deep_dataset/model.py

@@ -4,6 +4,7 @@ import paddle.nn.functional as F
 import math
 
 
 class WideDeepLayer(nn.Layer):
     def __init__(self, sparse_feature_number, sparse_feature_dim,
                  dense_feature_dim, num_field, layer_sizes):
@@ -14,60 +15,71 @@ class WideDeepLayer(nn.Layer):
         self.num_field = num_field
         self.layer_sizes = layer_sizes
 
-        self.wide_part = paddle.nn.Linear(
-            in_features=self.dense_feature_dim,
-            out_features=1,
-            weight_attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.TruncatedNormal(
-                    mean=0.0, std=1.0 / math.sqrt(self.dense_feature_dim))))
-
-        self.embedding = paddle.nn.Embedding(
-            self.sparse_feature_number,
-            self.sparse_feature_dim,
-            sparse=True,
-            weight_attr=paddle.ParamAttr(
-                name="SparseFeatFactors",
-                initializer=paddle.nn.initializer.Uniform()))
-
-        sizes = [sparse_feature_dim * num_field + dense_feature_dim
-                 ] + self.layer_sizes + [1]
-        acts = ["relu" for _ in range(len(self.layer_sizes))] + [None]
-        self._mlp_layers = []
-        for i in range(len(layer_sizes) + 1):
-            linear = paddle.nn.Linear(
-                in_features=sizes[i],
-                out_features=sizes[i + 1],
-                weight_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.Normal(
-                        std=1.0 / math.sqrt(sizes[i]))))
-            self.add_sublayer('linear_%d' % i, linear)
-            self._mlp_layers.append(linear)
-            if acts[i] == 'relu':
-                act = paddle.nn.ReLU()
-                self.add_sublayer('act_%d' % i, act)
-                self._mlp_layers.append(act)
-
     def forward(self, sparse_inputs, dense_inputs):
         # wide part
-        wide_output = self.wide_part(dense_inputs)
+        # Build the wide part with paddle.static.nn.fc
+        wide_output = paddle.static.nn.fc(
+            x=dense_inputs,  # the parameter is named x, not input
+            size=1,
+            weight_attr=paddle.ParamAttr(
+                initializer=paddle.nn.initializer.TruncatedNormal(
+                    mean=0.0, std=1.0 / math.sqrt(self.dense_feature_dim))),
+            bias_attr=paddle.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(0.0))
+        )
 
         # deep part
         sparse_embs = []
-        for s_input in sparse_inputs:
-            #emb = self.embedding(s_input)
-            emb = paddle.static.nn.sparse_embedding(s_input, size = [1024, self.sparse_feature_dim], param_attr=paddle.ParamAttr(name="embedding"))
+        for i, s_input in enumerate(sparse_inputs):
+            emb = paddle.static.nn.embedding(
+                input=s_input,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=paddle.ParamAttr(
+                    name=f"embedding_{i}",
+                    initializer=paddle.nn.initializer.Uniform()))
             emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
             sparse_embs.append(emb)
 
         deep_output = paddle.concat(x=sparse_embs + [dense_inputs], axis=1)
-        for n_layer in self._mlp_layers:
-            deep_output = n_layer(deep_output)
+        
+        # Build the deep MLP layers
+        sizes = [self.sparse_feature_dim * self.num_field + self.dense_feature_dim] + self.layer_sizes + [1]
+        acts = ["relu" for _ in range(len(self.layer_sizes))] + [None]
+        
+        for i in range(len(sizes) - 1):
+            deep_output = paddle.static.nn.fc(
+                x=deep_output,  # the parameter is named x, not input
+                size=sizes[i + 1],
+                activation=acts[i],  # the parameter is named activation, not act
+                weight_attr=paddle.ParamAttr(
+                    name=f'fc_{i}_w',
+                    initializer=paddle.nn.initializer.Normal(
+                        std=1.0 / math.sqrt(sizes[i]))),
+                bias_attr=paddle.ParamAttr(
+                    name=f'fc_{i}_b',
+                    initializer=paddle.nn.initializer.Constant(0.0))
+            )
 
         prediction = paddle.add(x=wide_output, y=deep_output)
         pred = F.sigmoid(prediction)
         return pred
 
 
 class WideDeepModel:
     def __init__(self, sparse_feature_number=1000001, sparse_inputs_slots=27, sparse_feature_dim=10, dense_input_dim=13, fc_sizes=[400, 400, 400]):
         self.sparse_feature_number = sparse_feature_number

+ 2 - 1
recommend-model-produce/src/main/python/requirements.txt

@@ -1 +1,2 @@
-PyYAML==6.0.2
+PyYAML==6.0.2
+oss2==2.19.1

+ 1 - 1
recommend-model-produce/src/main/python/tools/readme.md

@@ -17,7 +17,7 @@
 |  paddle_infer.py  |  Run inference with the Python inference library  |  Can be launched on Linux  |  See [Python inference library](../doc/inference.md) for details  |
 |  rec_client.py  |  Deploy the client after the serving service has started on the server  |  Can be launched on Linux  |  See [online Serving deployment](../doc/serving.md) for details  |
 |  static_ps_online_trainer.py  |  Streaming training  |  Can be launched from any directory on Linux via a relative path  |  See [streaming training](../doc/online_trainer.md) for details  |
-  |  static_ps_offline_infer.py  |  Distributed inference under the parameter-server (ParameterServer) mode; currently only static-graph inference is supported  |  Can be launched from any directory on Linux via a relative path  |  Runs from any directory; taking the slot_dnn model as an example, the command from the PaddleRec root is: fleetrun --worker_num=1 --server_num=1 tools/static_ps_offline_infer.py -m models/rank/slot_dnn/config_offline_infer.yaml |
+ |  static_ps_offline_infer.py  |  Distributed inference under the parameter-server (ParameterServer) mode; currently only static-graph inference is supported  |  Can be launched from any directory on Linux via a relative path  |  Runs from any directory; taking the slot_dnn model as an example, the command from the PaddleRec root is: fleetrun --worker_num=1 --server_num=1 tools/static_ps_offline_infer.py -m models/rank/slot_dnn/config_offline_infer.yaml |
 |  to_static.py  |  Convert a model saved from dynamic graph to static graph for inference  |  Can be launched on Linux  |  See [Python inference library](../doc/inference.md) for details  |
 |  webserver.py  |  Start the serving server via the web  |  Can be launched on Linux  |  See [online Serving deployment](../doc/serving.md) for details  |
 |  onoff_diff  |  Online/offline consistency check  |  Can be launched from any directory on Linux via a relative path  |  See [online/offline consistency check](../doc/onoff_diff.md) for details  |

+ 25 - 4
recommend-model-produce/src/main/python/tools/static_ps_trainer.py

@@ -100,7 +100,9 @@ class Main(object):
         self.pure_bf16 = self.config['pure_bf16']
 
     def run(self):
+        logger.info("Begin 11111111") 
         self.init_fleet_with_gloo()
+        logger.info("Begin 22222222") 
         self.network()
         if fleet.is_server():
             self.run_server()
@@ -112,16 +114,27 @@ class Main(object):
 
     def init_fleet_with_gloo(self, use_gloo=True):
         if use_gloo:
-            os.environ["PADDLE_WITH_GLOO"] = "1"
-            role = role_maker.PaddleCloudRoleMaker()
+            os.environ["PADDLE_WITH_GLOO"] = "0"
+            logger.info("Begin 11111111222222") 
+            role = role_maker.PaddleCloudRoleMaker(
+                is_collective=False,
+                init_gloo=False
+            ) 
+            logger.info("Begin 11111111333333") 
             fleet.init(role)
+            #logger.info("worker_index: %s", fleet.worker_index())
+            #logger.info("is_first_worker: %s", fleet.is_first_worker())
+            #logger.info("worker_num: %s", fleet.worker_num())
+            #logger.info("is_distributed: %s", fleet.is_distributed())
+            #logger.info("mode: %s", fleet.mode)
+
         else:
             fleet.init()
 
     def network(self):
         self.model = get_model(self.config)
         self.input_data = self.model.create_feeds()
-        self.inference_feed_var = self.model.create_feeds(is_infer=False)
+        self.inference_feed_var = self.model.create_feeds(is_infer=True)
         self.init_reader()
         self.metrics = self.model.net(self.input_data)
         self.inference_target_var = self.model.inference_target_var
@@ -242,11 +255,14 @@ class Main(object):
 
     def dataset_train_loop(self, epoch):
         logger.info("Epoch: {}, Running Dataset Begin.".format(epoch))
+        
         fetch_info = [
             "Epoch {} Var {}".format(epoch, var_name)
             for var_name in self.metrics
         ]
+
         fetch_vars = [var for _, var in self.metrics.items()]
+
         print_step = int(config.get("runner.print_interval"))
 
         debug = config.get("runner.dataset_debug", False)
@@ -258,7 +274,8 @@ class Main(object):
                 "dump_fields_path": dump_fields_path,
                 "dump_fields": config.get("runner.dump_fields")
             })
-        print(paddle.static.default_main_program()._fleet_opt)
+        logger.info(paddle.static.default_main_program()._fleet_opt)
+        
         self.exe.train_from_dataset(
             program=paddle.static.default_main_program(),
             dataset=self.reader,
@@ -266,6 +283,7 @@ class Main(object):
             fetch_info=fetch_info,
             print_period=print_step,
             debug=debug)
+        
 
     def heter_train_loop(self, epoch):
         logger.info(
@@ -317,7 +335,10 @@ class Main(object):
 
 if __name__ == "__main__":
     paddle.enable_static()
+ 
     config = parse_args()
     os.environ["CPU_NUM"] = str(config.get("runner.thread_num"))
     benchmark_main = Main(config)
+ 
     benchmark_main.run()
+  
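
The network() change above builds the inference feeds with create_feeds(is_infer=True), so the saved inference model consumes only the two feature slots, while the label placeholder exists only in the training feeds. A sketch against the DSSM StaticModel (model is assumed to be a static_model.StaticModel instance inside a static program):

    train_feeds = model.create_feeds(is_infer=False)
    print([v.name for v in train_feeds])  # ['label', 'left_features', 'right_features']

    infer_feeds = model.create_feeds(is_infer=True)
    print([v.name for v in infer_feeds])  # ['left_features', 'right_features']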

+ 340 - 0
recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

@@ -0,0 +1,340 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+os.environ['FLAGS_enable_pir_api'] = '0'
+from utils.static_ps.reader_helper_hdfs import get_reader
+from utils.static_ps.program_helper import get_model, get_strategy, set_dump_config
+from utils.static_ps.metric_helper import set_zero, get_global_auc
+from utils.static_ps.common_ps import YamlHelper, is_distributed_env
+import argparse
+import time
+import sys
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle
+
+import warnings
+import logging
+import ast
+import numpy as np
+import struct
+from utils.utils_single import auc
+from utils.oss_client import HangZhouOSSClient
+import utils.compress as compress
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
+
+root_logger = logging.getLogger()
+for handler in root_logger.handlers[:]:
+    root_logger.removeHandler(handler)
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("PaddleRec train script")
+    parser.add_argument("-o", "--opt", nargs='*', type=str)
+    parser.add_argument(
+        '-m',
+        '--config_yaml',
+        type=str,
+        required=True,
+        help='config file path')
+    parser.add_argument(
+        '-bf16',
+        '--pure_bf16',
+        type=ast.literal_eval,
+        default=False,
+        help="whether use bf16")
+    args = parser.parse_args()
+    args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml))
+    yaml_helper = YamlHelper()
+    config = yaml_helper.load_yaml(args.config_yaml)
+    # modify config from command
+    if args.opt:
+        for parameter in args.opt:
+            parameter = parameter.strip()
+            key, value = parameter.split("=")
+            if type(config.get(key)) is int:
+                value = int(value)
+            if type(config.get(key)) is float:
+                value = float(value)
+            if type(config.get(key)) is bool:
+                value = (True if value.lower() == "true" else False)
+            config[key] = value
+    config["yaml_path"] = args.config_yaml
+    config["config_abs_dir"] = args.abs_dir
+    config["pure_bf16"] = args.pure_bf16
+    yaml_helper.print_yaml(config)
+    return config
+
+
+def bf16_to_fp32(val):
+    return np.float32(struct.unpack('<f', struct.pack('<I', val << 16))[0])
+
+
+class Main(object):
+    def __init__(self, config):
+        self.metrics = {}
+        self.config = config
+        self.input_data = None
+        self.reader = None
+        self.exe = None
+        self.train_result_dict = {}
+        self.train_result_dict["speed"] = []
+        self.train_result_dict["auc"] = []
+        self.model = None
+        self.pure_bf16 = self.config['pure_bf16']
+
+    def run(self):
+        logger.info("Begin 11111111") 
+        self.init_fleet_with_gloo()
+        logger.info("Begin 22222222") 
+        self.network()
+        if fleet.is_server():
+            self.run_server()
+        elif fleet.is_worker():
+            self.run_worker()
+            fleet.stop_worker()
+            self.record_result()
+        logger.info("Run Success, Exit.")
+
+    def init_fleet_with_gloo(self, use_gloo=True):
+        if use_gloo:
+            os.environ["PADDLE_WITH_GLOO"] = "0"
+            logger.info("Begin 11111111222222") 
+            role = role_maker.PaddleCloudRoleMaker(
+                is_collective=False,
+                init_gloo=False
+            ) 
+            logger.info("Begin 11111111333333") 
+            fleet.init(role)
+            #logger.info("worker_index: %s", fleet.worker_index())
+            #logger.info("is_first_worker: %s", fleet.is_first_worker())
+            #logger.info("worker_num: %s", fleet.worker_num())
+            #logger.info("is_distributed: %s", fleet.is_distributed())
+            #logger.info("mode: %s", fleet.mode)
+
+        else:
+            fleet.init()
+
+    def network(self):
+        self.model = get_model(self.config)
+        self.input_data = self.model.create_feeds()
+        self.inference_feed_var = self.model.create_feeds(is_infer=True)
+        self.init_reader()
+        self.metrics = self.model.net(self.input_data)
+        self.inference_target_var = self.model.inference_target_var
+        logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
+        self.model.create_optimizer(get_strategy(self.config))
+
+    def run_server(self):
+        logger.info("Run Server Begin")
+        fleet.init_server(config.get("runner.warmup_model_path"))
+        fleet.run_server()
+
+    def run_worker(self):
+        logger.info("Run Worker Begin")
+        use_cuda = int(config.get("runner.use_gpu"))
+        use_auc = config.get("runner.use_auc", False)
+        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+        self.exe = paddle.static.Executor(place)
+
+        with open("./{}_worker_main_program.prototxt".format(
+                fleet.worker_index()), 'w+') as f:
+            f.write(str(paddle.static.default_main_program()))
+        with open("./{}_worker_startup_program.prototxt".format(
+                fleet.worker_index()), 'w+') as f:
+            f.write(str(paddle.static.default_startup_program()))
+
+        self.exe.run(paddle.static.default_startup_program())
+        if self.pure_bf16:
+            self.model.optimizer.amp_init(self.exe.place)
+        fleet.init_worker()
+
+        save_model_path = self.config.get("runner.model_save_path")
+        if save_model_path and (not os.path.exists(save_model_path)):
+            os.makedirs(save_model_path)
+
+        reader_type = self.config.get("runner.reader_type", "QueueDataset")
+        epochs = int(self.config.get("runner.epochs"))
+        sync_mode = self.config.get("runner.sync_mode")
+        opt_info = paddle.static.default_main_program()._fleet_opt
+        if use_auc is True:
+            opt_info['stat_var_names'] = [
+                self.model.stat_pos.name, self.model.stat_neg.name
+            ]
+        else:
+            opt_info['stat_var_names'] = []
+
+        if reader_type == "InmemoryDataset":
+            self.reader.load_into_memory()
+
+        for epoch in range(epochs):
+            epoch_start_time = time.time()
+
+            if sync_mode == "heter":
+                self.heter_train_loop(epoch)
+            elif reader_type == "QueueDataset":
+                self.dataset_train_loop(epoch)
+            elif reader_type == "InmemoryDataset":
+                self.dataset_train_loop(epoch)
+
+            epoch_time = time.time() - epoch_start_time
+
+            if use_auc is True:
+                global_auc = get_global_auc(paddle.static.global_scope(),
+                                            self.model.stat_pos.name,
+                                            self.model.stat_neg.name)
+                self.train_result_dict["auc"].append(global_auc)
+                set_zero(self.model.stat_pos.name,
+                         paddle.static.global_scope())
+                set_zero(self.model.stat_neg.name,
+                         paddle.static.global_scope())
+                set_zero(self.model.batch_stat_pos.name,
+                         paddle.static.global_scope())
+                set_zero(self.model.batch_stat_neg.name,
+                         paddle.static.global_scope())
+                logger.info(
+                    "Epoch: {}, using time: {} second, ips: {}/sec. auc: {}".
+                    format(epoch, epoch_time, self.count_method,
+                           global_auc))
+            else:
+                logger.info(
+                    "Epoch: {}, using time {} second, ips  {}/sec.".format(
+                        epoch, epoch_time, self.count_method))
+
+            model_dir = "{}/{}".format(save_model_path, epoch)
+
+            if is_distributed_env():
+                fleet.save_inference_model(
+                    self.exe, model_dir,
+                    [feed.name for feed in self.inference_feed_var],
+                    self.inference_target_var)
+            else:
+                paddle.static.save_inference_model(
+                    model_dir,
+                    self.inference_feed_var,  # feed Variables, not their names
+                    [self.inference_target_var], self.exe)
+
+            compress.compress_tar(model_dir, "test")
+            client = HangZhouOSSClient("art-recommend")
+            client.put_object_from_file("dyp/test.tar.gz", "test.tar.gz")
+
+        if reader_type == "InmemoryDataset":
+            self.reader.release_memory()
+
+    def init_reader(self):
+        if fleet.is_server():
+            return
+        self.config["runner.reader_type"] = self.config.get(
+            "runner.reader_type", "QueueDataset")
+        self.reader, self.file_list = get_reader(self.input_data, config)
+        self.example_nums = 0
+        self.count_method = self.config.get("runner.example_count_method",
+                                            "example")
+
+    def dataset_train_loop(self, epoch):
+        logger.info("Epoch: {}, Running Dataset Begin.".format(epoch))
+        
+        fetch_info = [
+            "Epoch {} Var {}".format(epoch, var_name)
+            for var_name in self.metrics
+        ]
+
+        fetch_vars = [var for _, var in self.metrics.items()]
+
+        print_step = int(config.get("runner.print_interval"))
+
+        debug = config.get("runner.dataset_debug", False)
+        if config.get("runner.need_dump"):
+            debug = True
+            dump_fields_path = "{}/{}".format(
+                config.get("runner.dump_fields_path"), epoch)
+            set_dump_config(paddle.static.default_main_program(), {
+                "dump_fields_path": dump_fields_path,
+                "dump_fields": config.get("runner.dump_fields")
+            })
+        logger.info(paddle.static.default_main_program()._fleet_opt)
+        
+        self.exe.train_from_dataset(
+            program=paddle.static.default_main_program(),
+            dataset=self.reader,
+            fetch_list=fetch_vars,
+            fetch_info=fetch_info,
+            print_period=print_step,
+            debug=debug)
+
+
+    def heter_train_loop(self, epoch):
+        logger.info(
+            "Epoch: {}, Running Begin. Check running metrics at heter_log".
+            format(epoch))
+        reader_type = self.config.get("runner.reader_type")
+        if reader_type == "QueueDataset":
+            self.exe.train_from_dataset(
+                program=paddle.static.default_main_program(),
+                dataset=self.reader,
+                debug=config.get("runner.dataset_debug"))
+        elif reader_type == "DataLoader":
+            batch_id = 0
+            train_run_cost = 0.0
+            total_examples = 0
+            self.reader.start()
+            while True:
+                try:
+                    train_start = time.time()
+                    # --------------------------------------------------- #
+                    self.exe.run(program=paddle.static.default_main_program())
+                    # --------------------------------------------------- #
+                    train_run_cost += time.time() - train_start
+                    total_examples += self.config.get("runner.batch_size")
+                    batch_id += 1
+                    print_step = int(config.get("runner.print_period"))
+                    if batch_id % print_step == 0:
+                        profiler_string = ""
+                        profiler_string += "avg_batch_cost: {} sec, ".format(
+                            format((train_run_cost) / print_step, '.5f'))
+                        profiler_string += "avg_samples: {}, ".format(
+                            format(total_examples / print_step, '.5f'))
+                        profiler_string += "ips: {} {}/sec ".format(
+                            format(total_examples / (train_run_cost), '.5f'),
+                            self.count_method)
+                        logger.info("Epoch: {}, Batch: {}, {}".format(
+                            epoch, batch_id, profiler_string))
+                        train_run_cost = 0.0
+                        total_examples = 0
+                except paddle.core.EOFException:
+                    self.reader.reset()
+                    break
+
+    def record_result(self):
+        logger.info("train_result_dict: {}".format(self.train_result_dict))
+        with open("./train_result_dict.txt", 'w+') as f:
+            f.write(str(self.train_result_dict))
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+ 
+    config = parse_args()
+    os.environ["CPU_NUM"] = str(config.get("runner.thread_num"))
+    benchmark_main = Main(config)
+ 
+    benchmark_main.run()
+  

+ 36 - 0
recommend-model-produce/src/main/python/tools/utils/compress.py

@@ -0,0 +1,36 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tarfile
+
+def compress_tar(folder_path, output_filename):
+    # Ensure the output filename ends with .tar.gz
+    if not output_filename.endswith('.tar.gz'):
+        output_filename += '.tar.gz'
+
+    # Open a tarfile object; mode 'w:gz' writes a gzip-compressed tar archive
+    with tarfile.open(output_filename, "w:gz") as tar:
+        # Walk the directory tree with os.walk()
+        for root, dirs, files in os.walk(folder_path):
+            for file in files:
+                # Build the full file path
+                file_path = os.path.join(root, file)
+                # Add the file; arcname sets its relative path inside the archive
+                tar.add(file_path, arcname=os.path.relpath(file_path, start=folder_path))
+
+if __name__ == "__main__":
+    folder_path="/Users/dingyunpeng/tardemo"
+    output_filename="/Users/dingyunpeng/tardemo"
+    compress_tar(folder_path, output_filename)
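
For completeness, the matching extraction needs only the standard library (the archive name follows the trainer's upload above; the target directory is illustrative):

    import tarfile

    with tarfile.open("test.tar.gz", "r:gz") as tar:
        tar.extractall(path="restored_model")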

+ 27 - 0
recommend-model-produce/src/main/python/tools/utils/my_hdfs_client.py

@@ -0,0 +1,27 @@
+from paddle.distributed.fleet.utils import HDFSClient
+import os
+
+class ExecuteError(Exception):
+    pass
+class MyHDFSClient(HDFSClient):
+    def _ls_dir(self, fs_path):
+        # Run "hadoop fs -ls" on the path and split the listing into dirs and files
+        cmd = "ls " + fs_path
+        ret, lines = super()._run_cmd(cmd)
+
+        if ret != 0:
+            raise ExecuteError(cmd)
+
+        dirs = []
+        files = []
+        for line in lines:
+            arr = line.split()
+            if len(arr) != 8:  # a valid listing row has 8 columns
+                continue
+
+            p = arr[7].strip()  # the last column is the path
+            if arr[0][0] == 'd':  # the permission string starts with 'd' for directories
+                dirs.append(p)
+            else:
+                files.append(p)
+
+        return dirs, files
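
A usage sketch mirroring how reader_helper_hdfs.py constructs this client (the Hadoop home and namenode address are the values hardcoded there; the listed path is illustrative). The public ls_dir of the HDFSClient base class dispatches to the _ls_dir override:

    from utils.my_hdfs_client import MyHDFSClient

    hadoop_home = "/app/env/hadoop-3.2.4"
    configs = {
        "fs.default.name": "hdfs://192.168.141.208:9000",
        "hadoop.job.ugi": ""
    }
    client = MyHDFSClient(hadoop_home, configs)
    dirs, files = client.ls_dir("/dyp")
    print(dirs, files)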

+ 30 - 0
recommend-model-produce/src/main/python/tools/utils/oss_client.py

@@ -0,0 +1,30 @@
+import os
+import oss2
+import logging
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Read the credentials from the environment rather than hardcoding them
+access_key_id = os.environ.get("OSS_ACCESS_KEY_ID", "")
+access_key_secret = os.environ.get("OSS_ACCESS_KEY_SECRET", "")
+auth = oss2.AuthV4(access_key_id, access_key_secret)
+
+hangzhou_config = {
+    "endpoint" : "https://oss-cn-hangzhou.aliyuncs.com",
+    "inner_endpoint" : "https://oss-cn-hangzhou-internal.aliyuncs.com",
+    "region" : "cn-hangzhou"
+}
+
+class HangZhouOSSClient:
+    def __init__(self, bucket_name):
+        self.bucket_name = bucket_name
+        self.bucket = oss2.Bucket(auth, hangzhou_config["inner_endpoint"], bucket_name, region=hangzhou_config["region"])
+
+    def put_object_from_file(self, object_name, local_file):
+        result = self.bucket.put_object_from_file(object_name, local_file)
+        logger.info("\n status: {} \n request_id: {} \n ETag: {} \n date: {}".format(result.status, result.request_id,
+        result.etag, result.headers['date']))
+
+if __name__ == "__main__":
+    client = HangZhouOSSClient("art-recommend")
+    client.put_object_from_file("dyp/stuuudy.pem", "/Users/dingyunpeng/stuuudy.pem")

+ 304 - 0
recommend-model-produce/src/main/python/tools/utils/static_ps/reader_helper_hdfs.py

@@ -0,0 +1,304 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+import sys
+import warnings
+import logging
+import paddle
+import paddle.distributed.fleet.base.role_maker as role_maker
+import paddle.distributed.fleet as fleet
+import paddle.distributed as dist
+from utils.my_hdfs_client import MyHDFSClient
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+from . import common_ps
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+hadoop_home = "/app/env/hadoop-3.2.4"  # Hadoop install directory
+configs = {
+    "fs.default.name": "hdfs://192.168.141.208:9000",  # HDFS 名称和端口
+    "hadoop.job.ugi": ""  # HDFS 用户和密码
+}
+hdfs_client = MyHDFSClient(hadoop_home, configs)
+
+
+def get_reader(input_var, config):
+    reader_type = config.get("runner.reader_type")
+    train_data_path = config.get("runner.train_data_dir")
+    assert train_data_path != ""
+
+    assert reader_type in [
+        "QueueDataset", "DataLoader", "RecDataset", "InmemoryDataset", None
+    ]
+    file_list = get_file_list(train_data_path, config)
+    print("train file_list: {}".format(file_list))
+    if reader_type == "QueueDataset":
+        reader_instance = QueueDataset(input_var, file_list, config)
+        return reader_instance.get_reader(), file_list
+    elif reader_type == "InmemoryDataset":
+        reader_instance = InmemoryDataset(input_var, file_list, config)
+        return reader_instance.get_reader(), file_list
+    elif reader_type == "DataLoader":
+        reader_instance = DataLoader(input_var, file_list, config)
+        return reader_instance.get_reader(), file_list
+    elif reader_type is None or reader_type == "RecDataset":
+        reader_instance = RecDatasetReader(input_var, file_list, config)
+        return reader_instance.get_reader(), file_list
+
+
+def get_infer_reader(input_var, config):
+    test_data_path = config.get("runner.test_data_dir")
+    assert test_data_path != ""
+    test_data_path = os.path.join(config["config_abs_dir"], test_data_path)
+    print("test_data_path is: {}".format(test_data_path))
+    file_list = get_file_list(test_data_path, config)
+    print("test file_list: {}".format(file_list))
+    reader_type = config.get("runner.infer_reader_type")
+    if reader_type == "QueueDataset":
+        reader_instance = QueueDataset(input_var, file_list, config)
+        return reader_instance.get_infer_reader(), file_list
+    else:
+        reader_instance = InferDataLoader(input_var, file_list, config)
+        return reader_instance.get_reader(), file_list
+
+
+def get_file_list(data_path, config):
+    dirs, file_list = hdfs_client.ls_dir(data_path)
+    # Shard the file list across workers if configured
+    if config.get("runner.split_file_list"):
+        logger.info("Split file list for worker {}".format(dist.get_rank()))
+        file_list = fleet.util.get_file_shard(file_list)
+    logger.info("File list: {}".format(file_list))
+
+    base_url = configs["fs.default.name"]
+    full_paths = [base_url + file for file in file_list]
+
+    return full_paths
+
+
+def get_reader_generator(path, reader_name="Reader"):
+    reader_class = common_ps.lazy_instance_by_fliename(path, reader_name)()
+    return reader_class
+
+
+class RecDatasetReader(object):
+    def __init__(self, input_var, file_list, config):
+        assert isinstance(input_var, list)
+        assert len(file_list) > 0
+        self.input_var = input_var
+        self.file_list = file_list
+        self.config = config
+
+    def get_reader(self):
+        logger.info("Get DataLoader")
+
+        config_abs_dir = self.config.get("config_abs_dir", None)
+        reader_path = self.config.get('runner.train_reader_path', 'reader')
+        reader_path = os.path.join(config_abs_dir, reader_path)
+        logger.info("Reader Path: {}".format(reader_path))
+
+        from paddle.io import DataLoader
+        dataset = common_ps.lazy_instance_by_fliename(reader_path,
+                                                      "RecDataset")
+        print("dataset: {}".format(dataset))
+
+        use_cuda = int(self.config.get("runner.use_gpu"))
+        batch_size = self.config.get('runner.train_batch_size', None)
+        place = paddle.set_device('gpu' if use_cuda else 'cpu')
+
+        generator = dataset(self.file_list, self.config)
+        generator.init()
+        loader = DataLoader(
+            generator, batch_size=batch_size, places=place, drop_last=True)
+        return loader
+
+
+class DataLoader(object):
+    def __init__(self, input_var, file_list, config):
+        assert isinstance(input_var, list)
+        assert len(file_list) > 0
+        self.input_var = input_var
+        self.file_list = file_list
+        self.config = config
+
+    def get_reader(self):
+        logger.info("Get DataLoader")
+        loader = paddle.io.DataLoader.from_generator(
+            feed_list=self.input_var,
+            capacity=64,
+            iterable=False,
+            use_double_buffer=False)
+        path = self.config.get("runner.train_reader_path")
+        path = os.path.join(self.config["config_abs_dir"], path)
+        generator = get_reader_generator(path)
+        generator.init(self.config)
+        batch_size = int(self.config.get("runner.train_batch_size"))
+        batch_generator = self.config.get("runner.batch_generator", False)
+        if batch_generator:
+            loader.set_batch_generator(generator.dataloader(self.file_list))
+        else:
+            loader.set_sample_generator(
+                generator.dataloader(self.file_list),
+                batch_size=batch_size,
+                drop_last=True,
+                places=paddle.static.cpu_places())
+        return loader
+
+
+class InferDataLoader(object):
+    def __init__(self, input_var, file_list, config):
+        assert isinstance(input_var, list)
+        assert len(file_list) > 0
+        self.input_var = input_var
+        self.file_list = file_list
+        self.config = config
+
+    def get_reader(self):
+        logger.info("Get DataLoader")
+        use_cuda = int(self.config.get("runner.use_gpu"))
+        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+        data_dir = self.config.get("runner.test_data_dir", None)
+        batch_size = self.config.get('runner.infer_batch_size', None)
+        reader_path = self.config.get('runner.infer_reader_path', 'reader')
+        num_workers = int(self.config.get('runner.num_workers', 0))
+        config_abs_dir = self.config.get("config_abs_dir", None)
+        data_dir = os.path.join(config_abs_dir, data_dir)
+        file_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
+        user_define_reader = self.config.get('runner.user_define_reader',
+                                             False)
+        logger.info("reader path:{}".format(reader_path))
+        from importlib import import_module
+        reader_class = import_module(reader_path)
+        dataset = reader_class.RecDataset(file_list, config=self.config)
+        loader = paddle.io.DataLoader(
+            dataset,
+            batch_size=batch_size,
+            places=place,
+            drop_last=True,
+            num_workers=num_workers)
+        return loader
+
+
+class QueueDataset(object):
+    def __init__(self, input_var, file_list, config):
+        assert isinstance(input_var, list)
+        assert len(file_list) > 0
+        self.config = config
+        self.input_var = input_var
+        self.file_list = file_list
+        self.parse_ins_id = self.config.get("runner.parse_ins_id")
+        print("parse ins id:", self.parse_ins_id)
+        self.pipe_command = self.config.get("runner.pipe_command")
+        self.train_reader = self.config.get("runner.train_reader_path")
+        assert self.pipe_command is not None
+        utils_path = common_ps.get_utils_file_path()
+        print("utils_path: {}".format(utils_path))
+        abs_train_reader = os.path.join(config["config_abs_dir"],
+                                        self.train_reader)
+        print("abs_train_reader is: {}".format(abs_train_reader))
+        self.pipe_command = self.pipe_command.replace(self.train_reader,
+                                                      abs_train_reader)
+        self.pipe_command = "{} {} {}".format(self.pipe_command,
+                                              config.get("yaml_path"),
+                                              utils_path)
+        print("pipe_command is: {}".format(self.pipe_command))
+        self.batch_size = int(self.config.get("runner.train_batch_size"))
+        assert self.batch_size >= 1
+        self.thread_num = int(self.config.get("runner.thread_num"))
+        print("dataset init thread_num:", self.thread_num)
+        assert self.thread_num >= 1
+
+    def get_reader(self):
+        logger.info("Get Train Dataset")
+        dataset = paddle.distributed.QueueDataset()
+        dataset.init(
+            use_var=self.input_var,
+            pipe_command=self.pipe_command,
+            batch_size=self.batch_size,
+            thread_num=self.thread_num,
+            fs_name=configs["fs.default.name"],  # HDFS name service
+            fs_ugi=configs["hadoop.job.ugi"],  # HDFS user/group info
+            download_cmd=""
+            )
+        print("dataset get_reader thread_num:", self.thread_num)
+
+        dataset.set_filelist(self.file_list)
+        return dataset
+
+    def get_infer_reader(self):
+        logger.info("Get Infer Dataset")
+        dataset = paddle.distributed.QueueDataset()
+        self.infer_batch_size = int(self.config.get("runner.infer_batch_size"))
+        self.infer_thread_num = self.thread_num
+        dataset.init(
+            use_var=self.input_var,
+            pipe_command=self.pipe_command,
+            batch_size=self.infer_batch_size,
+            thread_num=self.infer_thread_num)
+        print("dataset get_infer_reader thread_num:", self.infer_thread_num)
+        dataset.set_filelist(self.file_list)
+        return dataset
+
+
+class InmemoryDataset(object):
+    def __init__(self, input_var, file_list, config):
+        assert isinstance(input_var, list)
+        assert len(file_list) > 0
+        self.config = config
+        self.input_var = input_var
+        self.file_list = file_list
+        self.pipe_command = self.config.get("runner.pipe_command")
+        self.train_reader = self.config.get("runner.train_reader_path")
+        assert self.pipe_command is not None
+        utils_path = common_ps.get_utils_file_path()
+        print("utils_path: {}".format(utils_path))
+        abs_train_reader = os.path.join(config["config_abs_dir"],
+                                        self.train_reader)
+        self.pipe_command = self.pipe_command.replace(self.train_reader,
+                                                      abs_train_reader)
+        self.pipe_command = "{} {} {}".format(self.pipe_command,
+                                              config.get("yaml_path"),
+                                              utils_path)
+        print(self.pipe_command)
+        self.batch_size = int(self.config.get("runner.train_batch_size"))
+        assert self.batch_size >= 1
+        self.thread_num = int(self.config.get("runner.thread_num"))
+        assert self.thread_num >= 1
+        self.parse_ins_id = self.config.get("runner.parse_ins_id", False)
+        self.parse_content = self.config.get("runner.parse_content", False)
+        self.fs_name = self.config.get("runner.fs_name", "")
+        self.fs_ugi = self.config.get("runner.fs_ugi", "")
+        print("hdfs config:", self.fs_name, self.fs_ugi)
+        self.use_gpu = self.config.get("runner.use_gpu", False)
+
+    def get_reader(self):
+        logger.info("Get InmemoryDataset")
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset._set_use_ps_gpu(self.use_gpu)
+        dataset.init(
+            use_var=self.input_var,
+            pipe_command=self.pipe_command,
+            batch_size=self.batch_size,
+            thread_num=self.thread_num,
+            fs_name=self.fs_name,
+            fs_ugi=self.fs_ugi)
+        dataset.set_filelist(self.file_list)
+        dataset.update_settings(
+            parse_ins_id=self.parse_ins_id, parse_content=self.parse_content)
+        return dataset
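
get_file_list above returns fully qualified HDFS URLs by prefixing every listed path with fs.default.name; a small illustration with made-up part files:

    configs = {"fs.default.name": "hdfs://192.168.141.208:9000"}
    file_list = ["/dyp/part-00000", "/dyp/part-00001"]
    full_paths = [configs["fs.default.name"] + f for f in file_list]
    print(full_paths)
    # ['hdfs://192.168.141.208:9000/dyp/part-00000',
    #  'hdfs://192.168.141.208:9000/dyp/part-00001']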

Some files were not shown because too many files changed in this diff