丁云鹏 5 months ago
parent
revision
45766f389a
21 changed files with 958 additions and 22 deletions
  1. +0 -0     recommend-model-produce/src/main/python/models/dssm/config_ps.yaml
  2. +13 -0    recommend-model-produce/src/main/python/models/dssmdemo/__init__.py
  3. +41 -0    recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_infer.py
  4. +48 -0    recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train.py
  5. +82 -0    recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train_insid.py
  6. +41 -0    recommend-model-produce/src/main/python/models/dssmdemo/config.yaml
  7. +42 -0    recommend-model-produce/src/main/python/models/dssmdemo/config_bigdata.yaml
  8. +70 -0    recommend-model-produce/src/main/python/models/dssmdemo/config_online.yaml
  9. +46 -0    recommend-model-produce/src/main/python/models/dssmdemo/config_ps.yaml
 10. +24 -0    recommend-model-produce/src/main/python/models/dssmdemo/data/data_process.sh
 11. +27 -0    recommend-model-produce/src/main/python/models/dssmdemo/data/prepare_dump_data.sh
 12. +0 -0     recommend-model-produce/src/main/python/models/dssmdemo/data/preprocess.py
 13. +0 -0     recommend-model-produce/src/main/python/models/dssmdemo/data/test/test.txt
 14. +0 -0     recommend-model-produce/src/main/python/models/dssmdemo/data/train/train.txt
 15. +93 -0    recommend-model-produce/src/main/python/models/dssmdemo/dygraph_model.py
 16. +101 -0   recommend-model-produce/src/main/python/models/dssmdemo/net.py
 17. +140 -0   recommend-model-produce/src/main/python/models/dssmdemo/readme.md
 18. +19 -0    recommend-model-produce/src/main/python/models/dssmdemo/run.sh
 19. +85 -0    recommend-model-produce/src/main/python/models/dssmdemo/static_model.py
 20. +78 -0    recommend-model-produce/src/main/python/models/dssmdemo/transform.py
 21. +8 -22    recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

+ 0 - 0
recommend-model-produce/src/main/python/models/dssm/config_ps.yaml


+ 13 - 0
recommend-model-produce/src/main/python/models/dssmdemo/__init__.py

@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 41 - 0
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_infer.py

@@ -0,0 +1,41 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+
+from paddle.io import IterableDataset
+
+
+class RecDataset(IterableDataset):
+    def __init__(self, file_list, config):
+        super(RecDataset, self).__init__()
+        self.file_list = file_list
+
+    def __iter__(self):
+        full_lines = []
+        for file in self.file_list:
+            with open(file, "r") as rf:
+                for line in rf:
+                    output_list = []
+                    features = line.rstrip('\n').split('\t')
+                    query = [
+                        float(feature) for feature in features[0].split(',')
+                    ]
+                    output_list.append(np.array(query).astype('float32'))
+                    pos_doc = [
+                        float(feature) for feature in features[1].split(',')
+                    ]
+                    output_list.append(np.array(pos_doc).astype('float32'))
+                    yield output_list

+ 48 - 0
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train.py

@@ -0,0 +1,48 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+
+from paddle.io import IterableDataset
+
+
+class RecDataset(IterableDataset):
+    def __init__(self, file_list, config):
+        super(RecDataset, self).__init__()
+        self.file_list = file_list
+
+    def __iter__(self):
+        full_lines = []
+        for file in self.file_list:
+            with open(file, "r") as rf:
+                for line in rf:
+                    output_list = []
+                    features = line.rstrip('\n').split('\t')
+                    query = [
+                        float(feature) for feature in features[0].split(',')
+                    ]
+                    output_list.append(np.array(query).astype('float32'))
+                    pos_doc = [
+                        float(feature) for feature in features[1].split(',')
+                    ]
+                    output_list.append(np.array(pos_doc).astype('float32'))
+
+                    for i in range(len(features) - 2):
+                        output_list.append(
+                            np.array([
+                                float(feature)
+                                for feature in features[i + 2].split(',')
+                            ]).astype('float32'))
+                    yield output_list
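
A minimal sketch of how this reader is typically driven locally, assuming a `data/train/train.txt` whose lines are tab-separated comma lists (`<query>\t<pos_doc>\t<neg_doc>...`), matching the parsing above; the file name and batch size are illustrative:

```python
# Sketch: iterate bq_reader_train.RecDataset through a Paddle DataLoader.
from paddle.io import DataLoader
from bq_reader_train import RecDataset

dataset = RecDataset(["data/train/train.txt"], config=None)  # config is unused by this reader
loader = DataLoader(dataset, batch_size=8)
for batch in loader:
    # batch[0]: query, batch[1]: pos_doc, batch[2:]: negative docs
    print([t.shape for t in batch])
    break
```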

+ 82 - 0
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train_insid.py

@@ -0,0 +1,82 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import yaml
+import six
+import os
+import copy
+import paddle.distributed.fleet as fleet
+import logging
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class Reader(fleet.MultiSlotStringDataGenerator):
+    def init(self, config):
+        self.config = config
+        self.neg_num = self.config.get("hyper_parameters.neg_num")
+
+    def line_process(self, line):
+        data = line.rstrip('\n').split('\t')
+        ins_id = [data[0]]
+        content = [data[1]]
+        features = data[2:]
+        query = features[0].split(',')
+        pos_doc = features[1].split(',')
+
+        neg_doc_list = []
+        for i in range(self.neg_num):
+            neg_doc_list.append(features[i + 2].split(','))
+
+        return [ins_id, content, query, pos_doc] + neg_doc_list
+
+    def generate_sample(self, line):
+        "Dataset Generator"
+
+        def reader():
+            input_data = self.line_process(line)
+            feature_name = ["insid", "content", "query", "pos_doc"]
+            for i in range(self.neg_num):
+                feature_name.append("neg_doc_{}".format(i))
+            yield zip(feature_name, input_data)
+
+        return reader
+
+    def dataloader(self, file_list):
+        "DataLoader Pyreader Generator"
+
+        def reader():
+            for file in file_list:
+                with open(file, 'r') as f:
+                    for line in f:
+                        input_data = self.line_process(line)
+                        yield input_data
+
+        return reader
+
+
+if __name__ == "__main__":
+    yaml_path = sys.argv[1]
+    utils_path = sys.argv[2]
+    sys.path.append(utils_path)
+    import common_ps
+    yaml_helper = common_ps.YamlHelper()
+    config = yaml_helper.load_yaml(yaml_path)
+
+    r = Reader()
+    r.init(config)
+    # r.init(None)
+    r.run_from_stdin()
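
A minimal sanity check for the line parser above, feeding one synthetic line directly to `line_process` instead of going through `run_from_stdin`; the flat dict stands in for the dotted keys the YAML helper exposes, and the sample values are illustrative:

```python
# Sketch: exercise Reader.line_process on one synthetic sample line.
from bq_reader_train_insid import Reader

r = Reader()
r.init({"hyper_parameters.neg_num": 1})  # flat stand-in for the YAML-loaded config
line = "1\titem_1\t0.1,0.2\t0.3,0.4\t0.5,0.6"  # insid \t content \t query \t pos_doc \t neg_doc
print(r.line_process(line))
# [['1'], ['item_1'], ['0.1', '0.2'], ['0.3', '0.4'], ['0.5', '0.6']]
```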

+ 41 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config.yaml

@@ -0,0 +1,41 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 8
+  model_save_path: "output_model_dssm"
+
+  use_gpu: False
+  epochs: 1
+  print_interval: 10
+  
+  test_data_dir: "data/test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']
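
The readers and models in this commit look values up with dotted keys such as `hyper_parameters.neg_num`; PaddleRec's YAML helper resolves such keys against the nested config. A minimal stand-in for that lookup, under the assumption that it simply walks the nesting (the helper name is illustrative):

```python
# Sketch: dotted-key lookup over a nested YAML config.
import yaml

def get_dotted(cfg, key, default=None):
    node = cfg
    for part in key.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)
print(get_dotted(cfg, "hyper_parameters.neg_num"))  # -> 1
```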

+ 42 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config_bigdata.yaml

@@ -0,0 +1,42 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+runner:
+  train_data_dir: "../../../datasets/BQ_dssm/big_train"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 128
+  model_save_path: "output_model_all_dssm"
+
+  use_gpu: False
+  epochs: 1
+  print_interval: 1
+  
+  test_data_dir: "../../../datasets/BQ_dssm/big_test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_all_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 5913
+  neg_num: 1
+  slice_end: 128
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

+ 70 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config_online.yaml

@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train_with_insid"
+  test_data_dir: "data/test_with_insid"
+  # train_reader_path: "bq_reader_train"  # importlib format
+  days: "{20210803..20210804}"
+  pass_per_day: 1
+  train_batch_size: 8
+  test_batch_size: 8
+  model_save_path: "output_model_dssm"
+
+  reader_type: "InmemoryDataset"  # DataLoader / QueueDataset / RecDataset / InmemoryDataset
+  pipe_command: "python3 bq_reader_train_insid.py"
+
+  sync_mode: "async"
+  # thread_num: 1
+  train_thread_num: 1
+  test_thread_num: 1
+
+  use_gpu: False
+  epochs: 1
+  print_interval: 1
+
+  dataset_debug: False
+
+  # when you need to prune the net, set need_prune to True,
+  # and set prune_feed_vars and prune_target_var in static_model.py
+  need_prune: True
+
+  parse_ins_id: True
+  parse_content: True
+  
+  # when you need to dump fields and params during training, set need_train_dump to True,
+  # and set train_dump_fields and train_dump_params in static_model.py
+  need_train_dump: True
+  # train_dump_fields_dir: "afs:/xxx"
+  train_dump_fields_dir: "./train_dump_data"
+
+  # when you need to dump fields during inference, set need_infer_dump to True,
+  # and set infer_dump_fields in static_model.py
+  need_infer_dump: True
+  # infer_dump_fields_dir: "afs:/xxx"
+  infer_dump_fields_dir: "./infer_dump_data"
+
+  fs_name: "afs://xxx"
+  fs_ugi: "xxx,xxx"
+  
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

+ 46 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config_ps.yaml

@@ -0,0 +1,46 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 8
+  model_save_path: "output_model_dssm"
+
+  reader_type: "QueueDataset"  # DataLoader / QueueDataset / RecDataset
+  pipe_command: "python bq_reader_train_ps.py"
+  thread_num: 1
+  sync_mode: "async"
+
+  use_gpu: False
+  epochs: 10
+  print_interval: 10
+  
+  test_data_dir: "data/test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

+ 24 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/data_process.sh

@@ -0,0 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
+mv bq/train.txt ./raw_data.txt
+python3 preprocess.py
+mkdir big_train
+mv train.txt ./big_train
+mkdir big_test
+mv test.txt ./big_test

+ 27 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/prepare_dump_data.sh

@@ -0,0 +1,27 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+
+
+cat train/train.txt | awk -F'\t' 'BEGIN{OFS="\t"}{print NR, "item_"NR, $0}' > data_with_lineid
+for i in 20210803 20210804
+do
+    for j in 1
+    do
+        mkdir -p train_with_insid/$i/$j
+        cp data_with_lineid train_with_insid/$i/$j
+        mkdir -p test_with_insid/$i/$j
+        cp data_with_lineid test_with_insid/$i/$j
+    done
+done
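
The awk one-liner above prepends a line number (insid) and an `item_<n>` content field to every training line, producing the two extra columns that bq_reader_train_insid.py parses. For example (values illustrative, `<TAB>` marks the tab separator):

```
input  (train/train.txt):  0.1,0.2<TAB>0.3,0.4<TAB>0.5,0.6
output (data_with_lineid): 1<TAB>item_1<TAB>0.1,0.2<TAB>0.3,0.4<TAB>0.5,0.6
```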

+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/preprocess.py


File diff suppressed because it is too large
+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/test/test.txt


+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/train/train.txt


+ 93 - 0
recommend-model-produce/src/main/python/models/dssmdemo/dygraph_model.py

@@ -0,0 +1,93 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import math
+import net
+
+
+class DygraphModel():
+    # define model
+    def create_model(self, config):
+        trigram_d = config.get('hyper_parameters.trigram_d', None)
+        neg_num = config.get('hyper_parameters.neg_num', None)
+        slice_end = config.get('hyper_parameters.slice_end', None)
+        fc_sizes = config.get('hyper_parameters.fc_sizes', None)
+        fc_acts = config.get('hyper_parameters.fc_acts', None)
+
+        DSSM_model = net.DSSMLayer(trigram_d, neg_num, slice_end, fc_sizes,
+                                   fc_acts)
+        return DSSM_model
+
+    # define feeds, converting the numpy batch data to paddle tensors
+    def create_feeds_train(self, batch_data, trigram_d):
+        query = paddle.to_tensor(batch_data[0].numpy().astype('float32')
+                                 .reshape(-1, trigram_d))
+        doc_pos = paddle.to_tensor(batch_data[1].numpy().astype('float32')
+                                   .reshape(-1, trigram_d))
+        doc_negs = []
+        for ele in batch_data[2:]:
+            doc_negs.append(
+                paddle.to_tensor(ele.numpy().astype('float32').reshape(
+                    -1, trigram_d)))
+        return [query, doc_pos] + doc_negs
+
+    def create_feeds_infer(self, batch_data, trigram_d):
+        query = paddle.to_tensor(batch_data[0].numpy().astype('float32')
+                                 .reshape(-1, trigram_d))
+        doc_pos = paddle.to_tensor(batch_data[1].numpy().astype('float32')
+                                   .reshape(-1, trigram_d))
+        return [query, doc_pos]
+
+    # define loss function by predicts and label
+    def create_loss(self, hit_prob):
+        loss = -paddle.sum(paddle.log(hit_prob), axis=-1)
+        avg_cost = paddle.mean(x=loss)
+        return avg_cost
+
+    # define optimizer 
+    def create_optimizer(self, dy_model, config):
+        lr = config.get("hyper_parameters.optimizer.learning_rate", 0.001)
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=lr, parameters=dy_model.parameters())
+        return optimizer
+
+    # define metrics such as auc/acc
+    # multi-task models need to define multiple metrics
+    def create_metrics(self):
+        metrics_list_name = []
+        metrics_list = []
+        return metrics_list, metrics_list_name
+
+    # construct train forward phase  
+    def train_forward(self, dy_model, metrics_list, batch_data, config):
+        trigram_d = config.get('hyper_parameters.trigram_d', None)
+        inputs = self.create_feeds_train(batch_data, trigram_d)
+
+        R_Q_D_p, hit_prob = dy_model.forward(inputs, False)
+        loss = self.create_loss(hit_prob)
+        # update metrics
+        print_dict = {"loss": loss}
+        return loss, metrics_list, print_dict
+
+    def infer_forward(self, dy_model, metrics_list, batch_data, config):
+        trigram_d = config.get('hyper_parameters.trigram_d', None)
+        inputs = self.create_feeds_infer(batch_data, trigram_d)
+
+        R_Q_D_p, hit_prob = dy_model.forward(inputs, True)
+        # update metrics
+        print_dict = {"query_doc_sim": R_Q_D_p}
+        return metrics_list, print_dict
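
The loss above is the negative log of the softmax probability assigned to the positive doc. A tiny numeric sketch (the similarity values are illustrative):

```python
# Sketch of create_loss: softmax over [R(Q,D+), R(Q,D-)], then -log of the positive slot.
import paddle
import paddle.nn.functional as F

sims = paddle.to_tensor([[0.9, 0.2]])  # cosine similarities: positive doc, one negative doc
prob = F.softmax(sims, axis=1)         # ~[[0.668, 0.332]]
hit_prob = prob[:, 0:1]                # probability assigned to the positive doc
loss = paddle.mean(-paddle.sum(paddle.log(hit_prob), axis=-1))
print(float(loss))                     # ~0.403
```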

+ 101 - 0
recommend-model-produce/src/main/python/models/dssmdemo/net.py

@@ -0,0 +1,101 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+
+class DSSMLayer(nn.Layer):
+    def __init__(self, trigram_d, neg_num, slice_end, hidden_layers,
+                 hidden_acts):
+        super(DSSMLayer, self).__init__()
+
+        self.hidden_layers = [trigram_d] + hidden_layers
+        self.hidden_acts = hidden_acts
+        self.slice_end = slice_end
+
+        self._query_layers = []
+        for i in range(len(self.hidden_layers) - 1):
+            linear = paddle.nn.Linear(
+                in_features=self.hidden_layers[i],
+                out_features=self.hidden_layers[i + 1],
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])),
+                bias_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])))
+            self.add_sublayer('query_linear_%d' % i, linear)
+            self._query_layers.append(linear)
+            if self.hidden_acts[i] == "relu":
+                act = paddle.nn.ReLU()
+                self.add_sublayer('query_act_%d' % i, act)
+                self._query_layers.append(act)
+
+        self._doc_layers = []
+        for i in range(len(self.hidden_layers) - 1):
+            linear = paddle.nn.Linear(
+                in_features=self.hidden_layers[i],
+                out_features=self.hidden_layers[i + 1],
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])),
+                bias_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])))
+            self.add_sublayer('pos_linear_%d' % i, linear)
+            self._doc_layers.append(linear)
+            if self.hidden_acts[i] == "relu":
+                act = paddle.nn.ReLU()
+                self.add_sublayer('pos_act_%d' % i, act)
+                self._doc_layers.append(act)
+
+    def forward(self, input_data, is_infer):
+        query_fc = input_data[0]
+        for n_layer in self._query_layers:
+            query_fc = n_layer(query_fc)
+        self.query_fc = query_fc
+
+        doc_pos_fc = input_data[1]
+        for n_layer in self._doc_layers:
+            doc_pos_fc = n_layer(doc_pos_fc)
+        self.doc_pos_fc = doc_pos_fc
+
+        self.params = [self._query_layers[-2].bias]
+
+        R_Q_D_p = F.cosine_similarity(
+            query_fc, doc_pos_fc, axis=1).reshape([-1, 1])
+
+        if is_infer:
+            return R_Q_D_p, paddle.ones(shape=[self.slice_end, 1])
+
+        R_Q_D_ns = []
+        for i in range(len(input_data) - 2):
+            doc_neg_fc_i = input_data[i + 2]
+            for n_layer in self._doc_layers:
+                doc_neg_fc_i = n_layer(doc_neg_fc_i)
+            R_Q_D_n = F.cosine_similarity(
+                query_fc, doc_neg_fc_i, axis=1).reshape([-1, 1])
+            R_Q_D_ns.append(R_Q_D_n)
+        concat_Rs = paddle.concat(x=[R_Q_D_p] + R_Q_D_ns, axis=1)
+        prob = F.softmax(concat_Rs, axis=1)
+        hit_prob = paddle.slice(
+            prob, axes=[0, 1], starts=[0, 0], ends=[self.slice_end, 1])
+        return R_Q_D_p, hit_prob
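
A minimal smoke test for the layer above, with shapes taken from config.yaml (trigram_d=2900, neg_num=1, batch size 8 so slice_end=8); the random inputs are illustrative:

```python
# Sketch: run DSSMLayer forward on random data.
import paddle
from net import DSSMLayer

model = DSSMLayer(trigram_d=2900, neg_num=1, slice_end=8,
                  hidden_layers=[300, 300, 128], hidden_acts=['relu', 'relu', 'relu'])
query, pos_doc, neg_doc = (paddle.rand([8, 2900]) for _ in range(3))
R_Q_D_p, hit_prob = model.forward([query, pos_doc, neg_doc], is_infer=False)
print(R_Q_D_p.shape, hit_prob.shape)  # [8, 1] [8, 1]
```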

+ 140 - 0
recommend-model-produce/src/main/python/models/dssmdemo/readme.md

@@ -0,0 +1,140 @@
+# DSSM Text Matching Model
+
+**[Run online on AI Studio](https://aistudio.baidu.com/aistudio/projectdetail/3238124)**
+
+The directory structure of this example is as follows:
+
+```
+├── data # sample data
+    ├── train
+        ├── train.txt # training data sample
+    ├── test
+        ├── test.txt # test data sample
+    ├── preprocess.py # data preprocessing script
+    ├── data_process # one-click data processing script
+├── __init__.py
+├── README.md # this document
+├── config.yaml # configuration for the sample data
+├── config_bigdata.yaml # configuration for the full dataset
+├── net.py # core network definition (shared by dynamic and static graphs)
+├── static_model.py # builds the static graph
+├── dygraph_model.py # builds the dynamic graph
+├── transform.py # reshapes the output into a format suitable for metric computation
+├── run.sh # full-dataset script, from training through inference and metric computation
+├── bq_reader_train.py # data reader used during training
+├── bq_reader_infer.py # data reader used during inference
+```
+
+Note: before reading this example, we recommend you first review the following:
+
+[PaddleRec beginner tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
+[readthedocs documentation](https://paddlerec.readthedocs.io/en/latest/models/match/dssm.html)
+
+
+## Contents
+
+- [Model overview](#model-overview)
+- [Data preparation](#data-preparation)
+- [Runtime environment](#runtime-environment)
+- [Quick start](#quick-start)
+- [Network architecture](#network-architecture)
+- [Reproducing the results](#reproducing-the-results)
+- [Advanced usage](#advanced-usage)
+- [FAQ](#faq)
+
+## Model overview
+DSSM stands for Deep Structured Semantic Model, a semantic model built on deep networks. Its core idea is to map queries and docs into a shared semantic space and maximize the cosine similarity between their semantic vectors, thereby learning a latent semantic model for retrieval. DSSM is widely applied in search engine retrieval, ad relevance, question answering, machine translation, and more.
+
+## Data preparation
+BQ is a Chinese question-matching dataset for intelligent customer service, drawn from QA-system corpora. It contains 120,000 sentence pairs annotated with similarity labels. The data contains typos, non-standard grammar, and similar noise, which makes it closer to real industrial scenarios.
+A sample of the raw dataset:
+```
+请问一天是否都是限定只能转入或转出都是五万。    微众多少可以赎回短期理财        0
+微粒咨询电话号码多少    你们的人工客服电话是多少        1
+已经在银行换了新预留号码。      我现在换了电话号码,这个需要更换吗      1
+```
+Fields are tab-separated: columns 1 and 2 are the two texts, and column 3 is the label (0 means the texts are dissimilar, 1 means they are similar).
+
+## Runtime environment
+PaddlePaddle >= 2.0
+
+Python 2.7/3.5/3.6/3.7
+
+OS: Windows/Linux/macOS
+
+## Quick start
+Sample data is provided so you can try the model quickly, and the commands can be run from any directory. Quick-start commands in the dssm model directory:
+```bash
+# enter the model directory
+# cd models/match/dssm # can be run from any directory
+# dynamic-graph training
+python -u ../../../tools/trainer.py -m config.yaml # use config_bigdata.yaml for the full dataset
+# dynamic-graph inference
+python -u ../../../tools/infer.py -m config.yaml
+
+# static-graph training
+python -u ../../../tools/static_trainer.py -m config.yaml # use config_bigdata.yaml for the full dataset
+# static-graph inference
+python -u ../../../tools/static_infer.py -m config.yaml
+```
+
+## Network architecture
+DSSM takes BOW (bag of words) input, which discards word-position information: all words of a sentence go into one bag. Each sentence is converted to a vector this way and fed into a DNN.
+The semantic similarity between Query and Doc is expressed as the cosine distance between their two vectors, and a softmax then selects the Doc most semantically similar to the Query.
+
+For the details of the model, see the paper [DSSM](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf):
+<p align="center">
+<img align="center" src="../../../doc/imgs/dssm.png">
+</p>
+
+## Reproducing the results
+To let users quickly run every model, sample data is provided under each model directory. To reproduce the results in this readme, follow the steps below.
+Metrics of the model on the full dataset:
+
+| Model | Positive order rate | batch_size | epoch_num | Time per epoch |
+| :------| :------ | :------ | :------| :------ |
+| DSSM | 0.93 | 128 | 1 | ~9 minutes |
+
+1. Make sure your current directory is PaddleRec/models/match/dssm
+2. Go to the paddlerec/datasets/BQ_dssm directory and run the script below; it downloads the preprocessed full BQ dataset from a mirror server and unpacks it into the target folder.
+``` bash
+cd ../../../datasets/BQ_dssm
+sh run.sh
+```
+3. Switch back to the model directory and run `bash run.sh` to reproduce the reported results.
+The script automatically trains and tests the model, saves the test output to result.txt, runs transform.py to reshape the format, and finally runs ../../../tools/cal_pos_neg.py to compute the positive order rate.
+```bash
+cd - # switch back to the model directory
+bash run.sh # dynamic-graph training and testing, producing the final metric
+```
+
+## Advanced usage
+As a vector-recall method in recommender systems, DSSM usually precomputes the doc-side vectors and loads them into a vector search engine (e.g. Milvus), while only the query-side model is saved. At serving time, the query-side input is encoded into a vector, and the matching docs are recalled directly from the vector search engine.
+To dump all doc-side vectors from an inference stage added to training, make the following changes:
+1. To distinguish the dumped vectors, the inference data needs two extra fields, insid and content, where insid uniquely identifies a sample and content identifies the corresponding doc. Parse these two fields in the data-processing script; see bq_reader_train_insid.py.
+2. Choose InmemoryDataset as the dataset, and set
+```python
+dataset.set_parse_ins_id(True)
+dataset.set_parse_content(True)
+```
+3. In static_model.py, configure the variables to dump (the top-layer doc-side output)
+```python
+self.infer_dump_fields = [dssm_model.doc_pos_fc]
+```
+4. In the config file, enable dumping during inference and configure the dump path
+```bash
+need_infer_dump: True
+infer_dump_fields_dir: "./infer_dump_data"
+```
+When saving the model, only the query-side network needs to be saved:
+1. In the config file, enable the network-pruning switch
+```bash
+need_prune: True
+```
+2. In static_model.py, configure the inputs and output of the pruned network
+```python
+self.prune_feed_vars = [query]
+self.prune_target_var = dssm_model.query_fc
+```
+
+## FAQ

+ 19 - 0
recommend-model-produce/src/main/python/models/dssmdemo/run.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+echo "................run................."
+python -u ../../../tools/trainer.py -m config_bigdata.yaml
+python -u ../../../tools/infer.py -m config_bigdata.yaml &> result.txt
+python transform.py
+python ../../../tools/cal_pos_neg.py pair.txt

+ 85 - 0
recommend-model-produce/src/main/python/models/dssmdemo/static_model.py

@@ -0,0 +1,85 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+from net import DSSMLayer
+
+
+class StaticModel():
+    def __init__(self, config):
+        self.cost = None
+        self.config = config
+        self._init_hyper_parameters()
+
+    def _init_hyper_parameters(self):
+        self.trigram_d = self.config.get("hyper_parameters.trigram_d")
+        self.neg_num = self.config.get("hyper_parameters.neg_num")
+        self.hidden_layers = self.config.get("hyper_parameters.fc_sizes")
+        self.hidden_acts = self.config.get("hyper_parameters.fc_acts")
+        self.learning_rate = self.config.get("hyper_parameters.learning_rate")
+        self.slice_end = self.config.get("hyper_parameters.slice_end")
+        self.learning_rate = self.config.get(
+            "hyper_parameters.optimizer.learning_rate")
+
+    def create_feeds(self, is_infer=False):
+        query = paddle.static.data(
+            name="query", shape=[-1, self.trigram_d], dtype='float32')
+        self.prune_feed_vars = [query]
+
+        doc_pos = paddle.static.data(
+            name="doc_pos", shape=[-1, self.trigram_d], dtype='float32')
+
+        if is_infer:
+            return [query, doc_pos]
+
+        doc_negs = [
+            paddle.static.data(
+                name="doc_neg_" + str(i),
+                shape=[-1, self.trigram_d],
+                dtype="float32") for i in range(self.neg_num)
+        ]
+        feeds_list = [query, doc_pos] + doc_negs
+        return feeds_list
+
+    def net(self, input, is_infer=False):
+        dssm_model = DSSMLayer(self.trigram_d, self.neg_num, self.slice_end,
+                               self.hidden_layers, self.hidden_acts)
+        R_Q_D_p, hit_prob = dssm_model.forward(input, is_infer)
+
+        self.inference_target_var = R_Q_D_p
+        self.prune_target_var = dssm_model.query_fc
+        self.train_dump_fields = [dssm_model.query_fc, R_Q_D_p]
+        self.train_dump_params = dssm_model.params
+        self.infer_dump_fields = [dssm_model.doc_pos_fc]
+        if is_infer:
+            fetch_dict = {'query_doc_sim': R_Q_D_p}
+            return fetch_dict
+        loss = -paddle.sum(paddle.log(hit_prob), axis=-1)
+        avg_cost = paddle.mean(x=loss)
+        # print(avg_cost)
+        self._cost = avg_cost
+        fetch_dict = {'Loss': avg_cost}
+        return fetch_dict
+
+    def create_optimizer(self, strategy=None):
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=self.learning_rate, lazy_mode=True)
+        if strategy != None:
+            import paddle.distributed.fleet as fleet
+            optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(self._cost)
+
+    def infer_net(self, input):
+        return self.net(input, is_infer=True)

+ 78 - 0
recommend-model-produce/src/main/python/models/dssmdemo/transform.py

@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import numpy as np
+import sklearn.metrics
+
+filename = './result.txt'
+f = open(filename, "r")
+lines = f.readlines()
+f.close()
+result = []
+for line in lines:
+    if "query_doc_sim" in str(line):
+        result.append(line)
+result = result[:-1]
+f = open(filename, "w")
+for i in range(len(result)):
+    f.write(str(result[i]))
+f.close()
+
+label = []
+filename = '../../../datasets/BQ_dssm/label.txt'
+f = open(filename, "r")
+#f.readline()
+num = 0
+for line in f.readlines():
+    num = num + 1
+    line = line.strip()
+    label.append(line)
+f.close()
+print(num)
+
+filename = './result.txt'
+sim = []
+for line in open(filename):
+    line = line.strip().split(",")
+    line[3] = line[3].split(":")
+    line = line[3][1].strip(" ")
+    line = line.strip("[")
+    line = line.strip("]")
+    sim.append(float(line))
+
+filename = '../../../datasets/BQ_dssm/big_test/test.txt'
+f = open(filename, "r")
+#f.readline()
+query = []
+for line in f.readlines():
+    line = line.strip().split("\t")
+    query.append(line[0])
+f.close()
+
+
+def takeFirst(x):
+    return x[0]
+
+
+filename = 'pair.txt'
+line = []
+print(len(query), len(sim), len(label))
+for i in range(len(sim)):
+    line.append([str(query[i]), str(sim[i]), str(label[i])])
+line.sort(key=takeFirst)
+f = open(filename, "w")
+for i in line:
+    f.write(i[0] + "\t" + i[1] + "\t" + i[2] + "\n")
+f.close()

+ 8 - 22
recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

@@ -24,7 +24,7 @@ import time
 import sys
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.utils import HDFSClient
+from hdfs import InsecureClient
 import paddle
 import paddle.distributed as dist
 from paddle.io import Dataset, DataLoader
@@ -47,25 +47,21 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-hadoop_home = "/home/client/hadoop-client/hadoop/"
-configs = {
-    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-    "hadoop.job.ugi": "hello,hello123"
-}
-
+client = InsecureClient('http://192.168.141.208:50070', user='root')
 
 class HDFSDataset(Dataset):
     def __init__(self, hdfs_path, batch_size=32):
         self.hdfs_path = hdfs_path
         self.batch_size = batch_size
-        self.dir_names, self.file_names = client.list_dirs(hdfs_path)
-        client = HDFSClient(hadoop_home, configs)
+        self.file_names = [name for name, st in client.list(hdfs_path, status=True) if st['type'] == 'FILE']
+
 
     def __getitem__(self, idx):
        # logic for reading a single sample
         file_name = self.file_names[idx]
-        data = reader.read()
-        return data
+        with client.read(self.hdfs_path + '/' + file_name) as reader:
+            data = reader.read()
+            return data
 
     def __len__(self):
         return len(self.file_names)
@@ -276,18 +272,8 @@ class Main(object):
         ]
         fetch_vars = [var for _, var in self.metrics.items()]
         print_step = int(config.get("runner.print_interval"))
-
-        debug = config.get("runner.dataset_debug", False)
-        if config.get("runner.need_dump"):
-            debug = True
-            dump_fields_path = "{}/{}".format(
-                config.get("runner.dump_fields_path"), epoch)
-            set_dump_config(paddle.static.default_main_program(), {
-                "dump_fields_path": dump_fields_path,
-                "dump_fields": config.get("runner.dump_fields")
-            })
         print(paddle.static.default_main_program()._fleet_opt)
-        dataset = HDFSDataset(hdfs_path='hdfs://namenode:port/path/to/data')
+        dataset = HDFSDataset(hdfs_path='/path/to/data')
        # create the distributed sampler
         sampler = DistributedSampler(dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
         loader = DataLoader(dataset, batch_size=32, sampler=sampler)
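
For reference, a minimal sketch of the HdfsCLI (`hdfs` package) calls this dataset now relies on; the WebHDFS endpoint, user, and paths are placeholders:

```python
# Sketch: list files with status info via WebHDFS, then stream one file's bytes.
from hdfs import InsecureClient

client = InsecureClient('http://namenode:50070', user='root')  # placeholder endpoint
entries = client.list('/path/to/data', status=True)            # [(name, status_dict), ...]
file_names = [name for name, st in entries if st['type'] == 'FILE']
with client.read('/path/to/data/' + file_names[0]) as reader:
    data = reader.read()
print(len(file_names), len(data))
```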

Some files were not shown because the diff is too large