@@ -24,10 +24,7 @@ import time
 import sys
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
-from hdfs import InsecureClient
 import paddle
-import paddle.distributed as dist
-from paddle.io import Dataset, DataLoader

 import warnings
 import logging
@@ -36,7 +33,6 @@ import numpy as np
 import struct
 from utils.utils_single import auc

-
 __dir__ = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))

@@ -47,25 +43,6 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)

-client = InsecureClient('http://192.168.141.208:50070', user='root')
-
-class HDFSDataset(Dataset):
-    def __init__(self, hdfs_path, batch_size=32):
-        self.hdfs_path = hdfs_path
-        self.batch_size = batch_size
-        self.file_names = [f for f in self.client.list(hdfs_path) if f.type == 'FILE']
-
-
-    def __getitem__(self, idx):
-        # logic for reading a single sample
-        file_name = self.file_names[idx]
-        with client.read('/path/to/file.txt') as reader:
-            data = reader.read()
-        return data
-
-    def __len__(self):
-        return len(self.file_names)
-

 def parse_args():
     parser = argparse.ArgumentParser("PaddleRec train script")
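For reference, the deleted `HDFSDataset` was never functional as written: `__init__` referenced `self.client` without ever assigning it, the `hdfs` package's `client.list()` returns plain file names rather than objects with a `.type` attribute, and `__getitem__` read a hard-coded path instead of the file selected by `idx`. A corrected sketch, assuming the `hdfs` package's documented `InsecureClient` API (the host URL and paths below are placeholders):

```python
from hdfs import InsecureClient
from paddle.io import Dataset

class HDFSDataset(Dataset):
    """Hypothetical fixed version of the removed helper, for reference only."""

    def __init__(self, client, hdfs_path):
        # Keep the client on the instance; the removed code used self.client
        # without ever assigning it.
        self.client = client
        self.hdfs_path = hdfs_path
        # list(..., status=True) yields (name, status) pairs; keep files only.
        self.file_names = [
            name for name, status in client.list(hdfs_path, status=True)
            if status['type'] == 'FILE'
        ]

    def __getitem__(self, idx):
        # Read the file selected by idx, not a hard-coded path.
        path = '{}/{}'.format(self.hdfs_path, self.file_names[idx])
        with self.client.read(path) as reader:
            return reader.read()

    def __len__(self):
        return len(self.file_names)

# client = InsecureClient('http://namenode:50070', user='root')
# dataset = HDFSDataset(client, '/path/to/data')
```

Dropping the class rather than fixing it is reasonable here, since this trainer feeds data through fleet datasets instead of `paddle.io.DataLoader`.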
@@ -123,9 +100,9 @@ class Main(object):
         self.pure_bf16 = self.config['pure_bf16']

     def run(self):
-        logger.info("Begin 11111111")
+        logger.info("Begin 11111111")
         self.init_fleet_with_gloo()
-        logger.info("Begin 22222222")
+        logger.info("Begin 22222222")
         self.network()
         if fleet.is_server():
             self.run_server()
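The `run()` dispatch above follows the standard fleet parameter-server flow. As a reminder of what typically sits behind `run_server()` and `run_worker()`, a minimal sketch of the stock fleet API (not code from this PR):

```python
import paddle.distributed.fleet as fleet

# Assumes fleet.init(role) has already run, as in init_fleet_with_gloo().
if fleet.is_server():
    fleet.init_server()   # initialize (or load) parameters on this server
    fleet.run_server()    # block and serve pull/push requests from workers
elif fleet.is_worker():
    fleet.init_worker()   # connect this worker to the servers
    # ... per-epoch training, e.g. exe.train_from_dataset(...) ...
    fleet.stop_worker()   # signal the servers that this worker is done
```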
@@ -138,12 +115,12 @@ class Main(object):
     def init_fleet_with_gloo(use_gloo=True):
         if use_gloo:
             os.environ["PADDLE_WITH_GLOO"] = "0"
-            logger.info("Begin 11111111222222")
+            logger.info("Begin 11111111222222")
             role = role_maker.PaddleCloudRoleMaker(
                 is_collective=False,
                 init_gloo=False
             )
-            logger.info("Begin 11111111333333")
+            logger.info("Begin 11111111333333")
             fleet.init(role)
             #logger.info("worker_index: %s", fleet.worker_index())
             #logger.info("is_first_worker: %s", fleet.is_first_worker())
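One thing worth flagging in this hunk: the `use_gloo=True` branch sets `PADDLE_WITH_GLOO` to `"0"` and passes `init_gloo=False`, which appears to disable Gloo rather than enable it. If cross-worker synchronization (e.g. `fleet.barrier_worker()`) is needed later, this likely needs `PADDLE_WITH_GLOO="1"` and `init_gloo=True` instead.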
@@ -203,10 +180,19 @@ class Main(object):
         else:
             opt_info['stat_var_names'] = []

+        if reader_type == "InmemoryDataset":
+            self.reader.load_into_memory()

         for epoch in range(epochs):
             epoch_start_time = time.time()
-            self.dataset_train_loop(epoch)
+
+            if sync_mode == "heter":
+                self.heter_train_loop(epoch)
+            elif reader_type == "QueueDataset":
+                self.dataset_train_loop(epoch)
+            elif reader_type == "InmemoryDataset":
+                self.dataset_train_loop(epoch)
+
             epoch_time = time.time() - epoch_start_time
             epoch_speed = self.example_nums / epoch_time
             if use_auc is True:
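The `load_into_memory()` call added before the epoch loop, together with the matching `release_memory()` in the next hunk, follows the usual `InMemoryDataset` lifecycle: load every shard once, train several epochs against the cached samples, then free them. A self-contained sketch of that lifecycle; the feed variables, pipe command, and file list below are placeholders, not values from this PR:

```python
import paddle

paddle.enable_static()

# Hypothetical feed layout; a real model takes these from its network definition.
slot = paddle.static.data(name="slot", shape=[None, 1], dtype="int64", lod_level=1)
label = paddle.static.data(name="label", shape=[None, 1], dtype="int64")

dataset = paddle.distributed.InMemoryDataset()
dataset.init(
    use_var=[slot, label],
    pipe_command="cat",             # placeholder per-line sample parser
    batch_size=32,
    thread_num=4)
dataset.set_filelist(["part-000"])  # placeholder shard list

dataset.load_into_memory()   # read all shards into memory once, up front
# ... exe.train_from_dataset(...) once per epoch ...
dataset.release_memory()     # drop the cached samples after the last epoch
```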
@@ -246,6 +232,9 @@ class Main(object):
                 [feed.name for feed in self.inference_feed_var],
                 [self.inference_target_var], self.exe)

+        if reader_type == "InmemoryDataset":
+            self.reader.release_memory()
+
     def init_reader(self):
         if fleet.is_server():
             return
@@ -266,17 +255,27 @@ class Main(object):

     def dataset_train_loop(self, epoch):
         logger.info("Epoch: {}, Running Dataset Begin.".format(epoch))
+
         fetch_info = [
             "Epoch {} Var {}".format(epoch, var_name)
             for var_name in self.metrics
         ]
+
         fetch_vars = [var for _, var in self.metrics.items()]
+
         print_step = int(config.get("runner.print_interval"))
-        print(paddle.static.default_main_program()._fleet_opt)
-        dataset = HDFSDataset(hdfs_path='/path/to/data')
-        # create the distributed sampler
-        sampler = DistributedSampler(dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
-        loader = DataLoader(dataset, batch_size=32, sampler=sampler)
+
+        debug = config.get("runner.dataset_debug", False)
+        if config.get("runner.need_dump"):
+            debug = True
+            dump_fields_path = "{}/{}".format(
+                config.get("runner.dump_fields_path"), epoch)
+            set_dump_config(paddle.static.default_main_program(), {
+                "dump_fields_path": dump_fields_path,
+                "dump_fields": config.get("runner.dump_fields")
+            })
+        logger.info(paddle.static.default_main_program()._fleet_opt)
+
         self.exe.train_from_dataset(
             program=paddle.static.default_main_program(),
             dataset=self.reader,
@@ -284,6 +283,7 @@ class Main(object):
             fetch_info=fetch_info,
             print_period=print_step,
             debug=debug)
+

     def heter_train_loop(self, epoch):
         logger.info(
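`set_dump_config` is not defined or imported anywhere in this diff, so presumably it comes from the PaddleRec static-PS utilities. Its assumed job, sketched hypothetically below, is to stash the dump settings on the program's `_fleet_opt` dict (the same dict the `logger.info` line above prints) so the runtime dumps those fields during `train_from_dataset`:

```python
def set_dump_config(program, dump_config):
    # Hypothetical sketch of the assumed helper; the real implementation
    # lives outside this diff and may differ.
    if program._fleet_opt is None:
        program._fleet_opt = {}
    for key in ("dump_fields_path", "dump_fields", "dump_param"):
        if dump_config.get(key) is not None:
            program._fleet_opt[key] = dump_config[key]
```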