
read hdfs

丁云鹏 5 months ago
parent commit 23c96c1024
20 changed files with 33 additions and 983 deletions
  1. +0 −13 recommend-model-produce/src/main/python/models/dssmdemo/__init__.py
  2. +0 −41 recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_infer.py
  3. +0 −48 recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train.py
  4. +0 −82 recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train_insid.py
  5. +0 −41 recommend-model-produce/src/main/python/models/dssmdemo/config.yaml
  6. +0 −42 recommend-model-produce/src/main/python/models/dssmdemo/config_bigdata.yaml
  7. +0 −70 recommend-model-produce/src/main/python/models/dssmdemo/config_online.yaml
  8. +0 −46 recommend-model-produce/src/main/python/models/dssmdemo/config_ps.yaml
  9. +0 −24 recommend-model-produce/src/main/python/models/dssmdemo/data/data_process.sh
  10. +0 −27 recommend-model-produce/src/main/python/models/dssmdemo/data/prepare_dump_data.sh
  11. +0 −0 recommend-model-produce/src/main/python/models/dssmdemo/data/preprocess.py
  12. +0 −0 recommend-model-produce/src/main/python/models/dssmdemo/data/test/test.txt
  13. +0 −0 recommend-model-produce/src/main/python/models/dssmdemo/data/train/train.txt
  14. +0 −93 recommend-model-produce/src/main/python/models/dssmdemo/dygraph_model.py
  15. +0 −101 recommend-model-produce/src/main/python/models/dssmdemo/net.py
  16. +0 −140 recommend-model-produce/src/main/python/models/dssmdemo/readme.md
  17. +0 −19 recommend-model-produce/src/main/python/models/dssmdemo/run.sh
  18. +0 −85 recommend-model-produce/src/main/python/models/dssmdemo/static_model.py
  19. +0 −78 recommend-model-produce/src/main/python/models/dssmdemo/transform.py
  20. +33 −33 recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

+ 0 - 13
recommend-model-produce/src/main/python/models/dssmdemo/__init__.py

@@ -1,13 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.

+ 0 - 41
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_infer.py

@@ -1,41 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-
-from paddle.io import IterableDataset
-
-
-class RecDataset(IterableDataset):
-    def __init__(self, file_list, config):
-        super(RecDataset, self).__init__()
-        self.file_list = file_list
-
-    def __iter__(self):
-        full_lines = []
-        for file in self.file_list:
-            with open(file, "r") as rf:
-                for line in rf:
-                    output_list = []
-                    features = line.rstrip('\n').split('\t')
-                    query = [
-                        float(feature) for feature in features[0].split(',')
-                    ]
-                    output_list.append(np.array(query).astype('float32'))
-                    pos_doc = [
-                        float(feature) for feature in features[1].split(',')
-                    ]
-                    output_list.append(np.array(pos_doc).astype('float32'))
-                    yield output_list

+ 0 - 48
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train.py

@@ -1,48 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-
-from paddle.io import IterableDataset
-
-
-class RecDataset(IterableDataset):
-    def __init__(self, file_list, config):
-        super(RecDataset, self).__init__()
-        self.file_list = file_list
-
-    def __iter__(self):
-        full_lines = []
-        for file in self.file_list:
-            with open(file, "r") as rf:
-                for line in rf:
-                    output_list = []
-                    features = line.rstrip('\n').split('\t')
-                    query = [
-                        float(feature) for feature in features[0].split(',')
-                    ]
-                    output_list.append(np.array(query).astype('float32'))
-                    pos_doc = [
-                        float(feature) for feature in features[1].split(',')
-                    ]
-                    output_list.append(np.array(pos_doc).astype('float32'))
-
-                    for i in range(len(features) - 2):
-                        output_list.append(
-                            np.array([
-                                float(feature)
-                                for feature in features[i + 2].split(',')
-                            ]).astype('float32'))
-                    yield output_list
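For context on what these readers parse: each line of the train data is tab-separated, with field 0 the comma-separated query vector, field 1 the positive-doc vector, and every remaining field a negative-doc vector. A runnable sketch with synthetic values (the real vectors are trigram_d-dimensional):

```python
import numpy as np

# Synthetic train.txt line: query \t pos_doc \t neg_doc, each a
# comma-separated float vector (real data uses trigram_d dimensions).
line = "0.1,0.2,0.3\t0.4,0.5,0.6\t0.7,0.8,0.9"

fields = line.rstrip("\n").split("\t")
sample = [np.array([float(v) for v in f.split(",")], dtype="float32")
          for f in fields]
# sample[0] is the query, sample[1] the positive doc, sample[2:] the
# negatives -- the same order RecDataset yields above.
print([a.shape for a in sample])  # [(3,), (3,), (3,)]
```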

+ 0 - 82
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train_insid.py

@@ -1,82 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import yaml
-import six
-import os
-import copy
-import paddle.distributed.fleet as fleet
-import logging
-
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class Reader(fleet.MultiSlotStringDataGenerator):
-    def init(self, config):
-        self.config = config
-        self.neg_num = self.config.get("hyper_parameters.neg_num")
-
-    def line_process(self, line):
-        data = line.rstrip('\n').split('\t')
-        ins_id = [data[0]]
-        content = [data[1]]
-        features = data[2:]
-        query = features[0].split(',')
-        pos_doc = features[1].split(',')
-
-        neg_doc_list = []
-        for i in range(self.neg_num):
-            neg_doc_list.append(features[i + 2].split(','))
-
-        return [ins_id, content, query, pos_doc] + neg_doc_list
-
-    def generate_sample(self, line):
-        "Dataset Generator"
-
-        def reader():
-            input_data = self.line_process(line)
-            feature_name = ["insid", "content", "query", "pos_doc"]
-            for i in range(self.neg_num):
-                feature_name.append("neg_doc_{}".format(i))
-            yield zip(feature_name, input_data)
-
-        return reader
-
-    def dataloader(self, file_list):
-        "DataLoader Pyreader Generator"
-
-        def reader():
-            for file in file_list:
-                with open(file, 'r') as f:
-                    for line in f:
-                        input_data = self.line_process(line)
-                        yield input_data
-
-        return reader
-
-
-if __name__ == "__main__":
-    yaml_path = sys.argv[1]
-    utils_path = sys.argv[2]
-    sys.path.append(utils_path)
-    import common_ps
-    yaml_helper = common_ps.YamlHelper()
-    config = yaml_helper.load_yaml(yaml_path)
-
-    r = Reader()
-    r.init(config)
-    # r.init(None)
-    r.run_from_stdin()
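The insid variant above expects the same fields prefixed with two extra columns produced by data/prepare_dump_data.sh further down: an instance id and a content tag. A sketch of what line_process returns for one synthetic line (neg_num = 1, as in config.yaml; values kept as strings, matching MultiSlotStringDataGenerator):

```python
# Synthetic input line: ins_id \t content \t query \t pos_doc \t neg_doc
line = "42\titem_42\t0.1,0.2\t0.3,0.4\t0.5,0.6"

data = line.rstrip("\n").split("\t")
ins_id, content = [data[0]], [data[1]]
features = data[2:]
query, pos_doc = features[0].split(","), features[1].split(",")
neg_docs = [features[2].split(",")]  # one negative doc (neg_num = 1)
print([ins_id, content, query, pos_doc] + neg_docs)
```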

+ 0 - 41
recommend-model-produce/src/main/python/models/dssmdemo/config.yaml

@@ -1,41 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-runner:
-  train_data_dir: "data/train"
-  train_reader_path: "bq_reader_train"  # importlib format
-  train_batch_size: 8
-  model_save_path: "output_model_dssm"
-
-  use_gpu: False
-  epochs: 1
-  print_interval: 10
-  
-  test_data_dir: "data/test"
-  infer_reader_path: "bq_reader_infer"  # importlib format
-  infer_batch_size: 1
-  infer_load_path: "output_model_dssm"
-  infer_start_epoch: 0
-  infer_end_epoch: 1
-
-hyper_parameters:
-  optimizer:
-    class: adam
-    learning_rate: 0.001
-    strategy: sync
-  trigram_d: 2900
-  neg_num: 1
-  slice_end: 8
-  fc_sizes: [300, 300, 128]
-  fc_acts: ['relu', 'relu', 'relu']

+ 0 - 42
recommend-model-produce/src/main/python/models/dssmdemo/config_bigdata.yaml

@@ -1,42 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-runner:
-  train_data_dir: "../../../datasets/BQ_dssm/big_train"
-  train_reader_path: "bq_reader_train"  # importlib format
-  train_batch_size: 128
-  model_save_path: "output_model_all_dssm"
-
-  use_gpu: False
-  epochs: 1
-  print_interval: 1
-  
-  test_data_dir: "../../../datasets/BQ_dssm/big_test"
-  infer_reader_path: "bq_reader_infer"  # importlib format
-  infer_batch_size: 1
-  infer_load_path: "output_model_all_dssm"
-  infer_start_epoch: 0
-  infer_end_epoch: 1
-
-hyper_parameters:
-  optimizer:
-    class: adam
-    learning_rate: 0.001
-    strategy: sync
-  trigram_d: 5913
-  neg_num: 1
-  slice_end: 128
-  fc_sizes: [300, 300, 128]
-  fc_acts: ['relu', 'relu', 'relu']

+ 0 - 70
recommend-model-produce/src/main/python/models/dssmdemo/config_online.yaml

@@ -1,70 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-runner:
-  train_data_dir: "data/train_with_insid"
-  test_data_dir: "data/test_with_insid"
-  # train_reader_path: "bq_reader_train"  # importlib format
-  days: "{20210803..20210804}"
-  pass_per_day: 1
-  train_batch_size: 8
-  test_batch_size: 8
-  model_save_path: "output_model_dssm"
-
-  reader_type: "InmemoryDataset"  # DataLoader / QueueDataset / RecDataset / InmemoryDataset
-  pipe_command: "python3 bq_reader_train_insid.py"
-
-  sync_mode: "async"
-  # thread_num: 1
-  train_thread_num: 1
-  test_thread_num: 1
-
-  use_gpu: False
-  epochs: 1
-  print_interval: 1
-
-  dataset_debug: False
-
-  # when you need to prune the net, please set need_prune to True,
-  # and need to set prune_feed_vars and prune_target_var in static_model.py
-  need_prune: True
-
-  parse_ins_id: True
-  parse_content: True
-  
-  # when you need to dump fields and params in training, please set need_train_dump to True,
-  # and need to set train_dump_fields and train_dump_params in static_model.py
-  need_train_dump: True
-  # train_dump_fields_dir: "afs:/xxx"
-  train_dump_fields_dir: "./train_dump_data"
-
-  # when you need to dump fields in inference, please set need_infer_dump to True,
-  # and need to set infer_dump_fields in static_model.py
-  need_infer_dump: True
-  # infer_dump_fields_dir: "afs:/xxx"
-  infer_dump_fields_dir: "./infer_dump_data"
-
-  fs_name: "afs://xxx"
-  fs_ugi: "xxx,xxx"
-  
-hyper_parameters:
-  optimizer:
-    class: adam
-    learning_rate: 0.001
-    strategy: sync
-  trigram_d: 2900
-  neg_num: 1
-  slice_end: 8
-  fc_sizes: [300, 300, 128]
-  fc_acts: ['relu', 'relu', 'relu']

+ 0 - 46
recommend-model-produce/src/main/python/models/dssmdemo/config_ps.yaml

@@ -1,46 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-runner:
-  train_data_dir: "data/train"
-  train_reader_path: "bq_reader_train"  # importlib format
-  train_batch_size: 8
-  model_save_path: "output_model_dssm"
-
-  reader_type: "QueueDataset"  # DataLoader / QueueDataset / RecDataset
-  pipe_command: "python bq_reader_train_ps.py"
-  thread_num: 1
-  sync_mode: "async"
-
-  use_gpu: False
-  epochs: 10
-  print_interval: 10
-  
-  test_data_dir: "data/test"
-  infer_reader_path: "bq_reader_infer"  # importlib format
-  infer_batch_size: 1
-  infer_load_path: "output_model_dssm"
-  infer_start_epoch: 0
-  infer_end_epoch: 1
-
-hyper_parameters:
-  optimizer:
-    class: adam
-    learning_rate: 0.001
-    strategy: sync
-  trigram_d: 2900
-  neg_num: 1
-  slice_end: 8
-  fc_sizes: [300, 300, 128]
-  fc_acts: ['relu', 'relu', 'relu']

+ 0 - 24
recommend-model-produce/src/main/python/models/dssmdemo/data/data_process.sh

@@ -1,24 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#!/bin/bash
-
-wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
-tar xzf dssm%2Fbq.tar.gz
-rm -f dssm%2Fbq.tar.gz
-mv bq/train.txt ./raw_data.txt
-python3 preprocess.py
-mkdir big_train
-mv train.txt ./big_train
-mkdir big_test
-mv test.txt ./big_test

+ 0 - 27
recommend-model-produce/src/main/python/models/dssmdemo/data/prepare_dump_data.sh

@@ -1,27 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#!/bin/bash
-
-
-cat train/train.txt | awk -F'\t' 'BEGIN{OFS="\t"}{print NR, "item_"NR, $0}' > data_with_lineid
-for i in 20210803 20210804
-do
-    for j in 1
-    do
-        mkdir -p train_with_insid/$i/$j
-        cp data_with_lineid train_with_insid/$i/$j
-        mkdir -p test_with_insid/$i/$j
-        cp data_with_lineid test_with_insid/$i/$j
-    done
-done
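The awk pipeline above prepends a line number (the instance id) and an `item_<n>` tag (the content) to every training line, which is exactly the two-column prefix that bq_reader_train_insid.py strips off. A Python sketch of the same transform on one synthetic line:

```python
# Reproduces: awk -F'\t' 'BEGIN{OFS="\t"}{print NR, "item_"NR, $0}'
raw_lines = ["0.1,0.2\t0.3,0.4\t0.5,0.6"]  # synthetic train.txt content
for nr, line in enumerate(raw_lines, start=1):
    print("\t".join([str(nr), "item_%d" % nr, line]))
# -> 1	item_1	0.1,0.2	0.3,0.4	0.5,0.6
```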

+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/preprocess.py


File diff suppressed because it is too large
+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/test/test.txt


+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/train/train.txt


+ 0 - 93
recommend-model-produce/src/main/python/models/dssmdemo/dygraph_model.py

@@ -1,93 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-import math
-import net
-
-
-class DygraphModel():
-    # define model
-    def create_model(self, config):
-        trigram_d = config.get('hyper_parameters.trigram_d', None)
-        neg_num = config.get('hyper_parameters.neg_num', None)
-        slice_end = config.get('hyper_parameters.slice_end', None)
-        fc_sizes = config.get('hyper_parameters.fc_sizes', None)
-        fc_acts = config.get('hyper_parameters.fc_acts', None)
-
-        DSSM_model = net.DSSMLayer(trigram_d, neg_num, slice_end, fc_sizes,
-                                   fc_acts)
-        return DSSM_model
-
-    # define feeds which convert numpy of batch data to paddle.tensor 
-    def create_feeds_train(self, batch_data, trigram_d):
-        query = paddle.to_tensor(batch_data[0].numpy().astype('float32')
-                                 .reshape(-1, trigram_d))
-        doc_pos = paddle.to_tensor(batch_data[1].numpy().astype('float32')
-                                   .reshape(-1, trigram_d))
-        doc_negs = []
-        for ele in batch_data[2:]:
-            doc_negs.append(
-                paddle.to_tensor(ele.numpy().astype('float32').reshape(
-                    -1, trigram_d)))
-        return [query, doc_pos] + doc_negs
-
-    def create_feeds_infer(self, batch_data, trigram_d):
-        query = paddle.to_tensor(batch_data[0].numpy().astype('float32')
-                                 .reshape(-1, trigram_d))
-        doc_pos = paddle.to_tensor(batch_data[1].numpy().astype('float32')
-                                   .reshape(-1, trigram_d))
-        return [query, doc_pos]
-
-    # define loss function by predicts and label
-    def create_loss(self, hit_prob):
-        loss = -paddle.sum(paddle.log(hit_prob), axis=-1)
-        avg_cost = paddle.mean(x=loss)
-        return avg_cost
-
-    # define optimizer 
-    def create_optimizer(self, dy_model, config):
-        lr = config.get("hyper_parameters.optimizer.learning_rate", 0.001)
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=lr, parameters=dy_model.parameters())
-        return optimizer
-
-    # define metrics such as auc/acc
-    # multi-task need to define multi metric
-    def create_metrics(self):
-        metrics_list_name = []
-        metrics_list = []
-        return metrics_list, metrics_list_name
-
-    # construct train forward phase  
-    def train_forward(self, dy_model, metrics_list, batch_data, config):
-        trigram_d = config.get('hyper_parameters.trigram_d', None)
-        inputs = self.create_feeds_train(batch_data, trigram_d)
-
-        R_Q_D_p, hit_prob = dy_model.forward(inputs, False)
-        loss = self.create_loss(hit_prob)
-        # update metrics
-        print_dict = {"loss": loss}
-        return loss, metrics_list, print_dict
-
-    def infer_forward(self, dy_model, metrics_list, batch_data, config):
-        trigram_d = config.get('hyper_parameters.trigram_d', None)
-        inputs = self.create_feeds_infer(batch_data, trigram_d)
-
-        R_Q_D_p, hit_prob = dy_model.forward(inputs, True)
-        # update metrics
-        print_dict = {"query_doc_sim": R_Q_D_p}
-        return metrics_list, print_dict
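The loss assembled in create_loss above is the usual DSSM objective: the negative log-likelihood of the positive doc under a softmax over cosine similarities (this implementation uses no smoothing factor). In formula form, with R(q, d) the cosine similarity produced by the two towers:

```latex
\mathcal{L} = -\frac{1}{N}\sum_{i=1}^{N} \log
  \frac{\exp R(q_i, d_i^{+})}
       {\exp R(q_i, d_i^{+}) + \sum_{j=1}^{n_{\mathrm{neg}}} \exp R(q_i, d_{ij}^{-})}
```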

+ 0 - 101
recommend-model-produce/src/main/python/models/dssmdemo/net.py

@@ -1,101 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-import numpy as np
-
-
-class DSSMLayer(nn.Layer):
-    def __init__(self, trigram_d, neg_num, slice_end, hidden_layers,
-                 hidden_acts):
-        super(DSSMLayer, self).__init__()
-
-        self.hidden_layers = [trigram_d] + hidden_layers
-        self.hidden_acts = hidden_acts
-        self.slice_end = slice_end
-
-        self._query_layers = []
-        for i in range(len(self.hidden_layers) - 1):
-            linear = paddle.nn.Linear(
-                in_features=self.hidden_layers[i],
-                out_features=self.hidden_layers[i + 1],
-                weight_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])),
-                bias_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])))
-            self.add_sublayer('query_linear_%d' % i, linear)
-            self._query_layers.append(linear)
-            if self.hidden_acts[i] == "relu":
-                act = paddle.nn.ReLU()
-                self.add_sublayer('query_act_%d' % i, act)
-                self._query_layers.append(act)
-
-        self._doc_layers = []
-        for i in range(len(self.hidden_layers) - 1):
-            linear = paddle.nn.Linear(
-                in_features=self.hidden_layers[i],
-                out_features=self.hidden_layers[i + 1],
-                weight_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])),
-                bias_attr=paddle.ParamAttr(
-                    initializer=paddle.nn.initializer.XavierNormal(
-                        fan_in=self.hidden_layers[i],
-                        fan_out=self.hidden_layers[i + 1])))
-            self.add_sublayer('pos_linear_%d' % i, linear)
-            self._doc_layers.append(linear)
-            if self.hidden_acts[i] == "relu":
-                act = paddle.nn.ReLU()
-                self.add_sublayer('pos_act_%d' % i, act)
-                self._doc_layers.append(act)
-
-    def forward(self, input_data, is_infer):
-        query_fc = input_data[0]
-        for n_layer in self._query_layers:
-            query_fc = n_layer(query_fc)
-        self.query_fc = query_fc
-
-        doc_pos_fc = input_data[1]
-        for n_layer in self._doc_layers:
-            doc_pos_fc = n_layer(doc_pos_fc)
-        self.doc_pos_fc = doc_pos_fc
-
-        self.params = [self._query_layers[-2].bias]
-
-        R_Q_D_p = F.cosine_similarity(
-            query_fc, doc_pos_fc, axis=1).reshape([-1, 1])
-
-        if is_infer:
-            return R_Q_D_p, paddle.ones(shape=[self.slice_end, 1])
-
-        R_Q_D_ns = []
-        for i in range(len(input_data) - 2):
-            doc_neg_fc_i = input_data[i + 2]
-            for n_layer in self._doc_layers:
-                doc_neg_fc_i = n_layer(doc_neg_fc_i)
-            R_Q_D_n = F.cosine_similarity(
-                query_fc, doc_neg_fc_i, axis=1).reshape([-1, 1])
-            R_Q_D_ns.append(R_Q_D_n)
-        concat_Rs = paddle.concat(x=[R_Q_D_p] + R_Q_D_ns, axis=1)
-        prob = F.softmax(concat_Rs, axis=1)
-        hit_prob = paddle.slice(
-            prob, axes=[0, 1], starts=[0, 0], ends=[self.slice_end, 1])
-        return R_Q_D_p, hit_prob
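A numpy sketch of the scoring path in forward above, with random stand-ins for the tower outputs (batch of 8 rows to match slice_end, one negative doc):

```python
import numpy as np

def cos_sim(a, b):
    # Row-wise cosine similarity, like F.cosine_similarity(axis=1).
    return (a * b).sum(1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))

rng = np.random.default_rng(0)
query = rng.standard_normal((8, 128))  # stand-in for query_fc
pos = rng.standard_normal((8, 128))    # stand-in for doc_pos_fc
neg = rng.standard_normal((8, 128))    # stand-in for one doc_neg tower

scores = np.stack([cos_sim(query, pos), cos_sim(query, neg)], axis=1)
prob = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax
hit_prob = prob[:, :1]  # the [slice_end, 1] slice taken in the layer above
print(hit_prob.shape)   # (8, 1)
```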

+ 0 - 140
recommend-model-produce/src/main/python/models/dssmdemo/readme.md

@@ -1,140 +0,0 @@
-# DSSM text-matching model
-
-**[AI Studio online runtime environment](https://aistudio.baidu.com/aistudio/projectdetail/3238124)**
-
-The directory layout of this example, with brief notes:
-
-```
-├── data # sample data
-    ├── train
-        ├── train.txt # sample training data
-    ├── test
-        ├── test.txt # sample test data
-    ├── preprocess.py # data preprocessing script
-    ├── data_process # one-click data processing script
-├── __init__.py
-├── README.md # this document
-├── config.yaml # config for the sample data
-├── config_bigdata.yaml # config for the full dataset
-├── net.py # core model network (shared by dynamic and static graphs)
-├── static_model.py # builds the static graph
-├── dygraph_model.py # builds the dynamic graph
-├── transform.py # reshapes the output into a format suitable for metric computation
-├── run.sh # full-dataset pipeline script, from training through inference and metrics
-├── bq_reader_train.py # data reader used during training
-├── bq_reader_infer.py # data reader used during inference
-```
-
-Note: before reading this example, we recommend first going through:
-
-[PaddleRec getting-started tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)  
-[readthedocs documentation](https://paddlerec.readthedocs.io/en/latest/models/match/dssm.html)  
-
-
-## Contents
-
-- [Model overview](#model-overview)
-- [Data preparation](#data-preparation)
-- [Runtime environment](#runtime-environment)
-- [Quick start](#quick-start)
-- [Model architecture](#model-architecture)
-- [Reproducing results](#reproducing-results)
-- [Advanced usage](#advanced-usage)
-- [FAQ](#faq)
-
-## Model overview
-DSSM stands for Deep Structured Semantic Model, the deep-network-based semantic model. Its core idea is to map queries and docs into a shared semantic space and maximize the cosine similarity between the query and doc semantic vectors, training a latent semantic model for retrieval. DSSM is widely applicable: search-engine retrieval, ad relevance, question answering, machine translation, and more.
-
-## Data preparation
-BQ is a Chinese question-matching dataset for intelligent customer service, built from an automated-QA corpus. It contains 120,000 sentence pairs annotated with similarity labels. The data has typos and non-standard grammar, which makes it closer to industrial scenarios.
-Sample of the raw dataset:
-```
-请问一天是否都是限定只能转入或转出都是五万。    微众多少可以赎回短期理财        0
-微粒咨询电话号码多少    你们的人工客服电话是多少        1
-已经在银行换了新预留号码。      我现在换了电话号码,这个需要更换吗      1
-```
-Fields are tab-separated: columns 1 and 2 hold the two texts, and column 3 holds the label (0 or 1; 0 means the two texts are dissimilar, 1 means they are similar).
-
-## Runtime environment
-PaddlePaddle>=2.0
-
-python 2.7/3.5/3.6/3.7
-
-os : windows/linux/macos 
-12234
-## Quick start
-This example ships with sample data so you can try it out quickly, and the commands can be executed from any directory. Quick-start commands for the dssm model directory:
-```bash
-# enter the model directory
-# cd models/match/dssm # can be run from any directory
-# dynamic-graph training
-python -u ../../../tools/trainer.py -m config.yaml # use config_bigdata.yaml for the full dataset
-# dynamic-graph inference
-python -u ../../../tools/infer.py -m config.yaml
-
-# static-graph training
-python -u ../../../tools/static_trainer.py -m config.yaml # use config_bigdata.yaml for the full dataset
-# static-graph inference
-python -u ../../../tools/static_infer.py -m config.yaml
-```
-
-## Model architecture
-DSSM takes BOW (bag-of-words) input, which throws away word-position information: all the words in a sentence go into one bag, and the sentence is converted to a single vector fed into the DNN.
-The semantic similarity between a Query and a Doc is expressed as the cosine distance between the two vectors, and a softmax then selects the Doc most semantically similar to the Query.
-
-Model details are in the paper [DSSM](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf):
-<p align="center">
-<img align="center" src="../../../doc/imgs/dssm.png">
-</p>
-
-## Reproducing results
-To let you run every model quickly, we provide sample data under each model directory. To reproduce the metrics in this readme, follow the steps below.
-Metrics on the full dataset:
-
-| Model | Positive-order rate | batch_size | epoch_num | Time of each epoch |
-| :------| :------ | :------ | :------| :------ | 
-| DSSM | 0.93 | 128 | 1 | ~9 minutes |  
-
-1. Make sure your current directory is PaddleRec/models/match/dssm
-2. Go to the paddlerec/datasets/BQ_dssm directory and run the script below; it downloads our preprocessed full BQ dataset from a mirror server in China and unpacks it into the target folder.
-```bash
-cd ../../../datasets/BQ_dssm
-sh run.sh
-```
-3. Switch back to the model directory and simply run bash run.sh to reproduce the paper's results.
-The script trains and then tests the model, saving the test output to result.txt; it then runs transform.py to reshape the output, and finally runs ../../../tools/cal_pos_neg.py to compute the positive-order-rate metric.
-```bash
-cd - # switch back to the model directory
-bash run.sh # dynamic-graph training and testing; prints the final metric
-```
-
-## Advanced usage
-DSSM is a vector-recall method in recommender systems: the doc-side vectors are usually precomputed and loaded into a vector search engine (e.g., milvus), and only the query-side model is saved. Online, the query-side input is encoded into a vector, and the matching docs are recalled directly from the vector search engine.
-To dump the full set of doc-side vectors, add an inference stage to training and make the following changes:
-1. To tell the dumped vectors apart, the inference data needs two extra fields, insid and content: insid uniquely identifies a sample, and content names the corresponding doc. Parse both fields in the data-processing script; see bq_reader_train_insid.py.
-2. Choose InmemoryDataset as the dataset, and set
-```python
-dataset.set_parse_ins_id(True)
-dataset.set_parse_content(True)
-```
-3. In static_model.py, configure the variables to dump (the top of the doc-side tower)
-```python
-self.infer_dump_fields = [dssm_model.doc_pos_fc]
-```
-4. In the config file, enable dumping during inference and set the dump path
-```bash
-need_infer_dump: True
-infer_dump_fields_dir: "./infer_dump_data"
-```
-To save only the query-side network when saving the model:
-1. In the config file, enable network pruning
-```bash
-need_prune: True
-```
-2. In static_model.py, configure the pruned network's inputs and output
-```python
-self.prune_feed_vars = [query]
-self.prune_target_var = dssm_model.query_fc
-```
-
-## FAQ

+ 0 - 19
recommend-model-produce/src/main/python/models/dssmdemo/run.sh

@@ -1,19 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#!/bin/bash
-echo "................run................."
-python -u ../../../tools/trainer.py -m config_bigdata.yaml
-python -u ../../../tools/infer.py -m config_bigdata.yaml &> result.txt
-python transform.py
-python ../../../tools/cal_pos_neg.py pair.txt

+ 0 - 85
recommend-model-produce/src/main/python/models/dssmdemo/static_model.py

@@ -1,85 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import paddle
-from net import DSSMLayer
-
-
-class StaticModel():
-    def __init__(self, config):
-        self.cost = None
-        self.config = config
-        self._init_hyper_parameters()
-
-    def _init_hyper_parameters(self):
-        self.trigram_d = self.config.get("hyper_parameters.trigram_d")
-        self.neg_num = self.config.get("hyper_parameters.neg_num")
-        self.hidden_layers = self.config.get("hyper_parameters.fc_sizes")
-        self.hidden_acts = self.config.get("hyper_parameters.fc_acts")
-        self.learning_rate = self.config.get("hyper_parameters.learning_rate")
-        self.slice_end = self.config.get("hyper_parameters.slice_end")
-        self.learning_rate = self.config.get(
-            "hyper_parameters.optimizer.learning_rate")
-
-    def create_feeds(self, is_infer=False):
-        query = paddle.static.data(
-            name="query", shape=[-1, self.trigram_d], dtype='float32')
-        self.prune_feed_vars = [query]
-
-        doc_pos = paddle.static.data(
-            name="doc_pos", shape=[-1, self.trigram_d], dtype='float32')
-
-        if is_infer:
-            return [query, doc_pos]
-
-        doc_negs = [
-            paddle.static.data(
-                name="doc_neg_" + str(i),
-                shape=[-1, self.trigram_d],
-                dtype="float32") for i in range(self.neg_num)
-        ]
-        feeds_list = [query, doc_pos] + doc_negs
-        return feeds_list
-
-    def net(self, input, is_infer=False):
-        dssm_model = DSSMLayer(self.trigram_d, self.neg_num, self.slice_end,
-                               self.hidden_layers, self.hidden_acts)
-        R_Q_D_p, hit_prob = dssm_model.forward(input, is_infer)
-
-        self.inference_target_var = R_Q_D_p
-        self.prune_target_var = dssm_model.query_fc
-        self.train_dump_fields = [dssm_model.query_fc, R_Q_D_p]
-        self.train_dump_params = dssm_model.params
-        self.infer_dump_fields = [dssm_model.doc_pos_fc]
-        if is_infer:
-            fetch_dict = {'query_doc_sim': R_Q_D_p}
-            return fetch_dict
-        loss = -paddle.sum(paddle.log(hit_prob), axis=-1)
-        avg_cost = paddle.mean(x=loss)
-        # print(avg_cost)
-        self._cost = avg_cost
-        fetch_dict = {'Loss': avg_cost}
-        return fetch_dict
-
-    def create_optimizer(self, strategy=None):
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=self.learning_rate, lazy_mode=True)
-        if strategy != None:
-            import paddle.distributed.fleet as fleet
-            optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(self._cost)
-
-    def infer_net(self, input):
-        return self.net(input, is_infer=True)

+ 0 - 78
recommend-model-produce/src/main/python/models/dssmdemo/transform.py

@@ -1,78 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-import numpy as np
-import sklearn.metrics
-
-filename = './result.txt'
-f = open(filename, "r")
-lines = f.readlines()
-f.close()
-result = []
-for line in lines:
-    if "query_doc_sim" in str(line):
-        result.append(line)
-result = result[:-1]
-f = open(filename, "w")
-for i in range(len(result)):
-    f.write(str(result[i]))
-f.close()
-
-label = []
-filename = '../../../datasets/BQ_dssm/label.txt'
-f = open(filename, "r")
-#f.readline()
-num = 0
-for line in f.readlines():
-    num = num + 1
-    line = line.strip()
-    label.append(line)
-f.close()
-print(num)
-
-filename = './result.txt'
-sim = []
-for line in open(filename):
-    line = line.strip().split(",")
-    line[3] = line[3].split(":")
-    line = line[3][1].strip(" ")
-    line = line.strip("[")
-    line = line.strip("]")
-    sim.append(float(line))
-
-filename = '../../../datasets/BQ_dssm/big_test/test.txt'
-f = open(filename, "r")
-#f.readline()
-query = []
-for line in f.readlines():
-    line = line.strip().split("\t")
-    query.append(line[0])
-f.close()
-
-
-def takeFirst(x):
-    return x[0]
-
-
-filename = 'pair.txt'
-line = []
-print(len(query), len(sim), len(label))
-for i in range(len(sim)):
-    line.append([str(query[i]), str(sim[i]), str(label[i])])
-line.sort(key=takeFirst)
-f = open(filename, "w")
-for i in line:
-    f.write(i[0] + "\t" + i[1] + "\t" + i[2] + "\n")
-f.close()
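For context, transform.py assumes infer-log lines shaped like the one below: the logging timestamp contributes the first comma, so after split(",") the similarity lands at index 3. The exact log layout is an assumption read off the parsing code above; a runnable sketch of the extraction on a synthetic line:

```python
# Synthetic infer-log line in the shape transform.py expects.
line = "2021-08-03 10:00:00,123 - INFO - epoch: 0, batch_id: 10, query_doc_sim: [0.87654]"

parts = line.strip().split(",")
value = parts[3].split(":")[1].strip(" ").strip("[").strip("]")
print(float(value))  # 0.87654
```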

+ 33 - 33
recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

@@ -24,10 +24,7 @@ import time
 import sys
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
-from hdfs import InsecureClient
 import paddle
-import paddle.distributed as dist
-from paddle.io import Dataset, DataLoader
 
 import warnings
 import logging
@@ -36,7 +33,6 @@ import numpy as np
 import struct
 from utils.utils_single import auc
 
-
 __dir__ = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
 
@@ -47,25 +43,6 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-client = InsecureClient('http://192.168.141.208:50070', user='root')
-
-class HDFSDataset(Dataset):
-    def __init__(self, hdfs_path, batch_size=32):
-        self.hdfs_path = hdfs_path
-        self.batch_size = batch_size
-        self.file_names = [f for f in self.client.list(hdfs_path) if f.type == 'FILE']
-
-
-    def __getitem__(self, idx):
-        # logic for reading a single sample
-        file_name = self.file_names[idx]
-        with client.read('/path/to/file.txt') as reader:
-            data = reader.read()
-            return data
-
-    def __len__(self):
-        return len(self.file_names)
-
 
 def parse_args():
     parser = argparse.ArgumentParser("PaddleRec train script")
@@ -123,9 +100,9 @@ class Main(object):
         self.pure_bf16 = self.config['pure_bf16']
 
     def run(self):
-        logger.info("Begin 11111111")
+        logger.info("Begin 11111111") 
         self.init_fleet_with_gloo()
-        logger.info("Begin 22222222")
+        logger.info("Begin 22222222") 
         self.network()
         if fleet.is_server():
             self.run_server()
@@ -138,12 +115,12 @@ class Main(object):
     def init_fleet_with_gloo(use_gloo=True):
         if use_gloo:
             os.environ["PADDLE_WITH_GLOO"] = "0"
-            logger.info("Begin 11111111222222")
+            logger.info("Begin 11111111222222") 
             role = role_maker.PaddleCloudRoleMaker(
                 is_collective=False,
                 init_gloo=False
             ) 
-            logger.info("Begin 11111111333333")
+            logger.info("Begin 11111111333333") 
             fleet.init(role)
             #logger.info("worker_index: %s", fleet.worker_index())
             #logger.info("is_first_worker: %s", fleet.is_first_worker())
@@ -203,10 +180,19 @@ class Main(object):
         else:
             opt_info['stat_var_names'] = []
 
+        if reader_type == "InmemoryDataset":
+            self.reader.load_into_memory()
 
         for epoch in range(epochs):
             epoch_start_time = time.time()
-            self.dataset_train_loop(epoch)
+
+            if sync_mode == "heter":
+                self.heter_train_loop(epoch)
+            elif reader_type == "QueueDataset":
+                self.dataset_train_loop(epoch)
+            elif reader_type == "InmemoryDataset":
+                self.dataset_train_loop(epoch)
+
             epoch_time = time.time() - epoch_start_time
             epoch_speed = self.example_nums / epoch_time
             if use_auc is True:
@@ -246,6 +232,9 @@ class Main(object):
                     [feed.name for feed in self.inference_feed_var],
                     [self.inference_target_var], self.exe)
 
+        if reader_type == "InmemoryDataset":
+            self.reader.release_memory()
+
     def init_reader(self):
         if fleet.is_server():
             return
@@ -266,17 +255,27 @@ class Main(object):
 
     def dataset_train_loop(self, epoch):
         logger.info("Epoch: {}, Running Dataset Begin.".format(epoch))
+        
         fetch_info = [
             "Epoch {} Var {}".format(epoch, var_name)
             for var_name in self.metrics
         ]
+
         fetch_vars = [var for _, var in self.metrics.items()]
+
         print_step = int(config.get("runner.print_interval"))
-        print(paddle.static.default_main_program()._fleet_opt)
-        dataset = HDFSDataset(hdfs_path='/path/to/data')
-        # create a distributed sampler
-        sampler = DistributedSampler(dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
-        loader = DataLoader(dataset, batch_size=32, sampler=sampler)
+
+        debug = config.get("runner.dataset_debug", False)
+        if config.get("runner.need_dump"):
+            debug = True
+            dump_fields_path = "{}/{}".format(
+                config.get("runner.dump_fields_path"), epoch)
+            set_dump_config(paddle.static.default_main_program(), {
+                "dump_fields_path": dump_fields_path,
+                "dump_fields": config.get("runner.dump_fields")
+            })
+        logger.info(paddle.static.default_main_program()._fleet_opt)
+        
         self.exe.train_from_dataset(
             program=paddle.static.default_main_program(),
             dataset=self.reader,
@@ -284,6 +283,7 @@ class Main(object):
             fetch_info=fetch_info,
             print_period=print_step,
             debug=debug)
+        
 
     def heter_train_loop(self, epoch):
         logger.info(

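For orientation, the InmemoryDataset branch this commit adds to the trainer follows Paddle's usual in-memory dataset lifecycle: load before the epoch loop, train from the dataset, release afterwards. A sketch under stated assumptions (the feed vars and file path are illustrative; the real ones come from create_feeds() and the runner config, and the filelist below matches what data/prepare_dump_data.sh produces):

```python
import paddle

paddle.enable_static()
# Illustrative feed vars; the trainer takes the real ones from create_feeds().
query = paddle.static.data(name="query", shape=[-1, 2900], dtype="float32")
doc_pos = paddle.static.data(name="doc_pos", shape=[-1, 2900], dtype="float32")

dataset = paddle.distributed.InMemoryDataset()
dataset.init(
    batch_size=8,
    thread_num=1,
    pipe_command="python3 bq_reader_train_insid.py",  # as in config_online.yaml
    use_var=[query, doc_pos])
# Hypothetical file produced by data/prepare_dump_data.sh.
dataset.set_filelist(["data/train_with_insid/20210803/1/data_with_lineid"])

dataset.load_into_memory()   # mirrors the call added before the epoch loop
# exe.train_from_dataset(program=..., dataset=dataset, ...)
dataset.release_memory()     # mirrors the call added after training
```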
Some files were not shown because too many files changed in this diff