丁云鹏 5 months ago
parent
revision
45766f389a
21 changed files with 958 additions and 22 deletions
  1. +0 -0     recommend-model-produce/src/main/python/models/dssm/config_ps.yaml
  2. +13 -0    recommend-model-produce/src/main/python/models/dssmdemo/__init__.py
  3. +41 -0    recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_infer.py
  4. +48 -0    recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train.py
  5. +82 -0    recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train_insid.py
  6. +41 -0    recommend-model-produce/src/main/python/models/dssmdemo/config.yaml
  7. +42 -0    recommend-model-produce/src/main/python/models/dssmdemo/config_bigdata.yaml
  8. +70 -0    recommend-model-produce/src/main/python/models/dssmdemo/config_online.yaml
  9. +46 -0    recommend-model-produce/src/main/python/models/dssmdemo/config_ps.yaml
 10. +24 -0    recommend-model-produce/src/main/python/models/dssmdemo/data/data_process.sh
 11. +27 -0    recommend-model-produce/src/main/python/models/dssmdemo/data/prepare_dump_data.sh
 12. +0 -0     recommend-model-produce/src/main/python/models/dssmdemo/data/preprocess.py
 13. +0 -0     recommend-model-produce/src/main/python/models/dssmdemo/data/test/test.txt
 14. +0 -0     recommend-model-produce/src/main/python/models/dssmdemo/data/train/train.txt
 15. +93 -0    recommend-model-produce/src/main/python/models/dssmdemo/dygraph_model.py
 16. +101 -0   recommend-model-produce/src/main/python/models/dssmdemo/net.py
 17. +140 -0   recommend-model-produce/src/main/python/models/dssmdemo/readme.md
 18. +19 -0    recommend-model-produce/src/main/python/models/dssmdemo/run.sh
 19. +85 -0    recommend-model-produce/src/main/python/models/dssmdemo/static_model.py
 20. +78 -0    recommend-model-produce/src/main/python/models/dssmdemo/transform.py
 21. +8 -22    recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

+ 0 - 0
recommend-model-produce/src/main/python/models/dssm/config_ps.yaml


+ 13 - 0
recommend-model-produce/src/main/python/models/dssmdemo/__init__.py

@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 41 - 0
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_infer.py

@@ -0,0 +1,41 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+
+from paddle.io import IterableDataset
+
+
+class RecDataset(IterableDataset):
+    def __init__(self, file_list, config):
+        super(RecDataset, self).__init__()
+        self.file_list = file_list
+
+    def __iter__(self):
+        full_lines = []
+        for file in self.file_list:
+            with open(file, "r") as rf:
+                for line in rf:
+                    output_list = []
+                    features = line.rstrip('\n').split('\t')
+                    query = [
+                        float(feature) for feature in features[0].split(',')
+                    ]
+                    output_list.append(np.array(query).astype('float32'))
+                    pos_doc = [
+                        float(feature) for feature in features[1].split(',')
+                    ]
+                    output_list.append(np.array(pos_doc).astype('float32'))
+                    yield output_list

+ 48 - 0
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train.py

@@ -0,0 +1,48 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+
+from paddle.io import IterableDataset
+
+
+class RecDataset(IterableDataset):
+    def __init__(self, file_list, config):
+        super(RecDataset, self).__init__()
+        self.file_list = file_list
+
+    def __iter__(self):
+        full_lines = []
+        for file in self.file_list:
+            with open(file, "r") as rf:
+                for line in rf:
+                    output_list = []
+                    features = line.rstrip('\n').split('\t')
+                    query = [
+                        float(feature) for feature in features[0].split(',')
+                    ]
+                    output_list.append(np.array(query).astype('float32'))
+                    pos_doc = [
+                        float(feature) for feature in features[1].split(',')
+                    ]
+                    output_list.append(np.array(pos_doc).astype('float32'))
+
+                    for i in range(len(features) - 2):
+                        output_list.append(
+                            np.array([
+                                float(feature)
+                                for feature in features[i + 2].split(',')
+                            ]).astype('float32'))
+                    yield output_list
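
A minimal sketch of how this reader is typically driven locally, assuming a `data/train/train.txt` whose lines are tab-separated comma lists (`<query>\t<pos_doc>\t<neg_doc>...`), matching the parsing above; the file name and batch size are illustrative:

```python
# Sketch: iterate bq_reader_train.RecDataset through a Paddle DataLoader.
from paddle.io import DataLoader
from bq_reader_train import RecDataset

dataset = RecDataset(["data/train/train.txt"], config=None)  # config is unused by this reader
loader = DataLoader(dataset, batch_size=8)
for batch in loader:
    # batch[0]: query, batch[1]: pos_doc, batch[2:]: negative docs
    print([t.shape for t in batch])
    break
```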

+ 82 - 0
recommend-model-produce/src/main/python/models/dssmdemo/bq_reader_train_insid.py

@@ -0,0 +1,82 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import yaml
+import six
+import os
+import copy
+import paddle.distributed.fleet as fleet
+import logging
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class Reader(fleet.MultiSlotStringDataGenerator):
+    def init(self, config):
+        self.config = config
+        self.neg_num = self.config.get("hyper_parameters.neg_num")
+
+    def line_process(self, line):
+        data = line.rstrip('\n').split('\t')
+        ins_id = [data[0]]
+        content = [data[1]]
+        features = data[2:]
+        query = features[0].split(',')
+        pos_doc = features[1].split(',')
+
+        neg_doc_list = []
+        for i in range(self.neg_num):
+            neg_doc_list.append(features[i + 2].split(','))
+
+        return [ins_id, content, query, pos_doc] + neg_doc_list
+
+    def generate_sample(self, line):
+        "Dataset Generator"
+
+        def reader():
+            input_data = self.line_process(line)
+            feature_name = ["insid", "content", "query", "pos_doc"]
+            for i in range(self.neg_num):
+                feature_name.append("neg_doc_{}".format(i))
+            yield zip(feature_name, input_data)
+
+        return reader
+
+    def dataloader(self, file_list):
+        "DataLoader Pyreader Generator"
+
+        def reader():
+            for file in file_list:
+                with open(file, 'r') as f:
+                    for line in f:
+                        input_data = self.line_process(line)
+                        yield input_data
+
+        return reader
+
+
+if __name__ == "__main__":
+    yaml_path = sys.argv[1]
+    utils_path = sys.argv[2]
+    sys.path.append(utils_path)
+    import common_ps
+    yaml_helper = common_ps.YamlHelper()
+    config = yaml_helper.load_yaml(yaml_path)
+
+    r = Reader()
+    r.init(config)
+    # r.init(None)
+    r.run_from_stdin()
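
A minimal sanity check for the line parser above, feeding one synthetic line directly to `line_process` instead of going through `run_from_stdin`; the flat dict stands in for the dotted keys the YAML helper exposes, and the sample values are illustrative:

```python
# Sketch: exercise Reader.line_process on one synthetic sample line.
from bq_reader_train_insid import Reader

r = Reader()
r.init({"hyper_parameters.neg_num": 1})  # flat stand-in for the YAML-loaded config
line = "1\titem_1\t0.1,0.2\t0.3,0.4\t0.5,0.6"  # insid \t content \t query \t pos_doc \t neg_doc
print(r.line_process(line))
# [['1'], ['item_1'], ['0.1', '0.2'], ['0.3', '0.4'], ['0.5', '0.6']]
```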

+ 41 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config.yaml

@@ -0,0 +1,41 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 8
+  model_save_path: "output_model_dssm"
+
+  use_gpu: False
+  epochs: 1
+  print_interval: 10
+  
+  test_data_dir: "data/test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']
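
The readers and models in this commit look values up with dotted keys such as `hyper_parameters.neg_num`; PaddleRec's YAML helper resolves such keys against the nested config. A minimal stand-in for that lookup, under the assumption that it simply walks the nesting (the helper name is illustrative):

```python
# Sketch: dotted-key lookup over a nested YAML config.
import yaml

def get_dotted(cfg, key, default=None):
    node = cfg
    for part in key.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)
print(get_dotted(cfg, "hyper_parameters.neg_num"))  # -> 1
```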

+ 42 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config_bigdata.yaml

@@ -0,0 +1,42 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+runner:
+  train_data_dir: "../../../datasets/BQ_dssm/big_train"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 128
+  model_save_path: "output_model_all_dssm"
+
+  use_gpu: False
+  epochs: 1
+  print_interval: 1
+  
+  test_data_dir: "../../../datasets/BQ_dssm/big_test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_all_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 5913
+  neg_num: 1
+  slice_end: 128
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

+ 70 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config_online.yaml

@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train_with_insid"
+  test_data_dir: "data/test_with_insid"
+  # train_reader_path: "bq_reader_train"  # importlib format
+  days: "{20210803..20210804}"
+  pass_per_day: 1
+  train_batch_size: 8
+  test_batch_size: 8
+  model_save_path: "output_model_dssm"
+
+  reader_type: "InmemoryDataset"  # DataLoader / QueueDataset / RecDataset / InmemoryDataset
+  pipe_command: "python3 bq_reader_train_insid.py"
+
+  sync_mode: "async"
+  # thread_num: 1
+  train_thread_num: 1
+  test_thread_num: 1
+
+  use_gpu: False
+  epochs: 1
+  print_interval: 1
+
+  dataset_debug: False
+
+  # when you need to prune the net, set need_prune to True,
+  # and set prune_feed_vars and prune_target_var in static_model.py
+  need_prune: True
+
+  parse_ins_id: True
+  parse_content: True
+  
+  # when you need to dump fields and params during training, set need_train_dump to True,
+  # and set train_dump_fields and train_dump_params in static_model.py
+  need_train_dump: True
+  # train_dump_fields_dir: "afs:/xxx"
+  train_dump_fields_dir: "./train_dump_data"
+
+  # when you need to dump fields during inference, set need_infer_dump to True,
+  # and set infer_dump_fields in static_model.py
+  need_infer_dump: True
+  # infer_dump_fields_dir: "afs:/xxx"
+  infer_dump_fields_dir: "./infer_dump_data"
+
+  fs_name: "afs://xxx"
+  fs_ugi: "xxx,xxx"
+  
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

+ 46 - 0
recommend-model-produce/src/main/python/models/dssmdemo/config_ps.yaml

@@ -0,0 +1,46 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train"
+  train_reader_path: "bq_reader_train"  # importlib format
+  train_batch_size: 8
+  model_save_path: "output_model_dssm"
+
+  reader_type: "QueueDataset"  # DataLoader / QueueDataset / RecDataset
+  pipe_command: "python bq_reader_train_ps.py"
+  thread_num: 1
+  sync_mode: "async"
+
+  use_gpu: False
+  epochs: 10
+  print_interval: 10
+  
+  test_data_dir: "data/test"
+  infer_reader_path: "bq_reader_infer"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_dssm"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+hyper_parameters:
+  optimizer:
+    class: adam
+    learning_rate: 0.001
+    strategy: sync
+  trigram_d: 2900
+  neg_num: 1
+  slice_end: 8
+  fc_sizes: [300, 300, 128]
+  fc_acts: ['relu', 'relu', 'relu']

+ 24 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/data_process.sh

@@ -0,0 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
+mv bq/train.txt ./raw_data.txt
+python3 preprocess.py
+mkdir big_train
+mv train.txt ./big_train
+mkdir big_test
+mv test.txt ./big_test

+ 27 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/prepare_dump_data.sh

@@ -0,0 +1,27 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+
+
+cat train/train.txt | awk -F'\t' 'BEGIN{OFS="\t"}{print NR, "item_"NR, $0}' > data_with_lineid
+for i in 20210803 20210804
+do
+    for j in 1
+    do
+        mkdir -p train_with_insid/$i/$j
+        cp data_with_lineid train_with_insid/$i/$j
+        mkdir -p test_with_insid/$i/$j
+        cp data_with_lineid test_with_insid/$i/$j
+    done
+done
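
The awk one-liner above prepends a line number (insid) and an `item_<n>` content field to every training line, producing the two extra columns that bq_reader_train_insid.py parses. For example (values illustrative, `<TAB>` marks the tab separator):

```
input  (train/train.txt):  0.1,0.2<TAB>0.3,0.4<TAB>0.5,0.6
output (data_with_lineid): 1<TAB>item_1<TAB>0.1,0.2<TAB>0.3,0.4<TAB>0.5,0.6
```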

+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/preprocess.py


File diff suppressed because it is too large
+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/test/test.txt


+ 0 - 0
recommend-model-produce/src/main/python/models/dssmdemo/data/train/train.txt


+ 93 - 0
recommend-model-produce/src/main/python/models/dssmdemo/dygraph_model.py

@@ -0,0 +1,93 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import math
+import net
+
+
+class DygraphModel():
+    # define model
+    def create_model(self, config):
+        trigram_d = config.get('hyper_parameters.trigram_d', None)
+        neg_num = config.get('hyper_parameters.neg_num', None)
+        slice_end = config.get('hyper_parameters.slice_end', None)
+        fc_sizes = config.get('hyper_parameters.fc_sizes', None)
+        fc_acts = config.get('hyper_parameters.fc_acts', None)
+
+        DSSM_model = net.DSSMLayer(trigram_d, neg_num, slice_end, fc_sizes,
+                                   fc_acts)
+        return DSSM_model
+
+    # define feeds, converting the numpy batch data to paddle tensors
+    def create_feeds_train(self, batch_data, trigram_d):
+        query = paddle.to_tensor(batch_data[0].numpy().astype('float32')
+                                 .reshape(-1, trigram_d))
+        doc_pos = paddle.to_tensor(batch_data[1].numpy().astype('float32')
+                                   .reshape(-1, trigram_d))
+        doc_negs = []
+        for ele in batch_data[2:]:
+            doc_negs.append(
+                paddle.to_tensor(ele.numpy().astype('float32').reshape(
+                    -1, trigram_d)))
+        return [query, doc_pos] + doc_negs
+
+    def create_feeds_infer(self, batch_data, trigram_d):
+        query = paddle.to_tensor(batch_data[0].numpy().astype('float32')
+                                 .reshape(-1, trigram_d))
+        doc_pos = paddle.to_tensor(batch_data[1].numpy().astype('float32')
+                                   .reshape(-1, trigram_d))
+        return [query, doc_pos]
+
+    # define loss function by predicts and label
+    def create_loss(self, hit_prob):
+        loss = -paddle.sum(paddle.log(hit_prob), axis=-1)
+        avg_cost = paddle.mean(x=loss)
+        return avg_cost
+
+    # define optimizer 
+    def create_optimizer(self, dy_model, config):
+        lr = config.get("hyper_parameters.optimizer.learning_rate", 0.001)
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=lr, parameters=dy_model.parameters())
+        return optimizer
+
+    # define metrics such as auc/acc
+    # multi-task models need to define multiple metrics
+    def create_metrics(self):
+        metrics_list_name = []
+        metrics_list = []
+        return metrics_list, metrics_list_name
+
+    # construct train forward phase  
+    def train_forward(self, dy_model, metrics_list, batch_data, config):
+        trigram_d = config.get('hyper_parameters.trigram_d', None)
+        inputs = self.create_feeds_train(batch_data, trigram_d)
+
+        R_Q_D_p, hit_prob = dy_model.forward(inputs, False)
+        loss = self.create_loss(hit_prob)
+        # update metrics
+        print_dict = {"loss": loss}
+        return loss, metrics_list, print_dict
+
+    def infer_forward(self, dy_model, metrics_list, batch_data, config):
+        trigram_d = config.get('hyper_parameters.trigram_d', None)
+        inputs = self.create_feeds_infer(batch_data, trigram_d)
+
+        R_Q_D_p, hit_prob = dy_model.forward(inputs, True)
+        # update metrics
+        print_dict = {"query_doc_sim": R_Q_D_p}
+        return metrics_list, print_dict
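
The loss above is the negative log of the softmax probability assigned to the positive doc. A tiny numeric sketch (the similarity values are illustrative):

```python
# Sketch of create_loss: softmax over [R(Q,D+), R(Q,D-)], then -log of the positive slot.
import paddle
import paddle.nn.functional as F

sims = paddle.to_tensor([[0.9, 0.2]])  # cosine similarities: positive doc, one negative doc
prob = F.softmax(sims, axis=1)         # ~[[0.668, 0.332]]
hit_prob = prob[:, 0:1]                # probability assigned to the positive doc
loss = paddle.mean(-paddle.sum(paddle.log(hit_prob), axis=-1))
print(float(loss))                     # ~0.403
```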

+ 101 - 0
recommend-model-produce/src/main/python/models/dssmdemo/net.py

@@ -0,0 +1,101 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+
+class DSSMLayer(nn.Layer):
+    def __init__(self, trigram_d, neg_num, slice_end, hidden_layers,
+                 hidden_acts):
+        super(DSSMLayer, self).__init__()
+
+        self.hidden_layers = [trigram_d] + hidden_layers
+        self.hidden_acts = hidden_acts
+        self.slice_end = slice_end
+
+        self._query_layers = []
+        for i in range(len(self.hidden_layers) - 1):
+            linear = paddle.nn.Linear(
+                in_features=self.hidden_layers[i],
+                out_features=self.hidden_layers[i + 1],
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])),
+                bias_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])))
+            self.add_sublayer('query_linear_%d' % i, linear)
+            self._query_layers.append(linear)
+            if self.hidden_acts[i] == "relu":
+                act = paddle.nn.ReLU()
+                self.add_sublayer('query_act_%d' % i, act)
+                self._query_layers.append(act)
+
+        self._doc_layers = []
+        for i in range(len(self.hidden_layers) - 1):
+            linear = paddle.nn.Linear(
+                in_features=self.hidden_layers[i],
+                out_features=self.hidden_layers[i + 1],
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])),
+                bias_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal(
+                        fan_in=self.hidden_layers[i],
+                        fan_out=self.hidden_layers[i + 1])))
+            self.add_sublayer('pos_linear_%d' % i, linear)
+            self._doc_layers.append(linear)
+            if self.hidden_acts[i] == "relu":
+                act = paddle.nn.ReLU()
+                self.add_sublayer('pos_act_%d' % i, act)
+                self._doc_layers.append(act)
+
+    def forward(self, input_data, is_infer):
+        query_fc = input_data[0]
+        for n_layer in self._query_layers:
+            query_fc = n_layer(query_fc)
+        self.query_fc = query_fc
+
+        doc_pos_fc = input_data[1]
+        for n_layer in self._doc_layers:
+            doc_pos_fc = n_layer(doc_pos_fc)
+        self.doc_pos_fc = doc_pos_fc
+
+        self.params = [self._query_layers[-2].bias]
+
+        R_Q_D_p = F.cosine_similarity(
+            query_fc, doc_pos_fc, axis=1).reshape([-1, 1])
+
+        if is_infer:
+            return R_Q_D_p, paddle.ones(shape=[self.slice_end, 1])
+
+        R_Q_D_ns = []
+        for i in range(len(input_data) - 2):
+            doc_neg_fc_i = input_data[i + 2]
+            for n_layer in self._doc_layers:
+                doc_neg_fc_i = n_layer(doc_neg_fc_i)
+            R_Q_D_n = F.cosine_similarity(
+                query_fc, doc_neg_fc_i, axis=1).reshape([-1, 1])
+            R_Q_D_ns.append(R_Q_D_n)
+        concat_Rs = paddle.concat(x=[R_Q_D_p] + R_Q_D_ns, axis=1)
+        prob = F.softmax(concat_Rs, axis=1)
+        hit_prob = paddle.slice(
+            prob, axes=[0, 1], starts=[0, 0], ends=[self.slice_end, 1])
+        return R_Q_D_p, hit_prob
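
A minimal smoke test for the layer above, with shapes taken from config.yaml (trigram_d=2900, neg_num=1, batch size 8 so slice_end=8); the random inputs are illustrative:

```python
# Sketch: run DSSMLayer forward on random data.
import paddle
from net import DSSMLayer

model = DSSMLayer(trigram_d=2900, neg_num=1, slice_end=8,
                  hidden_layers=[300, 300, 128], hidden_acts=['relu', 'relu', 'relu'])
query, pos_doc, neg_doc = (paddle.rand([8, 2900]) for _ in range(3))
R_Q_D_p, hit_prob = model.forward([query, pos_doc, neg_doc], is_infer=False)
print(R_Q_D_p.shape, hit_prob.shape)  # [8, 1] [8, 1]
```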

+ 140 - 0
recommend-model-produce/src/main/python/models/dssmdemo/readme.md

@@ -0,0 +1,140 @@
+# DSSM Text Matching Model
+
+**[Run online on AI Studio](https://aistudio.baidu.com/aistudio/projectdetail/3238124)**
+
+The directory structure of this example is as follows:
+
+```
+├── data # sample data
+    ├── train
+        ├── train.txt # training data sample
+    ├── test
+        ├── test.txt # test data sample
+    ├── preprocess.py # data preprocessing script
+    ├── data_process # one-click data processing script
+├── __init__.py
+├── README.md # this document
+├── config.yaml # configuration for the sample data
+├── config_bigdata.yaml # configuration for the full dataset
+├── net.py # core network definition (shared by dynamic and static graphs)
+├── static_model.py # builds the static graph
+├── dygraph_model.py # builds the dynamic graph
+├── transform.py # reshapes the output into a format suitable for metric computation
+├── run.sh # full-dataset script, from training through inference and metric computation
+├── bq_reader_train.py # data reader used during training
+├── bq_reader_infer.py # data reader used during inference
+```
+
+Note: before reading this example, we recommend you first review the following:
+
+[PaddleRec beginner tutorial](https://github.com/PaddlePaddle/PaddleRec/blob/master/README.md)
+[readthedocs documentation](https://paddlerec.readthedocs.io/en/latest/models/match/dssm.html)
+
+
+## Contents
+
+- [Model overview](#model-overview)
+- [Data preparation](#data-preparation)
+- [Runtime environment](#runtime-environment)
+- [Quick start](#quick-start)
+- [Network architecture](#network-architecture)
+- [Reproducing the results](#reproducing-the-results)
+- [Advanced usage](#advanced-usage)
+- [FAQ](#faq)
+
+## Model overview
+DSSM stands for Deep Structured Semantic Model, a semantic model built on deep networks. Its core idea is to map queries and docs into a shared semantic space and maximize the cosine similarity between their semantic vectors, thereby learning a latent semantic model for retrieval. DSSM is widely applied in search engine retrieval, ad relevance, question answering, machine translation, and more.
+
+## Data preparation
+BQ is a Chinese question-matching dataset for intelligent customer service, drawn from QA-system corpora. It contains 120,000 sentence pairs annotated with similarity labels. The data contains typos, non-standard grammar, and similar noise, which makes it closer to real industrial scenarios.
+A sample of the raw dataset:
+```
+请问一天是否都是限定只能转入或转出都是五万。    微众多少可以赎回短期理财        0
+微粒咨询电话号码多少    你们的人工客服电话是多少        1
+已经在银行换了新预留号码。      我现在换了电话号码,这个需要更换吗      1
+```
+Fields are tab-separated: columns 1 and 2 are the two texts, and column 3 is the label (0 means the texts are dissimilar, 1 means they are similar).
+
+## Runtime environment
+PaddlePaddle >= 2.0
+
+Python 2.7/3.5/3.6/3.7
+
+OS: Windows/Linux/macOS
+
+## Quick start
+Sample data is provided so you can try the model quickly, and the commands can be run from any directory. Quick-start commands in the dssm model directory:
+```bash
+# enter the model directory
+# cd models/match/dssm # can be run from any directory
+# dynamic-graph training
+python -u ../../../tools/trainer.py -m config.yaml # use config_bigdata.yaml for the full dataset
+# dynamic-graph inference
+python -u ../../../tools/infer.py -m config.yaml
+
+# static-graph training
+python -u ../../../tools/static_trainer.py -m config.yaml # use config_bigdata.yaml for the full dataset
+# static-graph inference
+python -u ../../../tools/static_infer.py -m config.yaml
+```
+
+## Network architecture
+DSSM takes BOW (bag of words) input, which discards word-position information: all words of a sentence go into one bag. Each sentence is converted to a vector this way and fed into a DNN.
+The semantic similarity between Query and Doc is expressed as the cosine distance between their two vectors, and a softmax then selects the Doc most semantically similar to the Query.
+
+For the details of the model, see the paper [DSSM](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf):
+<p align="center">
+<img align="center" src="../../../doc/imgs/dssm.png">
+</p>
+
+## Reproducing the results
+To let users quickly run every model, sample data is provided under each model directory. To reproduce the results in this readme, follow the steps below.
+Metrics of the model on the full dataset:
+
+| Model | Positive order rate | batch_size | epoch_num | Time per epoch |
+| :------| :------ | :------ | :------| :------ |
+| DSSM | 0.93 | 128 | 1 | ~9 minutes |
+
+1. Make sure your current directory is PaddleRec/models/match/dssm
+2. Go to the paddlerec/datasets/BQ_dssm directory and run the script below; it downloads the preprocessed full BQ dataset from a mirror server and unpacks it into the target folder.
+``` bash
+cd ../../../datasets/BQ_dssm
+sh run.sh
+```
+3. Switch back to the model directory and run `bash run.sh` to reproduce the reported results.
+The script automatically trains and tests the model, saves the test output to result.txt, runs transform.py to reshape the format, and finally runs ../../../tools/cal_pos_neg.py to compute the positive order rate.
+```bash
+cd - # switch back to the model directory
+bash run.sh # dynamic-graph training and testing, producing the final metric
+```
+
+## Advanced usage
+As a vector-recall method in recommender systems, DSSM usually precomputes the doc-side vectors and loads them into a vector search engine (e.g. Milvus), while only the query-side model is saved. At serving time, the query-side input is encoded into a vector, and the matching docs are recalled directly from the vector search engine.
+To dump all doc-side vectors from an inference stage added to training, make the following changes:
+1. To distinguish the dumped vectors, the inference data needs two extra fields, insid and content, where insid uniquely identifies a sample and content identifies the corresponding doc. Parse these two fields in the data-processing script; see bq_reader_train_insid.py.
+2. Choose InmemoryDataset as the dataset, and set
+```python
+dataset.set_parse_ins_id(True)
+dataset.set_parse_content(True)
+```
+3. In static_model.py, configure the variables to dump (the top-layer doc-side output)
+```python
+self.infer_dump_fields = [dssm_model.doc_pos_fc]
+```
+4. In the config file, enable dumping during inference and configure the dump path
+```bash
+need_infer_dump: True
+infer_dump_fields_dir: "./infer_dump_data"
+```
+When saving the model, only the query-side network needs to be saved:
+1. In the config file, enable the network-pruning switch
+```bash
+need_prune: True
+```
+2. In static_model.py, configure the inputs and output of the pruned network
+```python
+self.prune_feed_vars = [query]
+self.prune_target_var = dssm_model.query_fc
+```
+
+## FAQ

+ 19 - 0
recommend-model-produce/src/main/python/models/dssmdemo/run.sh

@@ -0,0 +1,19 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+echo "................run................."
+python -u ../../../tools/trainer.py -m config_bigdata.yaml
+python -u ../../../tools/infer.py -m config_bigdata.yaml &> result.txt
+python transform.py
+python ../../../tools/cal_pos_neg.py pair.txt

+ 85 - 0
recommend-model-produce/src/main/python/models/dssmdemo/static_model.py

@@ -0,0 +1,85 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+from net import DSSMLayer
+
+
+class StaticModel():
+    def __init__(self, config):
+        self.cost = None
+        self.config = config
+        self._init_hyper_parameters()
+
+    def _init_hyper_parameters(self):
+        self.trigram_d = self.config.get("hyper_parameters.trigram_d")
+        self.neg_num = self.config.get("hyper_parameters.neg_num")
+        self.hidden_layers = self.config.get("hyper_parameters.fc_sizes")
+        self.hidden_acts = self.config.get("hyper_parameters.fc_acts")
+        self.learning_rate = self.config.get("hyper_parameters.learning_rate")
+        self.slice_end = self.config.get("hyper_parameters.slice_end")
+        self.learning_rate = self.config.get(
+            "hyper_parameters.optimizer.learning_rate")
+
+    def create_feeds(self, is_infer=False):
+        query = paddle.static.data(
+            name="query", shape=[-1, self.trigram_d], dtype='float32')
+        self.prune_feed_vars = [query]
+
+        doc_pos = paddle.static.data(
+            name="doc_pos", shape=[-1, self.trigram_d], dtype='float32')
+
+        if is_infer:
+            return [query, doc_pos]
+
+        doc_negs = [
+            paddle.static.data(
+                name="doc_neg_" + str(i),
+                shape=[-1, self.trigram_d],
+                dtype="float32") for i in range(self.neg_num)
+        ]
+        feeds_list = [query, doc_pos] + doc_negs
+        return feeds_list
+
+    def net(self, input, is_infer=False):
+        dssm_model = DSSMLayer(self.trigram_d, self.neg_num, self.slice_end,
+                               self.hidden_layers, self.hidden_acts)
+        R_Q_D_p, hit_prob = dssm_model.forward(input, is_infer)
+
+        self.inference_target_var = R_Q_D_p
+        self.prune_target_var = dssm_model.query_fc
+        self.train_dump_fields = [dssm_model.query_fc, R_Q_D_p]
+        self.train_dump_params = dssm_model.params
+        self.infer_dump_fields = [dssm_model.doc_pos_fc]
+        if is_infer:
+            fetch_dict = {'query_doc_sim': R_Q_D_p}
+            return fetch_dict
+        loss = -paddle.sum(paddle.log(hit_prob), axis=-1)
+        avg_cost = paddle.mean(x=loss)
+        # print(avg_cost)
+        self._cost = avg_cost
+        fetch_dict = {'Loss': avg_cost}
+        return fetch_dict
+
+    def create_optimizer(self, strategy=None):
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=self.learning_rate, lazy_mode=True)
+        if strategy != None:
+            import paddle.distributed.fleet as fleet
+            optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(self._cost)
+
+    def infer_net(self, input):
+        return self.net(input, is_infer=True)

+ 78 - 0
recommend-model-produce/src/main/python/models/dssmdemo/transform.py

@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import numpy as np
+import sklearn.metrics
+
+filename = './result.txt'
+f = open(filename, "r")
+lines = f.readlines()
+f.close()
+result = []
+for line in lines:
+    if "query_doc_sim" in str(line):
+        result.append(line)
+result = result[:-1]
+f = open(filename, "w")
+for i in range(len(result)):
+    f.write(str(result[i]))
+f.close()
+
+label = []
+filename = '../../../datasets/BQ_dssm/label.txt'
+f = open(filename, "r")
+#f.readline()
+num = 0
+for line in f.readlines():
+    num = num + 1
+    line = line.strip()
+    label.append(line)
+f.close()
+print(num)
+
+filename = './result.txt'
+sim = []
+for line in open(filename):
+    line = line.strip().split(",")
+    line[3] = line[3].split(":")
+    line = line[3][1].strip(" ")
+    line = line.strip("[")
+    line = line.strip("]")
+    sim.append(float(line))
+
+filename = '../../../datasets/BQ_dssm/big_test/test.txt'
+f = open(filename, "r")
+#f.readline()
+query = []
+for line in f.readlines():
+    line = line.strip().split("\t")
+    query.append(line[0])
+f.close()
+
+
+def takeFirst(x):
+    return x[0]
+
+
+filename = 'pair.txt'
+line = []
+print(len(query), len(sim), len(label))
+for i in range(len(sim)):
+    line.append([str(query[i]), str(sim[i]), str(label[i])])
+line.sort(key=takeFirst)
+f = open(filename, "w")
+for i in line:
+    f.write(i[0] + "\t" + i[1] + "\t" + i[2] + "\n")
+f.close()

+ 8 - 22
recommend-model-produce/src/main/python/tools/static_ps_trainer_v2.py

@@ -24,7 +24,7 @@ import time
 import sys
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.utils import HDFSClient
+from hdfs import InsecureClient
 import paddle
 import paddle.distributed as dist
 from paddle.io import Dataset, DataLoader
@@ -47,25 +47,21 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-hadoop_home = "/home/client/hadoop-client/hadoop/"
-configs = {
-    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-    "hadoop.job.ugi": "hello,hello123"
-}
-
+client = InsecureClient('http://192.168.141.208:50070', user='root')
 
 class HDFSDataset(Dataset):
     def __init__(self, hdfs_path, batch_size=32):
         self.hdfs_path = hdfs_path
         self.batch_size = batch_size
-        self.dir_names, self.file_names = client.list_dirs(hdfs_path)
-        client = HDFSClient(hadoop_home, configs)
+        self.file_names = [name for name, st in client.list(hdfs_path, status=True) if st['type'] == 'FILE']
+
 
     def __getitem__(self, idx):
        # logic for reading a single sample
         file_name = self.file_names[idx]
-        data = reader.read()
-        return data
+        with client.read(self.hdfs_path + '/' + file_name) as reader:
+            data = reader.read()
+            return data
 
     def __len__(self):
         return len(self.file_names)
@@ -276,18 +272,8 @@ class Main(object):
         ]
         fetch_vars = [var for _, var in self.metrics.items()]
         print_step = int(config.get("runner.print_interval"))
-
-        debug = config.get("runner.dataset_debug", False)
-        if config.get("runner.need_dump"):
-            debug = True
-            dump_fields_path = "{}/{}".format(
-                config.get("runner.dump_fields_path"), epoch)
-            set_dump_config(paddle.static.default_main_program(), {
-                "dump_fields_path": dump_fields_path,
-                "dump_fields": config.get("runner.dump_fields")
-            })
         print(paddle.static.default_main_program()._fleet_opt)
-        dataset = HDFSDataset(hdfs_path='hdfs://namenode:port/path/to/data')
+        dataset = HDFSDataset(hdfs_path='/path/to/data')
        # create the distributed sampler
         sampler = DistributedSampler(dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
         loader = DataLoader(dataset, batch_size=32, sampler=sampler)
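
For reference, a minimal sketch of the HdfsCLI (`hdfs` package) calls this dataset now relies on; the WebHDFS endpoint, user, and paths are placeholders:

```python
# Sketch: list files with status info via WebHDFS, then stream one file's bytes.
from hdfs import InsecureClient

client = InsecureClient('http://namenode:50070', user='root')  # placeholder endpoint
entries = client.list('/path/to/data', status=True)            # [(name, status_dict), ...]
file_names = [name for name, st in entries if st['type'] == 'FILE']
with client.read('/path/to/data/' + file_names[0]) as reader:
    data = reader.read()
print(len(file_names), len(data))
```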

Some files were not shown because the diff is too large