criteo_reader.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from __future__ import print_function
  15. import numpy as np
  16. from paddle.io import IterableDataset
  17. class RecDataset(IterableDataset):
  18. def __init__(self, file_list, config):
  19. super(RecDataset, self).__init__()
  20. self.file_list = file_list
  21. self.init()
  22. def init(self):
  23. from operator import mul
  24. padding = 0
  25. sparse_slots = "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
  26. self.sparse_slots = sparse_slots.strip().split(" ")
  27. self.dense_slots = ["dense_feature"]
  28. self.dense_slots_shape = [13]
  29. self.slots = self.sparse_slots + self.dense_slots
  30. self.slot2index = {}
  31. self.visit = {}
  32. for i in range(len(self.slots)):
  33. self.slot2index[self.slots[i]] = i
  34. self.visit[self.slots[i]] = False
  35. self.padding = padding
  36. def __iter__(self):
  37. full_lines = []
  38. self.data = []
  39. for file in self.file_list:
  40. with open(file, "r") as rf:
  41. for l in rf:
  42. line = l.strip().split(" ")
  43. output = [(i, []) for i in self.slots]
  44. for i in line:
  45. slot_feasign = i.split(":")
  46. slot = slot_feasign[0]
  47. if slot not in self.slots:
  48. continue
  49. if slot in self.sparse_slots:
  50. feasign = int(slot_feasign[1])
  51. else:
  52. feasign = float(slot_feasign[1])
  53. output[self.slot2index[slot]][1].append(feasign)
  54. self.visit[slot] = True
  55. for i in self.visit:
  56. slot = i
  57. if not self.visit[slot]:
  58. if i in self.dense_slots:
  59. output[self.slot2index[i]][1].extend(
  60. [self.padding] *
  61. self.dense_slots_shape[self.slot2index[i]])
  62. else:
  63. output[self.slot2index[i]][1].extend(
  64. [self.padding])
  65. else:
  66. self.visit[slot] = False
  67. # sparse
  68. output_list = []
  69. for key, value in output[:-1]:
  70. output_list.append(np.array(value).astype('int64'))
  71. # dense
  72. output_list.append(
  73. np.array(output[-1][1]).astype("float32"))
  74. # list
  75. yield output_list