"""
functions.py

Parameter validation, feature preprocessing and score prediction helpers.

@author: luojunhui
"""
from typing import Any

import jieba.analyse
import pandas as pd

from .model_init import models
  7. class ParamProcess(object):
  8. """
  9. 处理 params, 继承 models
  10. """
  11. def __init__(self):
  12. self.model_v1 = models.model_v1
  13. self.model_v2 = models.model_v2
  14. self.label_encoder = models.label_encoder
  15. self.features_v1 = ["channel", "type", "title"]
  16. self.features_v2 = ["channel", "out_user_id", "mode", "out_play_cnt", "out_like_cnt", "out_share_cnt", "title",
  17. "lop", "duration"]
  18. async def title_to_tags(self, features):
  19. """
  20. process video title to tags and transform features_json_to_dataFrame
  21. :param features:
  22. :return:
  23. """
  24. title = features['title']
  25. if title:
  26. title = title.strip()
  27. title_tags = list(jieba.analyse.textrank(title, topK=3))
  28. if title_tags:
  29. for i in range(3):
  30. try:
  31. features['tag{}'.format(i + 1)] = title_tags[i]
  32. except:
  33. features['tag_{}'.format(i + 1)] = None
  34. else:
  35. features['tag1'] = None
  36. features['tag2'] = None
  37. features['tag3'] = None
  38. df = pd.DataFrame([features])
  39. df = df.drop('title', axis=1)
  40. return df
  41. async def predict_score(self, version, features):
  42. """
  43. 预测
  44. :param version: 模型版本
  45. :param features: 视频被 label_encoder 之后的features
  46. :return: data
  47. """
  48. match version:
  49. case "v1":
  50. result = self.model_v1(features)
  51. result = list(result)
  52. if result:
  53. obj = {
  54. "vision": "v1",
  55. "score": result[0],
  56. "benchmark": 0.06,
  57. "is_good_video": 1 if result[0] > 0.06 else 0
  58. }
  59. else:
  60. obj = {
  61. "vision": "v1",
  62. "score": None,
  63. "benchmark": 0.06,
  64. "is_good_video": 0
  65. }
  66. return {
  67. "code": 0,
  68. "message": "success",
  69. "data": obj
  70. }
  71. case "v2":
  72. result = self.model_v2.predict(features)
  73. result = list(result)
  74. if result:
  75. obj = {
  76. "vision": "v2",
  77. "score": result[0],
  78. "benchmark": 0.3,
  79. "is_good_video": 1 if result[0] > 0.3 else 0
  80. }
  81. else:
  82. obj = {
  83. "vision": "v2",
  84. "score": None,
  85. "benchmark": 0.3,
  86. "is_good_video": 0
  87. }
  88. return {
  89. "code": 0,
  90. "message": "success",
  91. "data": obj
  92. }
  93. async def process_label(self, params):
  94. """
  95. 处理类别 features 和 float features
  96. :param params: 接收到的参数
  97. :return: 转化好的类别特征的 dataframe
  98. """
  99. version = params['version']
  100. features = params['features']
  101. features = await self.title_to_tags(features)
  102. match version:
  103. case "v1":
  104. # 全部转化为类别
  105. str_column = [
  106. "channel",
  107. "type",
  108. "tag1",
  109. "tag2",
  110. "tag3"
  111. ]
  112. for key in str_column:
  113. features[key] = self.label_encoder.fit_transform(features[key])
  114. return version, features
  115. case "v2":
  116. float_column = ["out_play_cnt", "out_like_cnt", "out_share_cnt", "lop", "duration"]
  117. str_column = ["channel", "mode", "out_user_id", "tag1", "tag2", "tag3"]
  118. for key in float_column:
  119. features[key] = pd.to_numeric(features[key], errors="coerce")
  120. for key in str_column:
  121. features[key] = self.label_encoder.fit_transform(features[key])
  122. return version, features
  123. async def process(self, params):
  124. """
  125. 处理
  126. :param params:
  127. :return:
  128. """
  129. # check params
  130. v = params.get("version")
  131. if v == "v1":
  132. features = params.get("features")
  133. if len(features) != 3:
  134. return {
  135. "code": 1,
  136. "message": "参数错误,v1,features长度应该是 3,传参长度是{}".format(len(features)),
  137. "data": None
  138. }
  139. for feature in self.features_v1:
  140. if feature in features:
  141. continue
  142. else:
  143. return {
  144. "code": 1,
  145. "message": "参数错误, 缺少参数{}".format(feature),
  146. "data": None
  147. }
  148. if v == "v2":
  149. features = params.get("features")
  150. if len(features) != 9:
  151. return {
  152. "code": 1,
  153. "message": "参数错误,v2,features长度应该是 9,传参长度是{}".format(len(features)),
  154. "data": None
  155. }
  156. for feature in self.features_v2:
  157. if feature in features:
  158. continue
  159. else:
  160. return {
  161. "code": 1,
  162. "message": "参数错误, 缺少参数{}".format(feature),
  163. "data": None
  164. }
  165. else:
  166. return {
  167. "code": 1,
  168. "message": "参数错误,version 应该是 v1 or v2, 传参是{}".format(v),
  169. "data": None
  170. }
  171. try:
  172. version, features = await self.process_label(params)
  173. except Exception as e:
  174. return {
  175. "code": 2,
  176. "message": "系统错误,定位在 process_label, 报错内容是{}:".format(e),
  177. "data": None
  178. }
  179. try:
  180. res = await self.predict_score(version, features)
  181. return res
  182. except Exception as e:
  183. return {
  184. "code": 2,
  185. "message": "系统异常, 定位在 predict_score, 报错是{}:".format(e),
  186. "data": None
  187. }