Browse Source

上传读取标题功能

罗俊辉 1 year ago
parent
commit
16b99b85f3
1 changed file with 10 additions and 7 deletions
  1. 10 7
      process_data.py

+ 10 - 7
process_data.py

@@ -47,13 +47,17 @@ class DataProcessor(object):
             "out_collection_cnt",
         ]
         item_features = [item[i] for i in userful_features]
-        keywords_textrank, keywords_tf = self.title_processor(video_id)
-        if keywords_tf and keywords_textrank:
-            item_features.append(",".join(keywords_textrank))
-            item_features.append(",".join(keywords_tf))
+        keywords_textrank = self.title_processor(video_id)
+        if keywords_textrank:
+            for i in range(3):
+                try:
+                    item_features.append(keywords_textrank[i])
+                except:
+                    item_features.append(None)
         else:
             item_features.append(None)
             item_features.append(None)
+            item_features.append(None)
         label_dt = generate_label_date(dt)
         label_obj = y_ori_data.get(label_dt, {}).get(video_id)
         if label_obj:
@@ -72,11 +76,10 @@ class DataProcessor(object):
         try:
             title = self.client.select(sql)
             keywords_textrank = jieba.analyse.textrank(title, topK=3)
-            keywords_tfidf = jieba.analyse.extract_tags(title, topK=3)
-            return list(keywords_textrank), list(keywords_tfidf)
+            return list(keywords_textrank)
         except Exception as e:
             print(video_id, "\t", e)
-            return [], []
+            return []
 
     def producer(self):
         """