Browse Source

generate label for mysql

罗俊辉 1 year ago
parent
commit
f36a0e55ab
1 changed file with 23 additions and 19 deletions
  1. 23 19
      process_data.py

+ 23 - 19
process_data.py

@@ -214,26 +214,30 @@ class SpiderProcess(object):
         data_list = self.client_spider.select(select_sql)
         df = []
         for line in tqdm(data_list):
-            temp = list(line)
-            video_id = line[0]
-            title = line[1]
-            lop, duration = self.spider_lop(video_id)
-            title_tags = list(jieba.analyse.textrank(title, topK=3))
-            temp.append(lop)
-            temp.append(duration)
-            if title_tags:
-                for i in range(3):
-                    try:
-                        temp.append(title_tags[i])
-                    except:
-                        temp.append(None)
-            else:
-                temp.append(None)
-                temp.append(None)
-                temp.append(None)
+            try:
+                temp = list(line)
+                video_id = line[0]
+                title = line[1]
+                lop, duration = self.spider_lop(video_id)
+                title_tags = list(jieba.analyse.textrank(title, topK=3))
+                temp.append(lop)
+                temp.append(duration)
+                if title_tags:
+                    for i in range(3):
+                        try:
+                            temp.append(title_tags[i])
+                        except:
+                            temp.append(None)
+                else:
+                    temp.append(None)
+                    temp.append(None)
+                    temp.append(None)
 
-            df.append(temp[1:])
-        df = pd.DataFrame(df, columns=['title', 'channel', 'out_user_id', 'mode', 'out_play_cnt', 'out_like_cnt', 'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
+                df.append(temp[1:])
+            except:
+                continue
+        df = pd.DataFrame(df, columns=['title', 'channel', 'out_user_id', 'mode', 'out_play_cnt', 'out_like_cnt',
+                                       'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
         df.to_excel("data/train_data/spider_data_{}.xlsx".format(datetime.datetime.today().strftime("y%m%d")))