|
@@ -214,26 +214,30 @@ class SpiderProcess(object):
|
|
|
data_list = self.client_spider.select(select_sql)
|
|
|
df = []
|
|
|
for line in tqdm(data_list):
|
|
|
- temp = list(line)
|
|
|
- video_id = line[0]
|
|
|
- title = line[1]
|
|
|
- lop, duration = self.spider_lop(video_id)
|
|
|
- title_tags = list(jieba.analyse.textrank(title, topK=3))
|
|
|
- temp.append(lop)
|
|
|
- temp.append(duration)
|
|
|
- if title_tags:
|
|
|
- for i in range(3):
|
|
|
- try:
|
|
|
- temp.append(title_tags[i])
|
|
|
- except:
|
|
|
- temp.append(None)
|
|
|
- else:
|
|
|
- temp.append(None)
|
|
|
- temp.append(None)
|
|
|
- temp.append(None)
|
|
|
+ try:
|
|
|
+ temp = list(line)
|
|
|
+ video_id = line[0]
|
|
|
+ title = line[1]
|
|
|
+ lop, duration = self.spider_lop(video_id)
|
|
|
+ title_tags = list(jieba.analyse.textrank(title, topK=3))
|
|
|
+ temp.append(lop)
|
|
|
+ temp.append(duration)
|
|
|
+ if title_tags:
|
|
|
+ for i in range(3):
|
|
|
+ try:
|
|
|
+ temp.append(title_tags[i])
|
|
|
+ except:
|
|
|
+ temp.append(None)
|
|
|
+ else:
|
|
|
+ temp.append(None)
|
|
|
+ temp.append(None)
|
|
|
+ temp.append(None)
|
|
|
|
|
|
- df.append(temp[1:])
|
|
|
- df = pd.DataFrame(df, columns=['title', 'channel', 'out_user_id', 'mode', 'out_play_cnt', 'out_like_cnt', 'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
|
|
|
+ df.append(temp[1:])
|
|
|
+ except:
|
|
|
+ continue
|
|
|
+ df = pd.DataFrame(df, columns=['title', 'channel', 'out_user_id', 'mode', 'out_play_cnt', 'out_like_cnt',
|
|
|
+ 'out_share_cnt', 'lop', 'duration', 'tag1', 'tag2', 'tag3'])
|
|
|
df.to_excel("data/train_data/spider_data_{}.xlsx".format(datetime.datetime.today().strftime("y%m%d")))
|
|
|
|
|
|
|