# algorithm / RovOpt

							def tag_preprocessing(filename):
    #读取tag分词结果
    tag_txt = open("/root/ROVtrain/tfidfCompution/"+ filename +".txt","r")   #设置文件对象
    ftextlist = tag_txt.readlines() # 同上
    tag_txt.close() #关闭文件
    

    #转为corpus
    tagList = str(ftextlist).replace('[','').replace(']','').replace("'","").replace("'","").split(',')
    tag = np.array(tagList).reshape(len(tagList),1).tolist()
    
    
    #将词特征转为list形式
    train_tag_feature =  pd.DataFrame(df_new_feature).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:,'videotags']),1).tolist()
    test_tag_feature = pd.DataFrame(df_new_feature_test).loc[:,'videotags'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_test).loc[:,'videotags']),1).tolist()
    
    #稀疏特征
    mlb_model_tag = MultiLabelBinarizer(sparse_output=True).fit(tag)
    train_tag = mlb_model_tag.transform(train_tag_feature)
    test_tag = mlb_model_tag.transform(test_tag_feature)
    
    return mlb_model_tag.classes_,train_tag,test_tag


# In[25]:


# Read the TF-IDF values
def get_tag_tfidf(dt, tfidf_table_name):
    """Load one ``dt`` partition of a table into a dictionary.

    Args:
        dt: Value of the ``dt`` partition to read.
        tfidf_table_name: Name of the table to read.

    Returns:
        A dict mapping the first field of every record to its second field.
        NOTE(review): presumably tag -> TF-IDF weight; confirm against the
        table schema.
    """
    # NOTE(review): the account credentials are hard-coded in source. They
    # should be rotated and supplied through configuration instead.
    client = ODPS(
        'LTAI4FtW5ZzxMvdw35aNkmcp',
        '0VKnydcaHK3ITjylbgUsLubX6rwiwc',
        'videoods',
        endpoint='http://service.cn.maxcompute.aliyun.com/api',
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000,
    )
    rows = client.read_table(tfidf_table_name, partition='dt=%s' % dt)
    # Later records overwrite earlier ones that share the same first field.
    return {row[0]: row[1] for row in rows}


# In[26]:


def ttfidf_list_generation(tag_corpus, tag_dict):
    """Look up a weight for every tag, using 0 for tags without an entry.

    Args:
        tag_corpus: Iterable of tags, in the order the weights should appear.
        tag_dict: Mapping from tag to weight.

    Returns:
        A list with one entry per element of ``tag_corpus``: ``tag_dict[tag]``
        when the lookup succeeds, otherwise ``0``.
    """
    tag_tfidf_list = []
    for tag in tag_corpus:
        try:
            weight = tag_dict[tag]
        except (KeyError, TypeError):
            # KeyError: the tag has no entry. TypeError: the tag is unhashable
            # and therefore cannot be a key. Anything else (interrupts, real
            # lookup failures) is allowed to propagate instead of being
            # silently turned into a zero weight.
            weight = 0
        tag_tfidf_list.append(weight)
    return tag_tfidf_list