import BertEmbedding
import MilvusComponent
import pandas as pd


def queryCollection(vector):
    # Vector similarity search combined with a scalar filter condition
    results = MilvusComponent.query(
        vector=[vector],
        top_k=10,
        expr="id >= 15408599",
        output_fields=['title', 'preview_users', 'preview_times', 'view_users', 'view_times',
                       'play_users', 'play_times', 'share_users', 'share_times', 'return_users', 'return_times']
    )
    return results
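MilvusComponent here is a project-specific wrapper whose source is not shown. As a rough sketch of what that call maps to with the official pymilvus client (the connection endpoint, the collection name "videos", the vector field name "embedding", and the nprobe value are assumptions, not taken from the original code), it might look like this:

    from pymilvus import Collection, connections

    connections.connect(host="localhost", port="19530")   # assumed Milvus endpoint
    collection = Collection("videos")                      # assumed collection name
    collection.load()

    def search_similar(vector, top_k=10):
        # Inner-product ANN search plus the same scalar filter used above
        return collection.search(
            data=[vector],
            anns_field="embedding",                        # assumed vector field name
            param={"metric_type": "IP", "params": {"nprobe": 16}},
            limit=top_k,
            expr="id >= 15408599",
            output_fields=['title', 'view_times', 'share_times', 'return_users'],
        )

The hit.id, hit.entity.get(...) and hit.distance accessors used in the loop below come from the SearchResult objects that search returns, so the downstream code would stay unchanged.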
def calculate_and_export(filename):
    # Load the spreadsheet
    df = pd.read_excel(filename)
    # Iterate over every row of the DataFrame
    for index, row in df.iterrows():
        print(f"title={row['title']} ")
        # Actual ROV = returning users / view count
        actual_rov = row['return_cnt'] / row['view_cnt'] if row['view_cnt'] != 0 else 0
        df.at[index, '实际ROV(回流人数/曝光次数)'] = actual_rov  # actual ROV (returns / views)
        # Predicted ROV: look up the most similar historical videos by title embedding
        try:
            vector = BertEmbedding.text_to_vector(row['title'])
            results = queryCollection(vector)
            for i, hit in enumerate(results[0]):
                df.at[index, '相似视频ID' + str(i + 1)] = hit.id                        # similar video ID
                df.at[index, '相似标题' + str(i + 1)] = hit.entity.get('title')         # similar title
                df.at[index, '相似曝光' + str(i + 1)] = hit.entity.get('view_times')    # similar view count
                df.at[index, '相似分享' + str(i + 1)] = hit.entity.get('share_times')   # similar share count
                df.at[index, '相似回流' + str(i + 1)] = hit.entity.get('return_users')  # similar returning users
                df.at[index, '相似内积' + str(i + 1)] = hit.distance                    # inner-product similarity
        except Exception as e:
            print(e)
    print("Done=====================================")
    # Write the results back to a spreadsheet
    df.to_excel('videos-result.xlsx', index=False)
if __name__ == '__main__':
    calculate_and_export('20231124_flowpool.xlsx')
    # vector = BertEmbedding.text_to_vector('印度种姓制度有多可怕?看完这些你就知道了')
    # results = queryCollection(vector)
    # print(results)
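BertEmbedding is likewise a custom module that is not shown. A minimal stand-in, assuming a sentence-transformers encoder (the model name below is an assumption; any BERT-style encoder works as long as its output dimension matches the collection's vector field), could be:

    from sentence_transformers import SentenceTransformer

    # Assumed model; swap in whatever encoder the Milvus collection was built with.
    _model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    def text_to_vector(text):
        # encode() returns a numpy array; Milvus expects a plain list of floats
        return _model.encode(text).tolist()

Whatever encoder is used, it must be the same one that produced the vectors already stored in Milvus, otherwise the inner-product scores are meaningless. Note also that reading and writing .xlsx files with pandas requires openpyxl to be installed.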