# BertQuery.py
  1. from BertDemo import collection, text_to_vector, List, Doc
  2. import pandas as pd
# Cache dictionary for query results; queryCollection looks entries up using
# the query vector converted to a tuple as the key.
query_cache = {}
  5. def vector_to_tuple(vector):
  6. # 将列表转换为元组,以便作为字典的键使用
  7. return tuple(vector)
  8. def queryCollection(vector) -> List[Doc]:
  9. # 如果向量已经查询过,则直接返回结果
  10. vector_tuple = vector_to_tuple(vector)
  11. if vector_tuple in query_cache:
  12. return query_cache[vector_tuple]
  13. # 根据向量进行相似性检索 + 条件过滤
  14. ret = collection.query(
  15. vector=vector, # 向量检索,也可设置主键检索
  16. topk=10,
  17. # filter='playCount > 1000',
  18. include_vector=True
  19. )
  20. if ret is None or ret.code != 0:
  21. print('查询失败')
  22. return None
  23. # query_cache[vector_tuple] = ret.output
  24. return ret.output
  25. def calculate_rov(row):
  26. print(f"title={row['title']} ")
  27. try:
  28. vector = text_to_vector(row['title'])
  29. docs = queryCollection(vector)
  30. sumRov = 0
  31. for doc in docs:
  32. sumRov += doc.fields['rntHeadCount'] / doc.fields['exposureCount']
  33. rov = sumRov / len(docs)
  34. except:
  35. rov = 0
  36. print(f"预测ROV={rov}")
  37. print("=====================================")
  38. return rov
  39. # def calculate_return(row):
  40. # print(f"title={row['title']} ")
  41. # vector = text_to_vector(row['title'])
  42. # docs = queryCollection(vector)
  43. # sumHeadCount = 0
  44. # for doc in docs:
  45. # sumHeadCount += doc.fields['rntHeadCount']
  46. # try:
  47. # headCount = sumHeadCount / len(docs)
  48. # except:
  49. # headCount = 0
  50. # print(f"预测回流人数={headCount}")
  51. # print("=====================================")
  52. # return headCount
  53. def calculate_and_export(filename):
  54. # 读取表格数据
  55. df = pd.read_excel(filename)
  56. # 应用计算函数并创建新的列, 跳过曝光次数为0的数据
  57. df['实际ROV(回流人数/曝光次数)'] = df.apply(lambda row: row['回流人数'] /
  58. row['曝光次数'] if row['曝光次数'] != 0 else 0, axis=1)
  59. df['预测ROV'] = df.apply(calculate_rov, axis=1)
  60. # 将结果保存回表格
  61. df.to_excel('videos-result.xlsx', index=False)
  62. if __name__ == '__main__':
  63. calculate_and_export('20231115_flow_pool.xlsx')