2 years ago · 287df6e638
--- a/20231115_flow_pool.xlsx
+++ b/20231115_flow_pool.xlsx
--- a/BertQuery.py
+++ b/BertQuery.py
@@ -2,7 +2,7 @@ from BertDemo import collection, text_to_vector, List, Doc
 
				 import pandas as pd
			
 
				 
			
 
				 # 创建一个缓存字典，用于存储查询结果
			
 
				-# query_cache = {}
			
 
				+query_cache = {}
			
 
				 
			
 
				 
			
 
				 def vector_to_tuple(vector):
			
@@ -14,8 +14,8 @@ def queryCollection(vector) -> List[Doc]:
 
				     # 如果向量已经查询过，则直接返回结果
			
 
				     vector_tuple = vector_to_tuple(vector)
			
 
				 
			
 
				-    # if vector_tuple in query_cache:
			
 
				-    #     return query_cache[vector_tuple]
			
 
				+    if vector_tuple in query_cache:
			
 
				+        return query_cache[vector_tuple]
			
 
				 
			
 
				     # 根据向量进行相似性检索 + 条件过滤
			
 
				     ret = collection.query(
			
@@ -31,52 +31,53 @@ def queryCollection(vector) -> List[Doc]:
 
				     return ret.output
			
 
				 
			
 
				 
			
 
				-def calculate_ros(row):
			
 
				+def calculate_rov(row):
			
 
				     print(f"title={row['title']} ")
			
 
				-    vector = text_to_vector(row['title'])
			
 
				-    docs = queryCollection(vector)
			
 
				-    sumRos = 0
			
 
				-    for doc in docs:
			
 
				-        sumRos += doc.fields['rntHeadCount'] / doc.fields['shareCount']
			
 
				-
			
 
				     try:
			
 
				-        ros = sumRos / len(docs)
			
 
				+        vector = text_to_vector(row['title'])
			
 
				+        docs = queryCollection(vector)
			
 
				+        sumRov = 0
			
 
				+        for doc in docs:
			
 
				+            sumRov += doc.fields['rntHeadCount'] / doc.fields['exposureCount']
			
 
				+        rov = sumRov / len(docs)
			
 
				     except:
			
 
				-        ros = 0
			
 
				+        rov = 0
			
 
				 
			
 
				-    print(f"预测ROS={ros}")
			
 
				+    print(f"预测ROV={rov}")
			
 
				     print("=====================================")
			
 
				-    return ros
			
 
				+    return rov
			
 
				 
			
 
				 
			
 
				-def calculate_return(row):
			
 
				-    print(f"title={row['title']} ")
			
 
				-    vector = text_to_vector(row['title'])
			
 
				-    docs = queryCollection(vector)
			
 
				-    sumHeadCount = 0
			
 
				-    for doc in docs:
			
 
				-        sumHeadCount += doc.fields['rntHeadCount']
			
 
				+# def calculate_return(row):
			
 
				+#     print(f"title={row['title']} ")
			
 
				+#     vector = text_to_vector(row['title'])
			
 
				+#     docs = queryCollection(vector)
			
 
				+#     sumHeadCount = 0
			
 
				+#     for doc in docs:
			
 
				+#         sumHeadCount += doc.fields['rntHeadCount']
			
 
				 
			
 
				-    try:
			
 
				-        headCount = sumHeadCount / len(docs)
			
 
				-    except:
			
 
				-        headCount = 0
			
 
				+#     try:
			
 
				+#         headCount = sumHeadCount / len(docs)
			
 
				+#     except:
			
 
				+#         headCount = 0
			
 
				 
			
 
				-    print(f"预测回流人数={headCount}")
			
 
				-    print("=====================================")
			
 
				-    return headCount
			
 
				+#     print(f"预测回流人数={headCount}")
			
 
				+#     print("=====================================")
			
 
				+#     return headCount
			
 
				 
			
 
				 
			
 
				 def calculate_and_export(filename):
			
 
				     # 读取表格数据
			
 
				     df = pd.read_excel(filename)
			
 
				 
			
 
				-    # 应用计算函数并创建新的列
			
 
				-    df['实际ROS(回流人数/分享次数)'] = df.apply(lambda row: row['回流人数'] /
			
 
				-                                      row['分享次数'], axis=1)
			
 
				-    df['预测ROS'] = df.apply(calculate_ros, axis=1)
			
 
				-    df['实际回流人数'] = df['回流人数']
			
 
				-    df['预测回流人数'] = df.apply(calculate_return, axis=1)
			
 
				+    # 应用计算函数并创建新的列, 跳过曝光次数为0的数据
			
 
				+    df['实际ROV(回流人数/曝光次数)'] = df.apply(lambda row: row['回流人数'] /
			
 
				+                                      row['曝光次数'] if row['曝光次数'] != 0 else 0, axis=1)
			
 
				+    df['预测ROV'] = df.apply(calculate_rov, axis=1)
			
 
				 
			
 
				     # 将结果保存回表格
			
 
				     df.to_excel('videos-result.xlsx', index=False)
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    calculate_and_export('20231115_flow_pool.xlsx')
			
--- a/videos-202309.xlsx
+++ b/videos-202309.xlsx
--- a/videos-result.xlsx
+++ b/videos-result.xlsx