sunxy hace 1 año
padre
commit
287df6e638
Se han modificado 4 ficheros con 35 adiciones y 34 borrados
  1. BIN
      20231115_flow_pool.xlsx
  2. +35 -34
      BertQuery.py
  3. BIN
      videos-202309.xlsx
  4. BIN
      videos-result.xlsx

BIN
20231115_flow_pool.xlsx


+ 35 - 34
BertQuery.py

@@ -2,7 +2,7 @@ from BertDemo import collection, text_to_vector, List, Doc
 import pandas as pd
 
 # 创建一个缓存字典,用于存储查询结果
-# query_cache = {}
+query_cache = {}
 
 
 def vector_to_tuple(vector):
@@ -14,8 +14,8 @@ def queryCollection(vector) -> List[Doc]:
     # 如果向量已经查询过,则直接返回结果
     vector_tuple = vector_to_tuple(vector)
 
-    # if vector_tuple in query_cache:
-    #     return query_cache[vector_tuple]
+    if vector_tuple in query_cache:
+        return query_cache[vector_tuple]
 
     # 根据向量进行相似性检索 + 条件过滤
     ret = collection.query(
@@ -31,52 +31,53 @@ def queryCollection(vector) -> List[Doc]:
     return ret.output
 
 
-def calculate_ros(row):
+def calculate_rov(row):
     print(f"title={row['title']} ")
-    vector = text_to_vector(row['title'])
-    docs = queryCollection(vector)
-    sumRos = 0
-    for doc in docs:
-        sumRos += doc.fields['rntHeadCount'] / doc.fields['shareCount']
-
     try:
-        ros = sumRos / len(docs)
+        vector = text_to_vector(row['title'])
+        docs = queryCollection(vector)
+        sumRov = 0
+        for doc in docs:
+            sumRov += doc.fields['rntHeadCount'] / doc.fields['exposureCount']
+        rov = sumRov / len(docs)
     except:
-        ros = 0
+        rov = 0
 
-    print(f"预测ROS={ros}")
+    print(f"预测ROV={rov}")
     print("=====================================")
-    return ros
+    return rov
 
 
-def calculate_return(row):
-    print(f"title={row['title']} ")
-    vector = text_to_vector(row['title'])
-    docs = queryCollection(vector)
-    sumHeadCount = 0
-    for doc in docs:
-        sumHeadCount += doc.fields['rntHeadCount']
+# def calculate_return(row):
+#     print(f"title={row['title']} ")
+#     vector = text_to_vector(row['title'])
+#     docs = queryCollection(vector)
+#     sumHeadCount = 0
+#     for doc in docs:
+#         sumHeadCount += doc.fields['rntHeadCount']
 
-    try:
-        headCount = sumHeadCount / len(docs)
-    except:
-        headCount = 0
+#     try:
+#         headCount = sumHeadCount / len(docs)
+#     except:
+#         headCount = 0
 
-    print(f"预测回流人数={headCount}")
-    print("=====================================")
-    return headCount
+#     print(f"预测回流人数={headCount}")
+#     print("=====================================")
+#     return headCount
 
 
 def calculate_and_export(filename):
     # 读取表格数据
     df = pd.read_excel(filename)
 
-    # 应用计算函数并创建新的列
-    df['实际ROS(回流人数/分享次数)'] = df.apply(lambda row: row['回流人数'] /
-                                      row['分享次数'], axis=1)
-    df['预测ROS'] = df.apply(calculate_ros, axis=1)
-    df['实际回流人数'] = df['回流人数']
-    df['预测回流人数'] = df.apply(calculate_return, axis=1)
+    # 应用计算函数并创建新的列, 跳过曝光次数为0的数据
+    df['实际ROV(回流人数/曝光次数)'] = df.apply(lambda row: row['回流人数'] /
+                                      row['曝光次数'] if row['曝光次数'] != 0 else 0, axis=1)
+    df['预测ROV'] = df.apply(calculate_rov, axis=1)
 
     # 将结果保存回表格
     df.to_excel('videos-result.xlsx', index=False)
+
+
+if __name__ == '__main__':
+    calculate_and_export('20231115_flow_pool.xlsx')

BIN
videos-202309.xlsx


BIN
videos-result.xlsx