sunxy 1 year ago
Parent
Current commit
7906ee7dd0
6 changed files with 197 additions and 1 deletion
  1. 112 0
      BertDemo.py
  2. 3 0
      BertInsert.py
  3. 80 0
      BertQuery.py
  4. 1 0
      ODPSUtil.py
  5. 1 1
      README.md
  6. Binary
      videos-202309.xlsx

+ 112 - 0
BertDemo.py

@@ -0,0 +1,112 @@
+import time
+from odps import ODPS
+from transformers import BertModel, BertTokenizer
+import dashvector
+from dashvector import Doc
+from typing import List
+
+# Aliyun DashVector (vector database) connection
+client = dashvector.Client(
+    api_key='sk-TbWSOiwIcp9FZkx0fyM9JRomTxmOtD796E4626C1411EEB3525A6F9FFB919B')
+# Index collection
+collection = client.get('video_title_performance_01')
+assert collection
+
+
+# Aliyun ODPS (MaxCompute) connection
+access_id = 'LTAIWYUujJAm7CbH'
+access_key = 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P'
+endpoint = 'http://service.cn.maxcompute.aliyun.com/api'
+project_name = 'loghubods'
+
+odps = ODPS(
+    access_id=access_id,
+    secret_access_key=access_key,
+    project=project_name,
+    endpoint=endpoint
+)
+
+# Load the pretrained BERT model and its tokenizer
+model_name = 'bert-base-chinese'
+model = BertModel.from_pretrained(model_name)
+tokenizer = BertTokenizer.from_pretrained(model_name)
+
+
+def insert_vector(docs: List[Doc]):
+    if len(docs) == 0:
+        return
+    # Batch insert via dashvector.Doc objects
+    resp = collection.insert(docs=docs)
+    print(resp)
+
+
+def text_to_vector(text) -> List[float]:
+    # Tokenize the text into the inputs the model expects (a single text, so no batching is needed)
+    inputs = tokenizer(text, return_tensors='pt')
+
+    # Run the inputs through the BERT model
+    outputs = model(**inputs)
+
+    # Extract the embeddings
+    embeddings = outputs.last_hidden_state  # hidden states of the last layer
+
+    # Take the [CLS] token embedding of the (single) sequence as a Python list
+    embeddings = embeddings.detach().numpy().tolist()[0][0]
+
+    return embeddings
+
+
+# Query video title performance from ODPS (Aliyun MaxCompute)
+def query_video_title_performance(start_idx, limit):
+    sql = f"SELECT * FROM video_perfermance_info_3 WHERE title is not NULL AND title != '' ORDER BY videoid LIMIT {start_idx}, {limit};"
+    result = []
+    with odps.execute_sql(sql).open_reader() as reader:
+        for record in reader:
+            # Collect the query results
+            result.append(record)
+    return result
+
+# Assemble title performance records into vector-database Doc objects
+
+
+def video_title_performance_to_vector(startIdx, limit) -> List[Doc]:
+    records = query_video_title_performance(startIdx, limit)
+    docs = []
+    for record in records:
+        # Field values from the record
+        videoid = str(record.videoid)
+        title = record.title
+        if title is None:
+            continue
+        rntCount = record.回流次数
+        rntHeadCount = record.回流人数
+        shareCount = record.分享次数
+        shareHeadCount = record.分享人数
+        exposureCount = record.曝光次数
+        exposureHeadCount = record.曝光人数
+        playCount = record.播放次数
+        playHeadCount = record.播放人数
+
+        # Convert the title text to a vector
+        vector = text_to_vector(title)
+        # Combine the vector and the performance fields into a Doc
+        doc = Doc(id=videoid, vector=vector, fields={
+            'title': title, 'rntCount': rntCount, 'rntHeadCount': rntHeadCount,
+            'shareCount': shareCount, 'shareHeadCount': shareHeadCount,
+            'exposureCount': exposureCount, 'exposureHeadCount': exposureHeadCount,
+            'playCount': playCount, 'playHeadCount': playHeadCount
+        })
+        docs.append(doc)
+    return docs
+
+
+def batchInsert():
+    for i in range(100000, 185000, 500):
+        print(i)
+        # Time each batch
+        start = time.time()
+        docs = video_title_performance_to_vector(i, 500)
+        insert_vector(docs)
+        end = time.time()
+        print(f'{i} done in {end - start}s')
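
To sanity-check the embedding step, a minimal sketch (it assumes BertDemo.py is importable, i.e. the model download and the DashVector/ODPS connections at module scope succeed; the sample title is hypothetical). text_to_vector returns the [CLS] token embedding, which is 768-dimensional for bert-base-chinese:

    from BertDemo import text_to_vector

    vec = text_to_vector('测试视频标题')  # hypothetical sample title
    assert len(vec) == 768  # hidden size of bert-base-chinese
    print(vec[:5])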

+ 3 - 0
BertInsert.py

@@ -0,0 +1,3 @@
+import BertDemo
+
+BertDemo.batchInsert()
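
Since importing BertDemo runs the DashVector and ODPS setup at module scope, a __main__ guard keeps the batch job from firing on an incidental import; a minimal variant of the same script:

    import BertDemo

    if __name__ == '__main__':
        BertDemo.batchInsert()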

+ 80 - 0
BertQuery.py

@@ -0,0 +1,80 @@
+from typing import List
+
+from dashvector import Doc
+from BertDemo import collection, text_to_vector
+import pandas as pd
+
+# Cache dict for storing query results
+query_cache = {}
+
+
+def vector_to_tuple(vector):
+    # Convert the list to a tuple so it can serve as a dict key
+    return tuple(vector)
+
+
+def queryCollection(vector) -> List[Doc]:
+    # Return the cached result if this vector has been queried before
+    vector_tuple = vector_to_tuple(vector)
+
+    if vector_tuple in query_cache:
+        return query_cache[vector_tuple]
+
+    # Similarity search by vector, optionally with attribute filtering
+    ret = collection.query(
+        vector=vector,   # search by vector; lookup by primary key is also possible
+        topk=10,
+        # filter='playCount > 1000',
+        include_vector=True
+    )
+    if ret is None or ret.code != 0:
+        print('Query failed')
+        return None
+    query_cache[vector_tuple] = ret.output
+    return ret.output
+
+
+def calculate_ros(row):
+    print(f"title={row['title']} ")
+    vector = text_to_vector(row['title'])
+    docs = queryCollection(vector)
+    if not docs:  # query failed or returned nothing
+        return 0
+    sumRos = 0
+    for doc in docs:
+        shareCount = doc.fields['shareCount']
+        if shareCount:  # skip neighbors that would divide by zero
+            sumRos += doc.fields['rntHeadCount'] / shareCount
+
+    ros = sumRos / len(docs)
+
+    print(f"Predicted ROS={ros}")
+    print("=====================================")
+    return ros
+
+
+def calculate_return(row):
+    print(f"title={row['title']} ")
+    vector = text_to_vector(row['title'])
+    docs = queryCollection(vector)
+    if not docs:  # query failed or returned nothing
+        return 0
+    sumHeadCount = 0
+    for doc in docs:
+        sumHeadCount += doc.fields['rntHeadCount']
+
+    headCount = sumHeadCount / len(docs)
+
+    print(f"Predicted return-user count={headCount}")
+    print("=====================================")
+    return headCount
+
+
+# Read the spreadsheet
+df = pd.read_excel('videos-202309.xlsx')
+
+# Apply the prediction functions and add the new columns
+df['实际ROS(回流人数/分享次数)'] = df.apply(lambda row: row['回流人数'] / row['分享次数'], axis=1)
+df['预测ROS'] = df.apply(calculate_ros, axis=1)
+df['实际回流人数'] = df['回流人数']
+df['预测回流人数'] = df.apply(calculate_return, axis=1)
+
+# Save the results back to a spreadsheet
+df.to_excel('videos-result.xlsx', index=False)
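
The same retrieval step can be exercised on a single candidate title without the spreadsheet; a minimal sketch (the title string is hypothetical, and it assumes the DashVector collection from BertDemo is reachable):

    from BertDemo import collection, text_to_vector

    vector = text_to_vector('示例标题')  # hypothetical candidate title
    ret = collection.query(vector=vector, topk=10, include_vector=True)
    if ret is not None and ret.code == 0:
        for doc in ret.output:
            print(doc.id, doc.fields['title'])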

+ 1 - 0
ODPSUtil.py

@@ -0,0 +1 @@
+

+ 1 - 1
README.md

@@ -1,3 +1,3 @@
 # aigc-content-vector
 
-aigc内容向量话
+aigc 内容向量化

Binary
videos-202309.xlsx