import os
import time
from typing import List

import dashvector
import pandas as pd
import torch
from dashvector import Doc
from odps import ODPS
from pandas import DataFrame
from transformers import BertModel, BertTokenizer
import os

# DashVector client and target collection.
# SECURITY: the original source embedded a live DashVector API key and
# Alibaba Cloud AccessKey id/secret directly in the file. They are now
# read from the environment; the leaked credentials must be rotated.
client = dashvector.Client(
    api_key=os.environ['DASHVECTOR_API_KEY'])
collection = client.get('video_title_performance_01')
# Fail fast if the collection is missing (assert is stripped under -O,
# so use an explicit check).
if collection is None:
    raise RuntimeError("DashVector collection 'video_title_performance_01' not found")

# MaxCompute (ODPS) connection for reading the source table.
access_id = os.environ['ODPS_ACCESS_ID']
access_key = os.environ['ODPS_ACCESS_KEY']
endpoint = 'http://service.cn.maxcompute.aliyun.com/api'
project_name = 'loghubods'
odps = ODPS(
    access_id=access_id,
    secret_access_key=access_key,
    project=project_name,
    endpoint=endpoint
)

# Chinese BERT checkpoint used to embed video titles.
model_name = 'bert-base-chinese'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
def insert_vector(docs: List[Doc]):
    """Insert *docs* into the DashVector collection.

    A no-op for an empty list (avoids a pointless round trip).
    Prints the insert response for ad-hoc monitoring.
    """
    if not docs:
        return

    resp = collection.insert(docs=docs)
    print(resp)
def text_to_vector(text) -> List[float]:
    """Embed *text* with BERT and return the [CLS] token vector.

    Args:
        text: title string to embed.

    Returns:
        The hidden-state vector of the first ([CLS]) token as a plain
        list of floats (length = BERT hidden size).
    """
    inputs = tokenizer(text, return_tensors='pt')

    # Inference only: no_grad skips autograd graph construction,
    # reducing memory use and speeding up the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # last_hidden_state is (batch, seq_len, hidden); take the first
    # token of the single batch item as the sentence embedding.
    return outputs.last_hidden_state[0][0].numpy().tolist()
def query_video_title_perfermance(start_idx, limit):
    """Fetch one page of video title/performance rows from MaxCompute.

    Rows with NULL or empty titles are filtered out server-side; the
    page is addressed by (offset, count) over a videoid ordering.
    """
    sql = f"SELECT * FROM video_perfermance_info_3 WHERE title is not NULL AND title != '' ORDER BY videoid LIMIT {start_idx}, {limit};"
    with odps.execute_sql(sql).open_reader() as reader:
        return list(reader)
def video_title_perfermance_to_vector(startIdx, limit) -> List[Doc]:
    """Build DashVector Docs for one page of rows.

    Each Doc carries the BERT embedding of the video title as its
    vector, plus the engagement counters as scalar fields.
    """
    docs = []
    for rec in query_video_title_perfermance(startIdx, limit):
        title = rec.title
        if title is None:
            # Defensive: the SQL already excludes NULL titles.
            continue

        # Engagement stats; record columns are named in Chinese
        # (回流 = return visits, 分享 = shares, 曝光 = impressions,
        # 播放 = plays; 次数 = count, 人数 = unique users).
        fields = {
            'title': title,
            'rntCount': rec.回流次数,
            'rntHeadCount': rec.回流人数,
            'shareCount': rec.分享次数,
            'shareHeadCount': rec.分享人数,
            'exposureCount': rec.曝光次数,
            'exposureHeadCount': rec.曝光人数,
            'playCount': rec.播放次数,
            'playHeadCount': rec.播放人数,
        }
        docs.append(
            Doc(id=str(rec.videoid), vector=text_to_vector(title), fields=fields))
    return docs
def batchInsert(start=100000, end=185000, batch_size=500):
    """Embed and insert video-title rows into DashVector in batches.

    Args:
        start: first row offset (defaults match the original
            hard-coded run).
        end: stop offset (exclusive).
        batch_size: rows fetched, embedded, and inserted per iteration.

    Prints per-batch wall-clock timing for progress monitoring.
    """
    for offset in range(start, end, batch_size):
        print(offset)

        t0 = time.time()
        docs = video_title_perfermance_to_vector(offset, batch_size)
        insert_vector(docs)
        t1 = time.time()
        print(f'{offset} done in {t1 - t0}s')