|
@@ -1,13 +1,109 @@
|
|
|
+import oss2
|
|
|
import requests
|
|
|
import os
|
|
|
import json
|
|
|
import traceback
|
|
|
+import pandas as pd
|
|
|
+from odps import ODPS
|
|
|
from log import Log
|
|
|
from config import set_config
|
|
|
log_ = Log()
|
|
|
config_ = set_config()
|
|
|
|
|
|
|
|
|
def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
                       pool_maxsize=1000, pool_connections=1000):
    """Read one day's partition of an ODPS table.

    :param date: partition date, string formatted '%Y%m%d'
    :param project: ODPS project name, string
    :param table: table name, string
    :param connect_timeout: connection timeout for the ODPS client
    :param read_timeout: read timeout for the ODPS client
    :param pool_maxsize: max size of the client's connection pool
    :param pool_connections: number of pooled connections
    :return: records read from partition dt=<date> of the table
    """
    # Credentials and endpoint come from the module-level config.
    client = ODPS(
        access_id=config_.ODPS_CONFIG['ACCESSID'],
        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
        project=project,
        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
        connect_timeout=connect_timeout,
        read_timeout=read_timeout,
        pool_maxsize=pool_maxsize,
        pool_connections=pool_connections,
    )
    return client.read_table(name=table, partition=f'dt={date}')
|
|
|
+
|
|
|
+
|
|
|
def get_feature_data(project, table, dt, features):
    """Fetch the requested feature columns for one partition as a DataFrame.

    :param project: ODPS project name, string
    :param table: table name, string
    :param dt: partition date, string formatted '%Y%m%d'
    :param features: iterable of column names to extract from each record
    :return: pandas.DataFrame with one row per record and one column per feature
    """
    records = get_data_from_odps(date=dt, project=project, table=table)
    # Project each record down to just the requested feature columns;
    # a comprehension replaces the original append-in-a-loop construction.
    feature_data = [
        {feature_name: record[feature_name] for feature_name in features}
        for record in records
    ]
    return pd.DataFrame(feature_data)
|
|
|
+
|
|
|
+
|
|
|
def check_table_partition_exits(date, project, table, connect_timeout=3000, read_timeout=500000,
                                pool_maxsize=1000, pool_connections=1000):
    """Return True if partition dt=<date> exists in the given table.

    NOTE: the name says "exits" (sic) — kept for backward compatibility
    with existing callers.

    :param date: partition date, string formatted '%Y%m%d'
    :param project: ODPS project name, string
    :param table: table name, string
    :param connect_timeout: connection timeout for the ODPS client
    :param read_timeout: read timeout for the ODPS client
    :param pool_maxsize: max size of the client's connection pool
    :param pool_connections: number of pooled connections
    :return: bool — whether the partition exists
    """
    client = ODPS(
        access_id=config_.ODPS_CONFIG['ACCESSID'],
        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
        project=project,
        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
        connect_timeout=connect_timeout,
        read_timeout=read_timeout,
        pool_maxsize=pool_maxsize,
        pool_connections=pool_connections,
    )
    table_handle = client.get_table(name=table)
    return table_handle.exist_partition(partition_spec=f'dt={date}')
|
|
|
+
|
|
|
+
|
|
|
def data_check(project, table, dt):
    """Check whether the day's data is ready.

    :param project: ODPS project name, string
    :param table: table name, string
    :param dt: partition date, string formatted '%Y%m%d'
    :return: row count of partition dt in the table; 0 if the partition is
        missing or any error occurs (best-effort — never raises)
    """
    odps = ODPS(
        access_id=config_.ODPS_CONFIG['ACCESSID'],
        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
        project=project,
        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000
    )

    try:
        check_res = check_table_partition_exits(date=dt, project=project, table=table)
        if check_res:
            # NOTE(review): dt is interpolated unquoted — works if the dt
            # column/partition value is numeric-like; confirm against schema.
            sql = f'select * from {project}.{table} where dt = {dt}'
            with odps.execute_sql(sql=sql).open_reader() as reader:
                data_count = reader.count
        else:
            data_count = 0
    except Exception:
        # Keep the best-effort contract (return 0) but record the failure
        # instead of silently swallowing it, so readiness issues are visible.
        log_.error(traceback.format_exc())
        data_count = 0
    return data_count
|
|
|
+
|
|
|
+
|
|
|
def request_post(request_url, headers, request_data):
|
|
|
"""
|
|
|
post 请求 HTTP接口
|
|
@@ -76,6 +172,8 @@ def asr_validity_discrimination(text):
|
|
|
|
|
|
|
|
|
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
|
|
|
text = """现场和电视机前的观众朋友,大家晚上好。
|
|
|
这里是非常说明的访谈现场,
|