@@ -4,10 +4,11 @@ from typing import Dict, List
 
 import pandas as pd
 
-from client import YarnClient
-from util import dateutil, feishu_inform_util
+from client import YarnClient, ODPSClient
+from util import date_util, feishu_inform_util
 
-yarn_client = YarnClient.YarnClient("192.168.203.16")
+yarn_client = YarnClient.YarnClient("121.40.173.140")
+odps_client = ODPSClient.ODPSClient()
 
 table_list = [
     "alg_mid_feature_sharecf",
@@ -52,38 +53,40 @@ table_list = [
 filter_date = datetime(2024, 1, 1)
 
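+# Output column order for df_print; the Chinese headers come from the ODPS partition info merged in _analyse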
+columns = ["id", "表名", "startedTime", "launchTime", "finishedTime", "state", "elapsedTime", "分区", "数据量",
+           "数据大小", "创建时间", "更新时间"]
+
 
 def df_print(result):
     df = pd.DataFrame(result)
-    # Filter to rows whose name contains 'cid'
-    filtered_df = df[df['name'].str.contains('cid')].copy()  # .copy() avoids a SettingWithCopyWarning
-    filtered_df.loc[:, 'name'] = filtered_df['name'].str.replace('odps sync to redis : ', '', regex=False)
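+    # Sort rows by start time before formatting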
+    sorted_df = df.sort_values(by="startedTime")
 
-    sorted_df = filtered_df.sort_values(by="startedTime")
+    sorted_df = sorted_df[columns]
 
     # Header row
     header = ' | '.join(sorted_df.columns)
 
+    # Data rows
     def format_row(row):
         return ' | '.join([str(row[col]) for col in sorted_df.columns])
 
-    # Data rows
-    rows = filtered_df.apply(format_row, axis=1).tolist()
+    rows = sorted_df.apply(format_row, axis=1).tolist()
 
     # Print the output
     print(header)
-    print('-' * len(header))
     print('\n'.join(rows))
 
 
 def handle_table(table_name: str, spark_task_list: List[Dict]) -> (bool, str):
     filtered_data = [
         item for item in spark_task_list
-        if table_name in item['name'] and dateutil.str_cover_date(item['startedTime']) > filter_date
+        if table_name in item['name'] and date_util.str_cover_date(item['startedTime']) > filter_date
     ]
     if filtered_data:
         latest_started_time = max(
-            [dateutil.str_cover_date(item['startedTime']) for item in filtered_data])
+            [date_util.str_cover_date(item['startedTime']) for item in filtered_data])
         print(f"Table: {table_name}, last synced at: {latest_started_time}")
         now = datetime.now()
         time_difference = now - latest_started_time
 
@@ -153,5 +154,39 @@ def _main():
         send_error_info(table_name, latest_started_time, webhook_url)
 
 
+def _analyse():
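+    # Query apps started within the last 14 hours (YARN timestamps are epoch milliseconds)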
+    hours_14_early = int((datetime.now() - timedelta(hours=14)).timestamp()) * 1000
+    result = yarn_client.get_apps(started_time_begin=hours_14_early)
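+    # Keep only running "alg" sync jobs and pull the table name out of the app name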
+    result = [
+        {
+            **{k: v for k, v in item.items() if k != 'name'},
+            'table_name': item['name'].split(":")[1].strip()
+        }
+        for item in result
+        if "alg" in item['name'] and item['state'] == 'RUNNING'
+    ]
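+    # Fetch partition metadata once per distinct table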
+    partition_info = {}
+    for table_name in list({item['table_name'] for item in result}):
+        resp = odps_client.get_all_partition_info(table_name=table_name)
+        partition_info[table_name] = {item['分区']: item for item in resp}
+
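+    # Join each app with the ODPS partition matching its start hour; apps with no match are dropped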
+    spark_task_list = []
+    for item in result:
+        dt_hh = date_util.date_convert_dt_hh(item['startedTime'])
+        if item['table_name'] in partition_info and dt_hh in partition_info[item['table_name']]:
+            item = {
+                **item,
+                **partition_info[item['table_name']][dt_hh]
+            }
+            spark_task_list.append(item)
+
+    df_print(spark_task_list)
+
+
 if __name__ == '__main__':
-    _main()
+    # _main()
+    _analyse()
|