liqian 2 年之前
父節點
當前提交
1db7c44b32
共有 1 個文件被更改,包括 74 次插入和 92 次删除
  1. 74 92
      rov_server_update.py

+ 74 - 92
rov_server_update.py

@@ -13,53 +13,7 @@ from concurrent.futures import ThreadPoolExecutor
 
 health_instances = []
 ess_instances = []
-
-# slb_id = 'lb-bp1werfophtsjzfr76njm'
-# # 修改负载均衡权限
-# slb_client_params = {
-#     'access_key_id': 'LTAIuPbTPL3LDDKN',
-#     'access_key_secret': 'ORcNedKwWuwVtcq4IRFtUDZgS0b1le',
-#     'region_id': 'cn-hangzhou'
-# }
-# # 购买机器权限
-# create_client_params = {
-#     'access_key_id': 'LTAI4GBWbFvvXoXsSVBe1o9f',
-#     'access_key_secret': 'kRAikWitb4kDxaAyBqNrmLmllMEDO3',
-#     'region_id': 'cn-hangzhou'
-# }
-#
-# # 机器配置
-# instance_config = {
-#     # 使用的镜像信息
-#     'image_id': 'm-bp1e5jx8eqhq22l91xw7',
-#     # 设置实例规格
-#     'instance_type': 'ecs.ic5.xlarge',
-#     # 选择的交换机
-#     'vswitch_id': 'vsw-bp19lpjwtc6j0p0m9mdc2',
-#     # 当前VPC类型的安全组
-#     'security_group_id': 'sg-bp1irhrkr4vfj272hk4y',
-#     # 硬盘的大小,单位:G
-#     'disk_size': '200',
-#     # 服务器命名
-#     'instance_name': 'ESS-rov-server-[1,2]',
-#     # 服务器所在区域
-#     'zone_id': 'cn-hangzhou-h',
-#     # 磁盘类型:云盘
-#     'disk_category': 'cloud_efficiency',
-#     # 密钥
-#     'key_pair_name': 'stuuudy'
-# }
-#
-# # 服务启动脚本
-# start_sh_dir = os.path.dirname(os.path.realpath(__file__))
-# start_sh_filename = 'rov_server_start.sh'
-# with open(file=os.path.join(start_sh_dir, start_sh_filename), mode='r', encoding='utf-8') as rf:
-#     file_content = rf.read()
-# start_sh = {
-#     'target_dir': '/home/piaoquan_server_sh',
-#     'name': start_sh_filename,
-#     'content': file_content,
-# }
+remove_container_instances = []
 
 
 def server_health_check(client, instance_id):
@@ -140,13 +94,14 @@ async def run_server(create_client, slb_client, instance_ids, max_workers):
         sys.exit()
 
 
-async def ess_instance(create_client, slb_client, ess_count, max_workers):
+async def ess_instance(create_client, slb_client, ess_count, max_workers, version):
     """
     扩容机器并运行新服务
     :param create_client: 购买机器客户端连接
     :param slb_client: 修改负载均衡权限
     :param ess_count: 扩容数量
     :param max_workers: 线程数
+    :param version: 版本标记
     :return:
     """
     # 1. 购买机器并启动
@@ -162,7 +117,7 @@ async def ess_instance(create_client, slb_client, ess_count, max_workers):
     logging.info(f"send start shell file finished, instances: {ess_instance_ids}")
     # 3. 启动服务
     server_start_sh = os.path.join(rov_server_config.start_sh['target_dir'], rov_server_config.start_sh['name'])
-    server_start_commend = f"sh {server_start_sh}"
+    server_start_commend = f"sh {server_start_sh} {version}"
     utils.run_command(client=create_client, instance_ids=ess_instance_ids, command=server_start_commend)
     # 4. 异步探活
     global health_instances
@@ -179,13 +134,15 @@ async def ess_instance(create_client, slb_client, ess_count, max_workers):
     if len(health_instances) == len(ess_instance_ids):
         # 所有机器探活成功
         time.sleep(60)
+        utils.add_backend_servers(client=slb_client, slb_id=rov_server_config.slb_id, instances=health_instances)
+        health_instance_ids = [instance_id for instance_id, _ in health_instances]
         add_weight_list = [(10, 30), (20, 20), (40, 10), (60, 10), (80, 10), (100, 10)]
-        # set_instance_weight_process(client=slb_client, instance_id_list=ess_instance_ids, weight_list=add_weight_list)
+        # set_instance_weight_process(client=slb_client, instance_id_list=health_instance_ids, weight_list=add_weight_list)
         global ess_instances
-        ess_instances.extend(ess_instance_ids)
+        ess_instances.extend(health_instance_ids)
         logging.info(f"ess count: {ess_count}, "
                      f"create count: {len(ess_instance_ids)}, "
-                     f"finished count: {len(health_instances)}")
+                     f"finished count: {len(health_instance_ids)}")
     else:
         logging.info(f"ess count: {ess_count}, "
                      f"create count: {len(ess_instance_ids)}, "
@@ -207,44 +164,55 @@ def remove_container_image(client, instance_id, container_name):
     # 移除旧的容器
     container_remove_retry = 3
     i = 0
-    while i < container_remove_retry:
+    while True:
+        if i >= container_remove_retry:
+            logging.error(f"容器不存在或者无法删除当前容器, instance = {instance_id}/{ip_address}")
+            sys.exit()
         try:
             container_id = client.containers.get(container_name)
             container_id.remove(force=True)
             break
         except Exception as e:
             i += 1
-            print("容器不存在或者无法删除当前容器")
+
     # 删除旧镜像
     images_remove_retry = 3
     j = 0
-    while j < images_remove_retry:
+    while True:
+        if j >= images_remove_retry:
+            logging.error(f"镜像不存在,无法获取到镜像ID, instance = {instance_id}/{ip_address}")
+            sys.exit()
         try:
             images = client.images.list()
             for image in images:
                 client.images.remove(force=True, image=image.tags[0])
                 time.sleep(2)
+            global remove_container_instances
+            remove_container_instances.append(instance_id)
         except Exception as e:
             i += 1
-            print("镜像不存在,无法获取到镜像ID")
 
 
-async def update_instance(create_client, slb_client, instance_ids, max_workers):
+async def update_instance(create_client, slb_client, instance_ids, max_workers, version):
     """
     线上机器更新
     :param create_client:
     :param slb_client: slb客户端连接
     :param instance_ids: instanceId type-list
     :param max_workers:
+    :param version: 版本标记
     :return:
     """
     media_index = len(instance_ids)//2
     instance_ids_group = [instance_ids[:media_index], instance_ids[media_index:]]
+    update_finished_count = 0
     for instance_id_list in instance_ids_group:
         # 1. 摘流量
         set_instance_weight_process(client=slb_client, instance_id_list=instance_id_list, weight_list=[(0, 60)])
         logging.info(f"set weight with 0 finished, instances: {instance_id_list}")
         # 2. 异步移除旧容器并删除旧镜像
+        global remove_container_instances
+        remove_container_instances = []
         container_name = 'rov-server'
         loop = asyncio.get_running_loop()
         executor = ThreadPoolExecutor(max_workers=max_workers)
@@ -253,13 +221,18 @@ async def update_instance(create_client, slb_client, instance_ids, max_workers):
             [(slb_client, instance_id, container_name) for instance_id in instance_id_list]
         ]
         await asyncio.wait(tasks)
-        logging.info(f"remove container & images finished, instances: {instance_id_list}")
+        logging.info(f"remove container & images finished, instances: {remove_container_instances},"
+                     f" count: {len(remove_container_instances)}")
+        if len(remove_container_instances) < len(instance_id_list):
+            logging.error(f"remove container image failed| "
+                          f"request count: {len(instance_id_list)}, removed count: {len(remove_container_instances)}")
+            sys.exit()
         # 3. 发送启动脚本到机器上
         utils.send_file_to_ecs(client=create_client, instance_id_list=instance_id_list, **rov_server_config.start_sh)
-        logging.info(f"send start shell file finished, instances: {instance_id_list}")
+        logging.info(f"send start shell file finished, instances: {instance_id_list}, count: {len(instance_id_list)}")
         # 4. 启动服务
         server_start_sh = os.path.join(rov_server_config.start_sh['target_dir'], rov_server_config.start_sh['name'])
-        server_start_commend = f"sh {server_start_sh}"
+        server_start_commend = f"sh {server_start_sh} {version}"
         utils.run_command(client=create_client, instance_ids=instance_id_list, command=server_start_commend)
         # 5. 异步探活
         global health_instances
@@ -271,17 +244,21 @@ async def update_instance(create_client, slb_client, instance_ids, max_workers):
             [(slb_client, instance_id) for instance_id in instance_id_list]
         ]
         await asyncio.wait(tasks)
-        logging.info(f"health instances count: {len(health_instances)}, {health_instances}")
+        logging.info(f"health instances: {health_instances}, count: {len(health_instances)}")
         # 6. 挂载流量
         if len(health_instances) == len(instance_id_list):
             # 所有机器探活成功
             time.sleep(60)
+            utils.add_backend_servers(client=slb_client, slb_id=rov_server_config.slb_id, instances=health_instances)
+            health_instance_ids = [instance_id for instance_id, _ in health_instances]
             add_weight_list = [(10, 30), (20, 20), (40, 10), (60, 10), (80, 10), (100, 10)]
-            set_instance_weight_process(client=slb_client, instance_id_list=instance_id_list,
+            set_instance_weight_process(client=slb_client, instance_id_list=health_instance_ids,
                                         weight_list=add_weight_list)
-            logging.info(f"finished count: {len(health_instances)}")
+            logging.info(f"finished instances: {health_instances}, count: {len(health_instances)}")
+            update_finished_count += len(health_instances)
+            logging.info(f"update finished: {update_finished_count}/{len(instance_ids)}")
         else:
-            logging.info(f"health count: {len(health_instances)}")
+            logging.info(f"health instances: {health_instances}, count: {len(health_instances)}")
             sys.exit()
 
 
@@ -327,37 +304,42 @@ def remove_instances(create_client, slb_client, instance_ids):
 
 
 def main():
-    slb_client = utils.connect_client(access_key_id=rov_server_config.slb_client_params['access_key_id'],
-                                      access_key_secret=rov_server_config.slb_client_params['access_key_secret'],
-                                      region_id=rov_server_config.slb_client_params['region_id'])
-    create_client = utils.connect_client(access_key_id=rov_server_config.create_client_params['access_key_id'],
-                                         access_key_secret=rov_server_config.create_client_params['access_key_secret'],
-                                         region_id=rov_server_config.create_client_params['region_id'])
+    try:
+        version = sys.argv[1]
+        slb_client = utils.connect_client(access_key_id=rov_server_config.slb_client_params['access_key_id'],
+                                          access_key_secret=rov_server_config.slb_client_params['access_key_secret'],
+                                          region_id=rov_server_config.slb_client_params['region_id'])
+        create_client = utils.connect_client(access_key_id=rov_server_config.create_client_params['access_key_id'],
+                                             access_key_secret=rov_server_config.create_client_params['access_key_secret'],
+                                             region_id=rov_server_config.create_client_params['region_id'])
 
-    # 1. 获取slb下所有机器
-    online_instance_ids = utils.get_instance_ids(client=slb_client, slb_id=rov_server_config.slb_id)
-    online_instance_count = len(online_instance_ids)
-    logging.info(f"online instance count: {online_instance_count}.")
-    logging.info(f"online instance ids: {online_instance_ids}")
+        # 1. 获取slb下所有机器
+        online_instance_ids = utils.get_instance_ids(client=slb_client, slb_id=rov_server_config.slb_id)
+        online_instance_count = len(online_instance_ids)
+        logging.info(f"online instance count: {online_instance_count}.")
+        logging.info(f"online instance ids: {online_instance_ids}")
 
-    # 2. 扩容机器并启动新服务 扩容数量:线上机器数量/2
-    logging.info(f"ess instances start ...")
-    ess_instance_count = online_instance_count // 2
-    logging.info(f"ess instance count: {ess_instance_count}")
-    asyncio.run(ess_instance(create_client=create_client, slb_client=slb_client,
-                             ess_count=ess_instance_count, max_workers=2))
-    logging.info(f"ess instances end!")
+        # 2. 扩容机器并启动新服务 扩容数量:线上机器数量//2
+        logging.info(f"ess instances start ...")
+        ess_instance_count = online_instance_count // 2
+        logging.info(f"ess instance count: {ess_instance_count}")
+        asyncio.run(ess_instance(create_client=create_client, slb_client=slb_client,
+                                 ess_count=ess_instance_count, max_workers=2, version=version))
+        logging.info(f"ess instances end!")
 
-    # # 3. 原有机器进行更新
-    # logging.info(f"update online instances start ...")
-    # asyncio.run(update_instance(create_client=create_client, slb_client=slb_client,
-    #                             instance_ids=online_instance_ids, max_workers=2))
-    # logging.info(f"update online instances end!")
-
-    # 4. 停止并释放扩容机器
-    logging.info(f"stop & release instances start ...")
-    remove_instances(create_client=create_client, slb_client=slb_client, instance_ids=ess_instances)
-    logging.info(f"stop & release instances end!")
+        # # 3. 原有机器进行更新
+        # logging.info(f"update online instances start ...")
+        # asyncio.run(update_instance(create_client=create_client, slb_client=slb_client,
+        #                             instance_ids=online_instance_ids, max_workers=2, version=version))
+        # logging.info(f"update online instances end!")
+        #
+        # # 4. 停止并释放扩容机器
+        # logging.info(f"stop & release instances start ...")
+        # remove_instances(create_client=create_client, slb_client=slb_client, instance_ids=ess_instances)
+        # logging.info(f"stop & release instances end!")
+    except Exception as e:
+        logging.error(e)
+        sys.exit()
 
 
 if __name__ == '__main__':