From 3c86a82af36d54da0ac17f366126eddcae7c5c9d Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Tue, 29 Mar 2022 08:09:56 +0000 Subject: [PATCH] fix: Store last_stat when kernels are terminated as a backup --- src/ai/backend/manager/registry.py | 49 +++++++++++++++++++----------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 719fb7c0a..431061611 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1807,15 +1807,22 @@ async def _update() -> None: destroyed_kernels.append(kernel) async def _update() -> None: + kern_stat = await redis.execute( + self.redis_stat, + lambda r: r.get(str(kernel['id'])), + ) async with self.db.begin() as conn: + values = { + 'status': KernelStatus.TERMINATED, + 'status_info': reason, + 'status_changed': now, + 'terminated_at': now, + } + if kern_stat: + values['last_stat'] = msgpack.unpackb(kern_stat) await conn.execute( sa.update(kernels) - .values({ - 'status': KernelStatus.TERMINATED, - 'status_info': reason, - 'status_changed': now, - 'terminated_at': now, - }) + .values(values) .where(kernels.c.id == kernel['id']), ) @@ -2612,6 +2619,11 @@ async def mark_kernel_terminated( except asyncio.CancelledError: pass + kern_stat = await redis.execute( + self.redis_stat, + lambda r: r.get(str(kernel_id)), + ) + async def _update_kernel_status() -> Row | None: async with self.db.begin() as conn: # Check the current status. @@ -2643,19 +2655,22 @@ async def _update_kernel_status() -> Row | None: # Change the status to TERMINATED. # (we don't delete the row for later logging and billing) now = datetime.now(tzutc()) + values = { + 'status': KernelStatus.TERMINATED, + 'status_info': reason, + 'status_changed': now, + 'status_data': sql_json_merge( + kernels.c.status_data, + ("kernel",), + {"exit_code": exit_code}, + ), + 'terminated_at': now, + } + if kern_stat: + values['last_stat'] = msgpack.unpackb(kern_stat) update_query = ( sa.update(kernels) - .values({ - 'status': KernelStatus.TERMINATED, - 'status_info': reason, - 'status_changed': now, - 'status_data': sql_json_merge( - kernels.c.status_data, - ("kernel",), - {"exit_code": exit_code}, - ), - 'terminated_at': now, - }) + .values(values) .where(kernels.c.id == kernel_id) ) await conn.execute(update_query)