From 1b0db3a383b83ddd12100a5af79b587e93075899 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Tue, 13 Apr 2021 14:53:16 +0900 Subject: [PATCH] release: 21.03.3 --- CHANGELOG.md | 13 +++++++++++++ changes/415.feature | 4 ---- changes/418.fix | 1 - src/ai/backend/manager/__init__.py | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) delete mode 100644 changes/415.feature delete mode 100644 changes/418.fix diff --git a/CHANGELOG.md b/CHANGELOG.md index 22c68b808..7a31d5846 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,19 @@ Changes .. towncrier release notes start +21.03.3 (2021-04-13) +-------------------- + +### Features +* Rewrite the session scheduler to avoid HoL blocking ([#415](https://github.com/lablup/backend.ai-manager/issues/415)) + - Skip over sessions in the queue if they fail to satisfy predicates for multiple retries -> 1st case of HoL blocking: a rogue pending session blocks everything in the same scaling group + - You may configure the maximum number of retries in the `config/plugins/scheduler/fifo/num_retries_to_skip` etcd key. + - Split the scheduler into two async loops for scheduling decision and session spawning by inserting "SCHEDULED" status between "PENDING" and "PREPARING" statuses -> 2nd case of HoL blocking: failure isolation with each task + +### Fixes +* Adjust the firing rate of `DoPrepareEvent` to follow and alternate with the scheduler execution ([#418](https://github.com/lablup/backend.ai-manager/issues/418)) + + 21.03.2 (2021-04-02) -------------------- diff --git a/changes/415.feature b/changes/415.feature deleted file mode 100644 index 9210b1cbe..000000000 --- a/changes/415.feature +++ /dev/null @@ -1,4 +0,0 @@ -Rewrite the session scheduler to avoid HoL blocking - - Skip over sessions in the queue if they fail to satisfy predicates for multiple retries -> 1st case of HoL blocking: a rogue pending session blocks everything in the same scaling group - - You may configure the maximum number of retries in the `config/plugins/scheduler/fifo/num_retries_to_skip` etcd key. - - Split the scheduler into two async loops for scheduling decision and session spawning by inserting "SCHEDULED" status between "PENDING" and "PREPARING" statuses -> 2nd case of HoL blocking: failure isolation with each task diff --git a/changes/418.fix b/changes/418.fix deleted file mode 100644 index f0f33748a..000000000 --- a/changes/418.fix +++ /dev/null @@ -1 +0,0 @@ -Adjust the firing rate of `DoPrepareEvent` to follow and alternate with the scheduler execution diff --git a/src/ai/backend/manager/__init__.py b/src/ai/backend/manager/__init__.py index f1f0221b4..fd4163923 100644 --- a/src/ai/backend/manager/__init__.py +++ b/src/ai/backend/manager/__init__.py @@ -1 +1 @@ -__version__ = '21.03.2' +__version__ = '21.03.3'