forked from pytorch/elastic
-
Notifications
You must be signed in to change notification settings - Fork 0
/
api.py
61 lines (45 loc) · 1.44 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import abc
from typing import Tuple
class RendezvousClosedException(Exception):
    """Signal that the rendezvous for the given run_id has been closed.

    Late-arriving nodes receive this exception as an indication that the
    job has already completed.
    """
class RendezvousTimeoutException(Exception):
    """Signal that a rendezvous did not complete within its time budget.

    Raised from `next_rendezvous`. Callers are expected to treat this as a
    non-retryable kind of failure.
    """
class RendezvousNonRetryableError(Exception):
    """Signal a failure that must not be retried with the same worker process.

    May be raised from any of the `RendezvousHandler` methods when such a
    failure has occurred.
    """
class RendezvousHandler(abc.ABC):
    """Abstract interface that every rendezvous handler must implement.

    All four methods are declared abstract, so the class cannot be
    instantiated directly; a concrete subclass has to override each of them.
    """

    @abc.abstractmethod
    def next_rendezvous(self) -> Tuple["torch.distributed.Store", int, int]:
        """Produce the result of the next rendezvous.

        Returns:
            A 3-tuple of (c10d Store, rank, world size).

        Raises:
            RendezvousClosedException: as an alternative to returning.
            RendezvousTimeoutException: as an alternative to returning.
        """

    @abc.abstractmethod
    def is_closed(self) -> bool:
        """Return a bool; by its name, whether the rendezvous is closed.

        NOTE(review): exact semantics are defined by the concrete
        implementation and are not visible in this interface.
        """

    @abc.abstractmethod
    def set_closed(self):
        """By its name, mark the rendezvous as closed.

        NOTE(review): exact semantics are defined by the concrete
        implementation and are not visible in this interface.
        """

    @abc.abstractmethod
    def num_nodes_waiting(self) -> int:
        """Return an int; by its name, the count of nodes currently waiting.

        NOTE(review): what "waiting" means is defined by the concrete
        implementation and is not visible in this interface.
        """