Skip to content

Commit bc00e4b

Browse files
Kiuk Chung authored and
facebook-github-bot committed
(torchx/components) expose raw resource params for dist.ddp (#395)
Summary: Pull Request resolved: #395 Exposes raw resources (cpu, gpu, memMB) in addition to "host" (named resource) for dist.ddp and utils.python to make it convenient for users who don't want to register named resources in entrypoint. Reviewed By: aivanou Differential Revision: D34260962 fbshipit-source-id: e693e581b8c2c75d9fd1e057e26df2cd39ba8362
1 parent 08e3e83 commit bc00e4b

File tree

4 files changed

+85
-8
lines changed

4 files changed

+85
-8
lines changed

torchx/components/dist.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,10 @@ def ddp(
134134
script: str,
135135
image: str = torchx.IMAGE,
136136
name: Optional[str] = None,
137-
h: str = "aws_t3.medium",
137+
cpu: int = 2,
138+
gpu: int = 0,
139+
memMB: int = 1024,
140+
h: Optional[str] = None,
138141
j: str = "1x2",
139142
rdzv_endpoint: str = "etcd-server.default.svc.cluster.local:2379",
140143
) -> specs.AppDef:
@@ -143,12 +146,19 @@ def ddp(
143146
Uses `torch.distributed.run <https://pytorch.org/docs/stable/distributed.elastic.html>`_
144147
to launch and coordinate pytorch worker processes.
145148
149+
Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
150+
``h`` takes precedence if specified for setting resource requirements.
151+
See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
152+
146153
Args:
147154
script_args: arguments to the main module
148155
script: script or binary to run within the image
149156
image: image (e.g. docker)
150157
name: job name override (uses the script name if not specified)
151-
h: a registered named resource
158+
cpu: number of cpus per replica
159+
gpu: number of gpus per replica
160+
memMB: cpu memory in MB per replica
161+
h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
152162
j: {nnodes}x{nproc_per_node}, for gpu hosts, nproc_per_node must not exceed num gpus
153163
rdzv_endpoint: etcd server endpoint (only matters when nnodes > 1)
154164
"""
@@ -172,7 +182,7 @@ def ddp(
172182
image=image,
173183
entrypoint="python",
174184
num_replicas=nnodes,
175-
resource=specs.named_resources[h],
185+
resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
176186
args=[
177187
"-m",
178188
"torch.distributed.run",

torchx/components/utils.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,21 +101,31 @@ def python(
101101
c: Optional[str] = None,
102102
image: str = torchx.IMAGE,
103103
name: str = "torchx_utils_python",
104-
host: str = "aws_t3.medium",
104+
cpu: int = 2,
105+
gpu: int = 0,
106+
memMB: int = 1024,
107+
h: Optional[str] = None,
105108
num_replicas: int = 1,
106109
) -> specs.AppDef:
107110
"""
108111
Runs ``python -c CMD`` or ``python -m MODULE`` on the specified
109112
image and host. Use ``--`` to separate component args and program args
110113
(e.g. ``torchx run utils.python --m foo.main -- --args to --main``)
111114
115+
Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
116+
``h`` takes precedence if specified for setting resource requirements.
117+
See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
118+
112119
Args:
113120
args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
114121
m: run library module as a script
115122
c: program passed as string (may error if scheduler has a length limit on args)
116123
image: image to run on
117124
name: name of the job
118-
host: a registered named resource
125+
cpu: number of cpus per replica
126+
gpu: number of gpus per replica
127+
memMB: cpu memory in MB per replica
128+
h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
119129
num_replicas: number of copies to run (each on its own container)
120130
:return:
121131
"""
@@ -134,7 +144,7 @@ def python(
134144
image=image,
135145
entrypoint="python",
136146
num_replicas=num_replicas,
137-
resource=specs.named_resources[host],
147+
resource=specs.resource(cpu=cpu, gpu=gpu, memMB=memMB, h=h),
138148
# pyre-ignore[6]: one of (only one of) m or c HAS to be not null
139149
args=[
140150
"-m" if m else "-c",

torchx/specs/__init__.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
scheduler or pipeline adapter.
1212
"""
1313

14-
from typing import Dict
14+
from typing import Dict, Optional
1515

1616
import torchx.specs.named_resources_aws as aws_resources
1717
from torchx.util.entrypoints import load_group
@@ -72,6 +72,51 @@ def _load_named_resources() -> Dict[str, Resource]:
7272
named_resources: Dict[str, Resource] = _load_named_resources()
7373

7474

75+
def resource(
    cpu: Optional[int] = None,
    gpu: Optional[int] = None,
    memMB: Optional[int] = None,
    h: Optional[str] = None,
) -> Resource:
    """
    Convenience method to create a ``Resource`` object from either the
    raw resource specs (cpu, gpu, memMB) or the registered named resource (``h``).
    Note that the (cpu, gpu, memMB) is mutually exclusive with ``h``
    with ``h`` taking precedence if specified.

    If ``h`` is specified (non-empty) then it is passed to ``get_named_resources``
    to look up the resource specs from the registered named resources.
    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

    Otherwise a ``Resource`` object is created from the raw resource specs.
    Any of ``cpu``, ``gpu``, ``memMB`` that is falsy (``None`` or ``0``)
    falls back to its default: ``cpu=2``, ``gpu=0``, ``memMB=1024``.

    Example:

    .. code-block:: python

     resource(cpu=1) # returns Resource(cpu=1, gpu=0, memMB=1024)
     resource(h="foobar") # returns registered named resource "foobar"
     resource(cpu=1, h="foobar") # returns registered named resource "foobar" (cpu=1 ignored)
     resource() # returns default resource values
     resource(cpu=None, gpu=None, memMB=None) # same as resource(): default resource values

    Args:
        cpu: number of cpus (default used when ``None`` or ``0``)
        gpu: number of gpus (default used when ``None`` or ``0``)
        memMB: memory in MB (default used when ``None`` or ``0``)
        h: name of a registered named resource; when non-empty the
            raw specs above are ignored

    Returns:
        The named resource for ``h`` if given, otherwise a ``Resource``
        built from the raw specs with defaults filled in.
    """

    # named resource wins over the raw specs; an empty string counts as "not specified"
    if h:
        return get_named_resources(h)

    # could make these defaults customizable via entrypoint
    # not doing that now since its not a requested feature and may just over complicate things
    # keeping these defaults method local so that no one else takes a dep on it
    DEFAULT_CPU = 2
    DEFAULT_GPU = 0
    DEFAULT_MEM_MB = 1024

    # ``or`` (rather than ``is None``) means an explicit 0 also selects the default
    return Resource(
        cpu=cpu or DEFAULT_CPU,
        gpu=gpu or DEFAULT_GPU,
        memMB=memMB or DEFAULT_MEM_MB,
    )
118+
119+
75120
def get_named_resources(res: str) -> Resource:
76121
"""
77122
Get resource object based on the string definition registered via entrypoints.txt.

torchx/specs/test/api_test.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import torchx.specs.named_resources_aws as named_resources_aws
1616
from pyre_extensions import none_throws
17-
from torchx.specs import named_resources
17+
from torchx.specs import named_resources, resource
1818
from torchx.specs.api import (
1919
_TERMINAL_STATES,
2020
MISSING,
@@ -124,6 +124,18 @@ def test_named_resources(self) -> None:
124124
named_resources_aws.aws_p3_8xlarge(), named_resources["aws_p3.8xlarge"]
125125
)
126126

127+
def test_resource_util_fn(self) -> None:
    """
    Checks ``resource()`` against expected ``Resource`` objects: first for
    raw specs (with unset fields filled by defaults), then for a named
    resource, which must win over any raw specs passed alongside it.
    """
    # (expected Resource, kwargs passed to resource()) for the raw-spec path
    raw_spec_cases = [
        (Resource(cpu=2, gpu=0, memMB=1024), {}),
        (Resource(cpu=1, gpu=0, memMB=1024), {"cpu": 1}),
        (Resource(cpu=2, gpu=1, memMB=1024), {"cpu": 2, "gpu": 1}),
        (Resource(cpu=2, gpu=1, memMB=2048), {"cpu": 2, "gpu": 1, "memMB": 2048}),
    ]
    for expected, kwargs in raw_spec_cases:
        self.assertEqual(expected, resource(**kwargs))

    # named-resource path: cpu/gpu given together with h are ignored
    name = "aws_t3.medium"
    registered = named_resources[name]
    self.assertEqual(registered, resource(h=name))
    self.assertEqual(registered, resource(cpu=16, gpu=4, h="aws_t3.medium"))
138+
127139

128140
class RoleBuilderTest(unittest.TestCase):
129141
def test_defaults(self) -> None:

0 commit comments

Comments
 (0)