17
17
from googleapiclient import discovery , errors
18
18
19
19
from ray ._private .accelerators import TPUAcceleratorManager
20
+ from ray ._private .accelerators import tpu
20
21
from ray .autoscaler ._private .gcp .node import MAX_POLLS , POLL_INTERVAL , GCPNodeType
21
22
from ray .autoscaler ._private .util import check_legacy_fields
22
23
51
52
# NOTE: iam.serviceAccountUser allows the Head Node to create worker nodes
52
53
# with ServiceAccounts.
53
54
54
- # By default TPU VMs come with 4 chips per host and 2 tensorcores per chip.
55
- # For more details: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm
56
- DEFAULT_TPU_NUM_CHIPS_PER_HOST = 4
57
- DEFAULT_TPU_CORES_PER_CHIP = 2
58
-
59
55
60
56
def tpu_accelerator_config_to_type (accelerator_config : dict ) -> str :
61
57
"""Convert a provided accelerator_config to accelerator_type.
@@ -75,17 +71,14 @@ def tpu_accelerator_config_to_type(accelerator_config: dict) -> str:
75
71
# Reduce e.g. "2x2x2" to 8
76
72
chip_dimensions = [int (chip_count ) for chip_count in topology .split ("x" )]
77
73
num_chips = reduce (lambda x , y : x * y , chip_dimensions )
78
- num_cores = num_chips * DEFAULT_TPU_CORES_PER_CHIP
79
74
80
75
# V5LitePod is rendered as "V5LITE_POD" in accelerator configuration but
81
76
# accelerator type uses a format like "v5litepod-{cores}", so we need
82
77
# to manually convert the string here.
83
78
if generation == "v5lite_pod" :
84
79
generation = "v5litepod"
85
- num_cores = num_chips
86
80
87
- if generation == "v6e" :
88
- num_cores = num_chips
81
+ num_cores = tpu .get_tpu_cores_per_chip (generation ) * num_chips
89
82
90
83
return f"{ generation } -{ num_cores } "
91
84
@@ -136,39 +129,13 @@ def _validate_tpu_config(node: dict):
136
129
)
137
130
138
131
139
- def _get_num_tpu_visible_chips_per_host (accelerator_type : str ) -> int :
140
- if accelerator_type == "v5litepod-8" :
141
- return 8
142
-
143
- # All V6e configurations have 8 chips per host
144
- if accelerator_type .startswith ("v6e" ):
145
- return 8
146
-
147
- return DEFAULT_TPU_NUM_CHIPS_PER_HOST
148
-
149
-
150
- def _get_tpu_cores_per_chip (accelerator_type : str ) -> int :
151
- # accelerator_type is in the form v{generation}-{cores}
152
- accelerator_type = accelerator_type .split ("-" )[0 ]
153
-
154
- # V5Litepods have 1 core per chip
155
- if accelerator_type == "v5litepod" :
156
- return 1
157
-
158
- # V6es have 1 core per chip
159
- if accelerator_type == "v6e" :
160
- return 1
161
-
162
- return DEFAULT_TPU_CORES_PER_CHIP
163
-
164
-
165
132
def _get_num_tpu_chips (node : dict ) -> int :
166
133
chips = 0
167
134
if "acceleratorType" in node :
168
135
accelerator_type = node ["acceleratorType" ]
169
136
# `acceleratorType` is typically v{generation}-{cores}
170
137
cores = int (accelerator_type .split ("-" )[1 ])
171
- chips = cores / _get_tpu_cores_per_chip (accelerator_type )
138
+ chips = cores / tpu . get_tpu_cores_per_chip (accelerator_type )
172
139
if "acceleratorConfig" in node :
173
140
topology = node ["acceleratorConfig" ]["topology" ]
174
141
# `topology` is typically {chips}x{chips}x{chips}
@@ -185,7 +152,7 @@ def _is_single_host_tpu(node: dict) -> bool:
185
152
accelerator_type = node ["acceleratorType" ]
186
153
else :
187
154
accelerator_type = tpu_accelerator_config_to_type (node ["acceleratorConfig" ])
188
- return _get_num_tpu_chips (node ) == _get_num_tpu_visible_chips_per_host (
155
+ return _get_num_tpu_chips (node ) <= tpu . get_num_tpu_visible_chips_per_host (
189
156
accelerator_type
190
157
)
191
158
0 commit comments