|
30 | 30 | HCCN_TOOL_PATH = envs.HCCN_PATH
|
31 | 31 |
|
32 | 32 |
|
33 |
| -def get_device_ips(world_size: int): |
34 |
| - npu_info = subprocess.run( |
35 |
| - ["npu-smi", "info", "-m"], |
36 |
| - stdout=subprocess.PIPE, |
37 |
| - stderr=subprocess.PIPE, |
38 |
| - universal_newlines=True, |
39 |
| - ) |
| 33 | +def get_device_ips(): |
| 34 | + npu_info = subprocess.run(['npu-smi', 'info', '-m'], |
| 35 | + stdout=subprocess.PIPE, |
| 36 | + stderr=subprocess.PIPE, |
| 37 | + universal_newlines=True) |
40 | 38 | if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH):
|
41 | 39 | raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.")
|
42 |
| - npu_start_idx = int( |
43 |
| - re.match(r".*\n\t([0-9]+).*", |
44 |
| - npu_info.stdout).group(1)) # type: ignore |
| 40 | + |
| 41 | + # Extract NPU IDs for all Ascend devices (excluding Mcu rows) |
| 42 | + device_ids = [] |
| 43 | + for line in npu_info.stdout.strip().split('\n'): |
| 44 | + match = re.match(r'^\s*(\d+)\s+\d+\s+\d+\s+Ascend', line) |
| 45 | + if match: |
| 46 | + device_ids.append(int(match.group(1))) |
| 47 | + |
| 48 | + if not device_ids: |
| 49 | + raise RuntimeError( |
| 50 | + "Cannot parse any valid device ID from npu-smi output.") |
| 51 | + |
45 | 52 | device_ip_list = []
|
46 |
| - for ip_offset in range(world_size): |
47 |
| - cmd = [ |
48 |
| - HCCN_TOOL_PATH, |
49 |
| - "-i", |
50 |
| - f"{npu_start_idx + ip_offset}", |
51 |
| - "-ip", |
52 |
| - "-g", |
53 |
| - ] |
54 |
| - device_ip_info = subprocess.run( |
55 |
| - cmd, |
56 |
| - stdout=subprocess.PIPE, |
57 |
| - stderr=subprocess.PIPE, |
58 |
| - universal_newlines=True, |
59 |
| - ) |
60 |
| - device_ip = re.match(r"ipaddr:(.*)\n", |
61 |
| - device_ip_info.stdout).group(1) # type: ignore |
| 53 | + for device_id in device_ids: |
| 54 | + cmd = [HCCN_TOOL_PATH, '-i', str(device_id), '-ip', '-g'] |
| 55 | + device_ip_info = subprocess.run(cmd, |
| 56 | + stdout=subprocess.PIPE, |
| 57 | + stderr=subprocess.PIPE, |
| 58 | + universal_newlines=True) |
| 59 | + ip_match = re.search(r'ipaddr:(.*)', device_ip_info.stdout) |
| 60 | + if not ip_match: |
| 61 | + raise RuntimeError( |
| 62 | + f"Cannot parse IP from hccn_tool for device {device_id}") |
| 63 | + device_ip = ip_match.group(1).strip() |
62 | 64 | device_ip_list.append(device_ip)
|
| 65 | + |
63 | 66 | return device_ip_list
|
64 | 67 |
|
65 | 68 |
|
66 |
| -# Pass number of NPUs into this function. |
67 |
| -print(get_device_ips(8)) |
| 69 | +print(get_device_ips()) |
0 commit comments