Skip to content

Commit ba2b86e

Browse files
committed
remove code for getting local ip/nic
Signed-off-by: Lizhi Zhou <[email protected]>
1 parent b1032cb commit ba2b86e

File tree

4 files changed

+9
-95
lines changed

4 files changed

+9
-95
lines changed

tests/integration/defs/test_e2e.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1768,13 +1768,11 @@ def test_trtllm_multimodal_benchmark_serving(llm_root, llm_venv):
17681768

17691769

17701770
@pytest.mark.skip_less_device(4)
1771-
@pytest.mark.skip_less_device_memory(80000)
1772-
@pytest.mark.parametrize("ctx_config", ["tp2pp1", "tp1pp2"],
1773-
ids=["ctx_tp2pp1", "ctx_tp1pp2"])
1774-
@pytest.mark.parametrize("gen_config", ["tp2pp1", "tp1pp2"],
1775-
ids=["gen_tp2pp1", "gen_tp1pp2"])
1776-
def test_openai_disaggregated_serving_multi_nodes(llm_root, llm_venv,
1777-
ctx_config, gen_config):
1771+
@pytest.mark.skip_less_device_memory(40000)
1772+
@pytest.mark.parametrize("gen_config", ["gen_tp2pp1", "gen_tp1pp2"])
1773+
@pytest.mark.parametrize("ctx_config", ["ctx_tp2pp1", "ctx_tp1pp2"])
1774+
def test_openai_disagg_multi_nodes_completion(llm_root, llm_venv, ctx_config,
1775+
gen_config):
17781776
test_root = unittest_path() / "llmapi" / "apps"
17791777
llm_venv.run_cmd([
17801778
"-m",

tests/integration/test_lists/qa/llm_function_multinode.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@ test_e2e.py::test_multi_nodes_eval[llama4-models/nvidia/Llama-4-Maverick-17B-128
77
test_e2e.py::test_multi_nodes_eval[Qwen3/Qwen3-235B-A22B-tp16-mmlu]
88
test_e2e.py::test_multi_nodes_eval[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-tp16-mmlu]
99
test_e2e.py::test_multi_nodes_eval[DeepSeek-R1/DeepSeek-R1-0528-FP4-tp16-mmlu]
10+
test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp2pp1-gen_tp2pp1]
11+
test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2]

tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py

Lines changed: 2 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
22
import socket
3-
import subprocess
43
import time
54

65
import openai
@@ -9,7 +8,7 @@
98

109
from ..test_llm import get_model_path
1110
from .openai_server import RemoteDisaggOpenAIServer, RemoteOpenAIServer
12-
from .utils import expand_slurm_nodelist, get_local_interfaces, get_local_ip
11+
from .utils import expand_slurm_nodelist
1312

1413
RANK = int(os.environ.get("SLURM_PROCID", 0))
1514
NODE_RANK = int(os.environ.get("SLURM_NODEID", 0))
@@ -52,56 +51,14 @@ def is_pytest_node():
5251
return NODE_RANK == 0
5352

5453

55-
def find_nic():
56-
test_ip = socket.gethostbyname(get_the_other_host())
57-
print(f"test_ip: {test_ip} for the other host {get_the_other_host()}")
58-
try:
59-
# iproute2 may not be installed
60-
proc = subprocess.run(["ip", "route", "get", test_ip],
61-
capture_output=True,
62-
text=True,
63-
shell=True,
64-
check=True)
65-
output_parts = proc.stdout.split()
66-
if "dev" in output_parts:
67-
dev_idx = output_parts.index("dev")
68-
nic_name = output_parts[dev_idx + 1]
69-
else:
70-
raise ValueError("Could not find 'dev' in ip route output")
71-
print(f"get NIC name from ip route, result: {nic_name}")
72-
return nic_name
73-
except Exception as e:
74-
print(f"Failed to find NIC from ip route: {e}")
75-
try:
76-
# Establish a socket to the test ip, then get the local ip from the socket,
77-
# enumerate the local interfaces and find the one with the local ip
78-
local_ip = get_local_ip(test_ip)
79-
local_ip_dict = get_local_interfaces()
80-
for nic_name, ip in local_ip_dict.items():
81-
if ip == local_ip:
82-
return nic_name
83-
except OSError as e:
84-
print(f"Failed to find NIC from local interfaces: {e}")
85-
return None
86-
87-
8854
def env():
8955
# Remove MPI related environment variables to isolate the ctx/gen processes
9056
# so that they will not be in the same MPI communicator, otherwise the rank and world_size may mismatch
91-
new_env = {
57+
return {
9258
k: v
9359
for k, v in os.environ.items()
9460
if not ('PMI_' in k or 'OMPI_' in k or 'PMIX_' in k or 'SLURM_' in k)
9561
}
96-
nic = find_nic()
97-
if nic:
98-
# TODO: integrate this into disagg-serving
99-
# setting TRTLLM_UCX_INTERFACE manually if possible because the interfaces found automatically by TRTLLM can have the same ip across nodes, then cache transceiver may fail to send/receive kv cache
100-
print(f"setting TRTLLM_UCX_INTERFACE to {nic}")
101-
new_env["TRTLLM_UCX_INTERFACE"] = nic
102-
else:
103-
print(f"Failed to find NIC, will use default UCX interface")
104-
return new_env
10562

10663

10764
@pytest.fixture(scope="module")

tests/unittest/llmapi/apps/utils.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,7 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
import array
17-
import fcntl
1816
import re
19-
import socket
20-
import struct
2117
from pathlib import Path
2218
from typing import Any, Callable
2319

@@ -180,45 +176,6 @@ def server_with_custom_sampler(model_name: str, request: Any, backend: str,
180176
return server_with_custom_sampler
181177

182178

183-
def get_local_ip(test_ip):
184-
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
185-
s.connect((test_ip, 80))
186-
return s.getsockname()[0]
187-
188-
189-
# TODO: Avoid introducing another dependency since this is for test only. If we need to
190-
# detect UCX interface in the ctx/gen server, we need to support ipv6 and use a package like netifaces
191-
def get_local_interfaces():
192-
""" Returns a dictionary of name:ip key value pairs. """
193-
MAX_BYTES = 4096
194-
FILL_CHAR = b'\0'
195-
SIOCGIFCONF = 0x8912
196-
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
197-
names = array.array('B', MAX_BYTES * FILL_CHAR)
198-
names_address, names_length = names.buffer_info()
199-
mutable_byte_buffer = struct.pack('iL', MAX_BYTES, names_address)
200-
mutated_byte_buffer = fcntl.ioctl(sock.fileno(), SIOCGIFCONF,
201-
mutable_byte_buffer)
202-
max_bytes_out, names_address_out = struct.unpack('iL', mutated_byte_buffer)
203-
namestr = names.tobytes()
204-
namestr[:max_bytes_out]
205-
bytes_out = namestr[:max_bytes_out]
206-
ip_dict = {}
207-
for i in range(0, max_bytes_out, 40):
208-
name = namestr[i:i + 16].split(FILL_CHAR, 1)[0]
209-
name = name.decode('utf-8')
210-
ip_bytes = namestr[i + 20:i + 24]
211-
full_addr = []
212-
for netaddr in ip_bytes:
213-
if isinstance(netaddr, int):
214-
full_addr.append(str(netaddr))
215-
elif isinstance(netaddr, str):
216-
full_addr.append(str(ord(netaddr)))
217-
ip_dict[name] = '.'.join(full_addr)
218-
219-
return ip_dict
220-
221-
222179
def expand_slurm_nodelist(nodelist_str):
223180
"""
224181
Expand SLURM nodelist format into individual node names.

0 commit comments

Comments
 (0)