
Commit 6db54ff

test: cherrypick vllm e2e fixes into release 0.3.1 (#1664)
1 parent 45e727f commit 6db54ff

File tree: 2 files changed (+90, −8 lines)


tests/serve/conftest.py

Lines changed: 71 additions & 0 deletions
@@ -12,3 +12,74 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import logging
+import os
+
+import pytest
+
+# List of models used in the serve tests
+SERVE_TEST_MODELS = [
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "llava-hf/llava-1.5-7b-hf",
+]
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture(scope="session")
+def predownload_models():
+    # Check for HF_TOKEN in environment
+    hf_token = os.environ.get("HF_TOKEN")
+    if hf_token:
+        logger.info("HF_TOKEN found in environment")
+    else:
+        logger.warning(
+            "HF_TOKEN not found in environment. "
+            "Some models may fail to download or you may encounter rate limits. "
+            "Get a token from https://huggingface.co/settings/tokens"
+        )
+
+    try:
+        from huggingface_hub import snapshot_download
+
+        for model_id in SERVE_TEST_MODELS:
+            logger.info(f"Pre-downloading model: {model_id}")
+
+            try:
+                # Download the full model snapshot (includes all files)
+                # HuggingFace will handle caching automatically
+                snapshot_download(
+                    repo_id=model_id,
+                    token=hf_token,
+                )
+                logger.info(f"Successfully pre-downloaded: {model_id}")
+
+            except Exception as e:
+                logger.error(f"Failed to pre-download {model_id}: {e}")
+                # Don't fail the fixture - let individual tests handle missing models
+
+    except ImportError:
+        logger.warning(
+            "huggingface_hub not installed. "
+            "Models will be downloaded during test execution."
+        )
+
+    yield
+
+
+# Automatically use the predownload fixture for all serve tests
+def pytest_collection_modifyitems(config, items):
+    for item in items:
+        # Skip items that don't have fixturenames (like MypyFileItem)
+        if not hasattr(item, "fixturenames"):
+            continue
+
+        # Only apply to tests in the serve directory
+        if "serve" in str(item.path):
+            # Check if the test already uses the fixture
+            if "predownload_models" not in item.fixturenames:
+                # Don't add if test explicitly marks to skip model download
+                if not item.get_closest_marker("skip_model_download"):
+                    item.fixturenames = list(item.fixturenames)
+                    item.fixturenames.append("predownload_models")
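
The collection hook above auto-attaches the session-scoped predownload_models fixture to every test collected under tests/serve, unless the test opts out with a skip_model_download marker. A minimal sketch of that opt-out path follows; only the marker name comes from the conftest, while the marker registration and the test itself are hypothetical:

# Hypothetical opt-out example; only "skip_model_download" is taken from
# the conftest above. Registering the marker avoids unknown-marker warnings.
import pytest


def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "skip_model_download: do not attach the predownload_models fixture",
    )


@pytest.mark.skip_model_download
def test_needs_no_weights():
    # pytest_collection_modifyitems sees this marker via get_closest_marker()
    # and leaves the item's fixturenames untouched, so nothing is downloaded.
    assert True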

tests/serve/test_dynamo_serve.py

Lines changed: 19 additions & 8 deletions
@@ -67,14 +67,14 @@
             ],
             "max_tokens": 150,  # Reduced from 500
             "temperature": 0.1,
-            "seed": 0,
+            # "seed": 0,
         },
         payload_completions={
             "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
             "prompt": text_prompt,
             "max_tokens": 150,
             "temperature": 0.1,
-            "seed": 0,
+            # "seed": 0,
         },
         repeat_count=10,
         expected_log=[],
@@ -159,7 +159,7 @@
     "multimodal_agg": (
         DeploymentGraph(
             module="graphs.agg:Frontend",
-            config="configs/agg.yaml",
+            config="configs/agg-llava.yaml",
             directory="/workspace/examples/multimodal",
             endpoints=["v1/chat/completions"],
             response_handlers=[
@@ -257,12 +257,22 @@ def __init__(self, graph: DeploymentGraph, request, port=8000, timeout=900):
         if graph.config:
             command.extend(["-f", os.path.join(graph.directory, graph.config)])

-        command.extend(["--Frontend.port", str(port)])
-
-        health_check_urls = [(f"http://localhost:{port}/v1/models", self._check_model)]
-
+        # Handle multimodal deployments differently
         if "multimodal" in graph.directory:
+            # Set DYNAMO_PORT environment variable for multimodal
+            env = os.environ.copy()
+            env["DYNAMO_PORT"] = str(port)
             health_check_urls = []
+            # Don't add health check on port since multimodal uses DYNAMO_PORT
+            health_check_ports = []
+        else:
+            # Regular LLM deployments
+            command.extend(["--Frontend.port", str(port)])
+            health_check_urls = [
+                (f"http://localhost:{port}/v1/models", self._check_model)
+            ]
+            health_check_ports = [port]
+            env = None

         self.port = port

@@ -271,11 +281,12 @@ def __init__(self, graph: DeploymentGraph, request, port=8000, timeout=900):
             timeout=timeout,
             display_output=True,
             working_dir=graph.directory,
-            health_check_ports=[port],
+            health_check_ports=health_check_ports,
             health_check_urls=health_check_urls,
             delayed_start=graph.delayed_start,
             stragglers=["http"],
             log_dir=request.node.name,
+            env=env,  # Pass the environment variables
        )

    def _check_model(self, response):
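
Because passing a dict as env to a child process replaces its environment wholesale, the multimodal branch copies os.environ before overlaying DYNAMO_PORT. A minimal sketch of those mechanics, assuming the process wrapper ultimately delegates to subprocess; the launch helper and the commented-out command are illustrative, not the harness's real API:

import os
import subprocess


def launch(command, env=None):
    # env=None means "inherit the parent environment unchanged"; a dict
    # replaces it entirely, which is why callers copy os.environ first
    # and then overlay their additions.
    return subprocess.Popen(command, env=env)


env = os.environ.copy()
env["DYNAMO_PORT"] = str(8000)
# launch(["dynamo", "serve", "graphs.agg:Frontend"], env=env)  # hypothetical command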
