Skip to content

Commit 423c764

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Adding tpu_topology to Vertex SDK
PiperOrigin-RevId: 625144116
1 parent 9c11ea5 commit 423c764

File tree

2 files changed

+74
-0
lines changed

2 files changed

+74
-0
lines changed

google/cloud/aiplatform/models.py

+32
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,7 @@ def deploy(
772772
max_replica_count: int = 1,
773773
accelerator_type: Optional[str] = None,
774774
accelerator_count: Optional[int] = None,
775+
tpu_topology: Optional[str] = None,
775776
service_account: Optional[str] = None,
776777
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
777778
explanation_parameters: Optional[
@@ -833,6 +834,9 @@ def deploy(
833834
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
834835
accelerator_count (int):
835836
Optional. The number of accelerators to attach to a worker replica.
837+
tpu_topology (str):
838+
Optional. The TPU topology to use for the DeployedModel.
839+
Required for CloudTPU multihost deployments.
836840
service_account (str):
837841
The service account that the DeployedModel's container runs as. Specify the
838842
email address of the service account. If this service account is not
@@ -896,6 +900,7 @@ def deploy(
896900
max_replica_count=max_replica_count,
897901
accelerator_type=accelerator_type,
898902
accelerator_count=accelerator_count,
903+
tpu_topology=tpu_topology,
899904
service_account=service_account,
900905
explanation_spec=explanation_spec,
901906
metadata=metadata,
@@ -919,6 +924,7 @@ def _deploy(
919924
max_replica_count: int = 1,
920925
accelerator_type: Optional[str] = None,
921926
accelerator_count: Optional[int] = None,
927+
tpu_topology: Optional[str] = None,
922928
service_account: Optional[str] = None,
923929
explanation_spec: Optional[aiplatform.explain.ExplanationSpec] = None,
924930
metadata: Optional[Sequence[Tuple[str, str]]] = (),
@@ -977,6 +983,9 @@ def _deploy(
977983
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
978984
accelerator_count (int):
979985
Optional. The number of accelerators to attach to a worker replica.
986+
tpu_topology (str):
987+
Optional. The TPU topology to use for the DeployedModel.
988+
Required for CloudTPU multihost deployments.
980989
service_account (str):
981990
The service account that the DeployedModel's container runs as. Specify the
982991
email address of the service account. If this service account is not
@@ -1026,6 +1035,7 @@ def _deploy(
10261035
max_replica_count=max_replica_count,
10271036
accelerator_type=accelerator_type,
10281037
accelerator_count=accelerator_count,
1038+
tpu_topology=tpu_topology,
10291039
service_account=service_account,
10301040
explanation_spec=explanation_spec,
10311041
metadata=metadata,
@@ -1056,6 +1066,7 @@ def _deploy_call(
10561066
max_replica_count: int = 1,
10571067
accelerator_type: Optional[str] = None,
10581068
accelerator_count: Optional[int] = None,
1069+
tpu_topology: Optional[str] = None,
10591070
service_account: Optional[str] = None,
10601071
explanation_spec: Optional[aiplatform.explain.ExplanationSpec] = None,
10611072
metadata: Optional[Sequence[Tuple[str, str]]] = (),
@@ -1123,6 +1134,9 @@ def _deploy_call(
11231134
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
11241135
accelerator_count (int):
11251136
Optional. The number of accelerators to attach to a worker replica.
1137+
tpu_topology (str):
1138+
Optional. The TPU topology to use for the DeployedModel.
1139+
Required for CloudTPU multihost deployments.
11261140
service_account (str):
11271141
The service account that the DeployedModel's container runs as. Specify the
11281142
email address of the service account. If this service account is not
@@ -1250,6 +1264,9 @@ def _deploy_call(
12501264
[autoscaling_metric_spec]
12511265
)
12521266

1267+
if tpu_topology is not None:
1268+
machine_spec.tpu_topology = tpu_topology
1269+
12531270
dedicated_resources.machine_spec = machine_spec
12541271
deployed_model.dedicated_resources = dedicated_resources
12551272

@@ -2440,6 +2457,7 @@ def deploy(
24402457
max_replica_count: int = 1,
24412458
accelerator_type: Optional[str] = None,
24422459
accelerator_count: Optional[int] = None,
2460+
tpu_topology: Optional[str] = None,
24432461
service_account: Optional[str] = None,
24442462
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
24452463
explanation_parameters: Optional[
@@ -2487,6 +2505,9 @@ def deploy(
24872505
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
24882506
accelerator_count (int):
24892507
Optional. The number of accelerators to attach to a worker replica.
2508+
tpu_topology (str):
2509+
Optional. The TPU topology to use for the DeployedModel.
2510+
Required for CloudTPU multihost deployments.
24902511
service_account (str):
24912512
The service account that the DeployedModel's container runs as. Specify the
24922513
email address of the service account. If this service account is not
@@ -2534,6 +2555,7 @@ def deploy(
25342555
max_replica_count=max_replica_count,
25352556
accelerator_type=accelerator_type,
25362557
accelerator_count=accelerator_count,
2558+
tpu_topology=tpu_topology,
25372559
service_account=service_account,
25382560
explanation_spec=explanation_spec,
25392561
metadata=metadata,
@@ -3442,6 +3464,7 @@ def deploy(
34423464
max_replica_count: int = 1,
34433465
accelerator_type: Optional[str] = None,
34443466
accelerator_count: Optional[int] = None,
3467+
tpu_topology: Optional[str] = None,
34453468
service_account: Optional[str] = None,
34463469
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
34473470
explanation_parameters: Optional[
@@ -3505,6 +3528,9 @@ def deploy(
35053528
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
35063529
accelerator_count (int):
35073530
Optional. The number of accelerators to attach to a worker replica.
3531+
tpu_topology (str):
3532+
Optional. The TPU topology to use for the DeployedModel.
3533+
Required for CloudTPU multihost deployments.
35083534
service_account (str):
35093535
The service account that the DeployedModel's container runs as. Specify the
35103536
email address of the service account. If this service account is not
@@ -3601,6 +3627,7 @@ def deploy(
36013627
max_replica_count=max_replica_count,
36023628
accelerator_type=accelerator_type,
36033629
accelerator_count=accelerator_count,
3630+
tpu_topology=tpu_topology,
36043631
service_account=service_account,
36053632
explanation_spec=explanation_spec,
36063633
metadata=metadata,
@@ -3627,6 +3654,7 @@ def _deploy(
36273654
max_replica_count: int = 1,
36283655
accelerator_type: Optional[str] = None,
36293656
accelerator_count: Optional[int] = None,
3657+
tpu_topology: Optional[str] = None,
36303658
service_account: Optional[str] = None,
36313659
explanation_spec: Optional[aiplatform.explain.ExplanationSpec] = None,
36323660
metadata: Optional[Sequence[Tuple[str, str]]] = (),
@@ -3687,6 +3715,9 @@ def _deploy(
36873715
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
36883716
accelerator_count (int):
36893717
Optional. The number of accelerators to attach to a worker replica.
3718+
tpu_topology (str):
3719+
Optional. The TPU topology to use for the DeployedModel.
3720+
Required for CloudTPU multihost deployments.
36903721
service_account (str):
36913722
The service account that the DeployedModel's container runs as. Specify the
36923723
email address of the service account. If this service account is not
@@ -3777,6 +3808,7 @@ def _deploy(
37773808
max_replica_count=max_replica_count,
37783809
accelerator_type=accelerator_type,
37793810
accelerator_count=accelerator_count,
3811+
tpu_topology=tpu_topology,
37803812
service_account=service_account,
37813813
explanation_spec=explanation_spec,
37823814
metadata=metadata,

tests/unit/aiplatform/test_models.py

+42
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@
132132
_TEST_STARTING_REPLICA_COUNT = 2
133133
_TEST_MAX_REPLICA_COUNT = 12
134134

135+
_TEST_TPU_MACHINE_TYPE = "ct5lp-hightpu-4t"
136+
_TEST_TPU_TOPOLOGY = "2x2"
137+
135138
_TEST_BATCH_SIZE = 16
136139

137140
_TEST_PIPELINE_RESOURCE_NAME = (
@@ -2077,6 +2080,45 @@ def test_deploy_no_endpoint_dedicated_resources(self, deploy_model_mock, sync):
20772080
timeout=None,
20782081
)
20792082

2083+
@pytest.mark.usefixtures(
2084+
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
2085+
)
2086+
@pytest.mark.parametrize("sync", [True, False])
2087+
def test_deploy_no_endpoint_with_tpu_topology(self, deploy_model_mock, sync):
2088+
test_model = models.Model(_TEST_ID)
2089+
test_model._gca_resource.supported_deployment_resources_types.append(
2090+
aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
2091+
)
2092+
test_endpoint = test_model.deploy(
2093+
machine_type=_TEST_TPU_MACHINE_TYPE,
2094+
tpu_topology=_TEST_TPU_TOPOLOGY,
2095+
sync=sync,
2096+
deploy_request_timeout=None,
2097+
)
2098+
2099+
if not sync:
2100+
test_endpoint.wait()
2101+
2102+
expected_machine_spec = gca_machine_resources.MachineSpec(
2103+
machine_type=_TEST_TPU_MACHINE_TYPE,
2104+
tpu_topology=_TEST_TPU_TOPOLOGY,
2105+
)
2106+
expected_dedicated_resources = gca_machine_resources.DedicatedResources(
2107+
machine_spec=expected_machine_spec, min_replica_count=1, max_replica_count=1
2108+
)
2109+
expected_deployed_model = gca_endpoint.DeployedModel(
2110+
dedicated_resources=expected_dedicated_resources,
2111+
model=test_model.resource_name,
2112+
display_name=None,
2113+
)
2114+
deploy_model_mock.assert_called_once_with(
2115+
endpoint=test_endpoint.resource_name,
2116+
deployed_model=expected_deployed_model,
2117+
traffic_split={"0": 100},
2118+
metadata=(),
2119+
timeout=None,
2120+
)
2121+
20802122
@pytest.mark.usefixtures(
20812123
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
20822124
)

0 commit comments

Comments (0)