From 0bb2ecd2bca01554d91b1f5f3d69f88237adaa3d Mon Sep 17 00:00:00 2001 From: Christian Leopoldseder Date: Tue, 28 Apr 2026 10:43:47 -0700 Subject: [PATCH] feat: GenAI SDK client(multimodal) - Accept an explicit bigquery_uri parameter in create_from_bigquery PiperOrigin-RevId: 907062100 --- .../test_create_multimodal_datasets.py | 87 +++++++++++++++++++ vertexai/_genai/datasets.py | 58 +++++++++++-- 2 files changed, 139 insertions(+), 6 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_create_multimodal_datasets.py b/tests/unit/vertexai/genai/replays/test_create_multimodal_datasets.py index 6280e2a671..7925380121 100644 --- a/tests/unit/vertexai/genai/replays/test_create_multimodal_datasets.py +++ b/tests/unit/vertexai/genai/replays/test_create_multimodal_datasets.py @@ -115,6 +115,40 @@ def test_create_dataset_from_bigquery(client): ) +@pytest.mark.usefixtures("mock_generate_multimodal_dataset_display_name") +def test_create_dataset_from_bigquery_with_uri(client): + dataset = client.datasets.create_from_bigquery( + bigquery_uri=f"bq://{BIGQUERY_TABLE_NAME}", + ) + assert isinstance(dataset, types.MultimodalDataset) + assert dataset.metadata.input_config.bigquery_source.uri == ( + f"bq://{BIGQUERY_TABLE_NAME}" + ) + + +def test_create_dataset_from_bigquery_preserves_other_metadata(client): + dataset = client.datasets.create_from_bigquery( + bigquery_uri=f"bq://{BIGQUERY_TABLE_NAME}", + multimodal_dataset={ + "display_name": "test-from-bigquery-uri", + "metadata": { + "gemini_request_read_config": { + "assembled_request_column_name": "test_column" + } + }, + }, + ) + assert isinstance(dataset, types.MultimodalDataset) + assert dataset.display_name == "test-from-bigquery-uri" + assert ( + dataset.metadata.gemini_request_read_config.assembled_request_column_name + == "test_column" + ) + assert dataset.metadata.input_config.bigquery_source.uri == ( + f"bq://{BIGQUERY_TABLE_NAME}" + ) + + @pytest.mark.usefixtures("mock_generate_multimodal_dataset_display_name") def test_create_dataset_from_bigquery_no_display_name(client): dataset = client.datasets.create_from_bigquery( @@ -130,6 +164,13 @@ def test_create_dataset_from_bigquery_no_display_name(client): assert dataset.display_name == "test-generated-name" +def test_create_dataset_from_bigquery_raises_if_neither(client): + with pytest.raises( + ValueError, match="At least one of `bigquery_uri` or `multimodal_dataset`" + ): + client.datasets.create_from_bigquery() + + @pytest.mark.usefixtures("mock_bigquery_client", "mock_import_bigframes") def test_create_dataset_from_pandas(client, is_replay_mode): dataframe = pd.DataFrame( @@ -298,6 +339,44 @@ async def test_create_dataset_from_bigquery_async(client): ) +@pytest.mark.asyncio +@pytest.mark.usefixtures("mock_generate_multimodal_dataset_display_name") +async def test_create_dataset_from_bigquery_with_uri_async(client): + dataset = await client.aio.datasets.create_from_bigquery( + bigquery_uri=f"bq://{BIGQUERY_TABLE_NAME}", + ) + assert isinstance(dataset, types.MultimodalDataset) + assert dataset.metadata.input_config.bigquery_source.uri == ( + f"bq://{BIGQUERY_TABLE_NAME}" + ) + + +@pytest.mark.asyncio +async def test_create_dataset_from_bigquery_preserves_other_metadata_async( + client, +): + dataset = await client.aio.datasets.create_from_bigquery( + bigquery_uri=f"bq://{BIGQUERY_TABLE_NAME}", + multimodal_dataset={ + "display_name": "test-from-bigquery-uri", + "metadata": { + "gemini_request_read_config": { + "assembled_request_column_name": "test_column" + } + }, + }, + ) + assert isinstance(dataset, types.MultimodalDataset) + assert dataset.display_name == "test-from-bigquery-uri" + assert ( + dataset.metadata.gemini_request_read_config.assembled_request_column_name + == "test_column" + ) + assert dataset.metadata.input_config.bigquery_source.uri == ( + f"bq://{BIGQUERY_TABLE_NAME}" + ) + + @pytest.mark.asyncio @pytest.mark.usefixtures("mock_generate_multimodal_dataset_display_name") async def test_create_dataset_from_bigquery_no_display_name_async(client): @@ -314,6 +393,14 @@ async def test_create_dataset_from_bigquery_no_display_name_async(client): assert dataset.display_name == "test-generated-name" +@pytest.mark.asyncio +async def test_create_dataset_from_bigquery_raises_if_neither_async(client): + with pytest.raises( + ValueError, match="At least one of `bigquery_uri` or `multimodal_dataset`" + ): + await client.aio.datasets.create_from_bigquery() + + @pytest.mark.asyncio async def test_create_dataset_from_bigquery_async_with_timeout(client): dataset = await client.aio.datasets.create_from_bigquery( diff --git a/vertexai/_genai/datasets.py b/vertexai/_genai/datasets.py index 9906d5e44a..61930a43b9 100644 --- a/vertexai/_genai/datasets.py +++ b/vertexai/_genai/datasets.py @@ -924,14 +924,24 @@ def _wait_for_operation( def create_from_bigquery( self, *, - multimodal_dataset: types.MultimodalDatasetOrDict, + bigquery_uri: Optional[str] = None, + multimodal_dataset: Optional[types.MultimodalDatasetOrDict] = None, config: Optional[types.CreateMultimodalDatasetConfigOrDict] = None, ) -> types.MultimodalDataset: """Creates a multimodal dataset from a BigQuery table. Args: + bigquery_uri: + Optional. The BigQuery URI of the table to create the dataset from. + e.g. "bq://project.dataset.table". If both `bigquery_uri` and + `multimodal_dataset` are provided, and `multimodal_dataset` also + contains a BigQuery URI, the `bigquery_uri` parameter takes precedence. multimodal_dataset: - Required. A representation of a multimodal dataset. + Optional. A representation of a multimodal dataset. If `bigquery_uri` + is set, `multimodal_dataset` can still be used to set other metadata + fields. If both `bigquery_uri` and `multimodal_dataset` are provided, + and `multimodal_dataset` also contains a BigQuery URI, the + `bigquery_uri` parameter takes precedence. config: Optional. A configuration for creating the multimodal dataset. If not provided, the default configuration will be used. @@ -939,8 +949,21 @@ def create_from_bigquery( Returns: A types.MultimodalDataset object representing a multimodal dataset. """ - if isinstance(multimodal_dataset, dict): + if not bigquery_uri and not multimodal_dataset: + raise ValueError( + "At least one of `bigquery_uri` or `multimodal_dataset` must be" + " provided." + ) + + if multimodal_dataset is None: + multimodal_dataset = types.MultimodalDataset() + elif isinstance(multimodal_dataset, dict): multimodal_dataset = types.MultimodalDataset(**multimodal_dataset) + + if bigquery_uri: + multimodal_dataset = multimodal_dataset.model_copy(deep=True) + multimodal_dataset.set_bigquery_uri(bigquery_uri) + _datasets_utils.validate_multimodal_dataset_bigquery_uri(multimodal_dataset) if isinstance(config, dict): @@ -2187,14 +2210,24 @@ async def _wait_for_operation( async def create_from_bigquery( self, *, - multimodal_dataset: types.MultimodalDatasetOrDict, + bigquery_uri: Optional[str] = None, + multimodal_dataset: Optional[types.MultimodalDatasetOrDict] = None, config: Optional[types.CreateMultimodalDatasetConfigOrDict] = None, ) -> types.MultimodalDataset: """Creates a multimodal dataset from a BigQuery table. Args: + bigquery_uri: + Optional. The BigQuery URI of the table to create the dataset from. + e.g. "bq://project.dataset.table". If both `bigquery_uri` and + `multimodal_dataset` are provided, and `multimodal_dataset` also + contains a BigQuery URI, the `bigquery_uri` parameter takes precedence. multimodal_dataset: - Required. A representation of a multimodal dataset. + Optional. A representation of a multimodal dataset. If `bigquery_uri` + is set, `multimodal_dataset` can still be used to set other metadata + fields. If both `bigquery_uri` and `multimodal_dataset` are provided, + and `multimodal_dataset` also contains a BigQuery URI, the + `bigquery_uri` parameter takes precedence. config: Optional. A configuration for creating the multimodal dataset. If not provided, the default configuration will be used. @@ -2202,8 +2235,21 @@ async def create_from_bigquery( Returns: A types.MultimodalDataset object representing a multimodal dataset. """ - if isinstance(multimodal_dataset, dict): + if not bigquery_uri and not multimodal_dataset: + raise ValueError( + "At least one of `bigquery_uri` or `multimodal_dataset` must be" + " provided." + ) + + if multimodal_dataset is None: + multimodal_dataset = types.MultimodalDataset() + elif isinstance(multimodal_dataset, dict): multimodal_dataset = types.MultimodalDataset(**multimodal_dataset) + + if bigquery_uri: + multimodal_dataset = multimodal_dataset.model_copy(deep=True) + multimodal_dataset.set_bigquery_uri(bigquery_uri) + _datasets_utils.validate_multimodal_dataset_bigquery_uri(multimodal_dataset) if isinstance(config, dict):