feat: Add support for loading evaluation datasets from Observability GCS sources

vertex-sdk-bot · copybara-github · commit ed8e218ed4c1 · 2025-08-13T09:14:34.000-07:00
PiperOrigin-RevId: 792182776
diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
@@ -3599,3 +3599,169 @@ def test_execute_evaluation_adds_creation_timestamp(
 
         assert result.metadata is not None
         assert result.metadata.creation_timestamp == mock_now
+
+
+class TestEvaluationDataset:
+    """Contains set of tests for the EvaluationDataset class methods.
+
+    GcsUtils is patched in every test, so nothing is read from GCS; the mocked
+    `read_file_contents` returns canned contents keyed by source path.
+    """
+
+    @mock.patch.object(_evals_utils, "GcsUtils")
+    def test_load_from_observability_eval_cases(self, mock_gcs_utils):
+        """Tests that load_from_observability_eval_cases reads data from GCS."""
+
+        # Canned GCS contents; any source not listed here reads as "".
+        contents = {
+            "gs://project/input.json": "input",
+            "gs://project/output.json": "output",
+            "gs://project/system_instruction.json": "system_instruction",
+        }
+        mock_gcs_utils.return_value.read_file_contents.side_effect = (
+            lambda src: contents.get(src, "")
+        )
+
+        eval_cases = [
+            vertexai_genai_types.ObservabilityEvalCase(
+                input_src="gs://project/input.json",
+                output_src="gs://project/output.json",
+                system_instruction_src="gs://project/system_instruction.json",
+            )
+        ]
+        result = (
+            vertexai_genai_types.EvaluationDataset.load_from_observability_eval_cases(
+                eval_cases
+            )
+        )
+
+        mock_gcs_utils.return_value.read_file_contents.assert_has_calls(
+            [
+                mock.call("gs://project/input.json"),
+                mock.call("gs://project/output.json"),
+                mock.call("gs://project/system_instruction.json"),
+            ],
+            any_order=True,
+        )
+        # assert_has_calls tolerates extra calls, so pin the exact count too.
+        assert mock_gcs_utils.return_value.read_file_contents.call_count == 3
+        assert result.eval_dataset_df is not None
+        pd.testing.assert_frame_equal(
+            result.eval_dataset_df,
+            pd.DataFrame(
+                {
+                    "format": ["observability"],
+                    "request": ["input"],
+                    "response": ["output"],
+                    "system_instruction": ["system_instruction"],
+                }
+            ),
+        )
+
+    @mock.patch.object(_evals_utils, "GcsUtils")
+    def test_load_from_observability_eval_cases_no_system_instruction(
+        self, mock_gcs_utils
+    ):
+        """Tests load_from_observability_eval_cases works without system_instruction.
+
+        The system_instruction column must be "" without any extra GCS read.
+        """
+
+        # Canned GCS contents; any source not listed here reads as "".
+        contents = {
+            "gs://project/input.json": "input",
+            "gs://project/output.json": "output",
+        }
+        mock_gcs_utils.return_value.read_file_contents.side_effect = (
+            lambda src: contents.get(src, "")
+        )
+
+        eval_cases = [
+            vertexai_genai_types.ObservabilityEvalCase(
+                input_src="gs://project/input.json",
+                output_src="gs://project/output.json",
+            )
+        ]
+        result = (
+            vertexai_genai_types.EvaluationDataset.load_from_observability_eval_cases(
+                eval_cases
+            )
+        )
+
+        mock_gcs_utils.return_value.read_file_contents.assert_has_calls(
+            [
+                mock.call("gs://project/input.json"),
+                mock.call("gs://project/output.json"),
+            ],
+            any_order=True,
+        )
+        # No system instruction source was given, so nothing else may be read.
+        assert mock_gcs_utils.return_value.read_file_contents.call_count == 2
+        assert result.eval_dataset_df is not None
+        pd.testing.assert_frame_equal(
+            result.eval_dataset_df,
+            pd.DataFrame(
+                {
+                    "format": ["observability"],
+                    "request": ["input"],
+                    "response": ["output"],
+                    "system_instruction": [""],
+                }
+            ),
+        )
+
+    @mock.patch.object(_evals_utils, "GcsUtils")
+    def test_load_from_observability_eval_cases_multiple_cases(self, mock_gcs_utils):
+        """Tests that load_from_observability_eval_cases handles multiple cases.
+
+        Rows must come out in the same order as the input cases.
+        """
+
+        # Canned GCS contents; any source not listed here reads as "".
+        contents = {
+            "gs://project/input_1.json": "input_1",
+            "gs://project/input_2.json": "input_2",
+            "gs://project/output_1.json": "output_1",
+            "gs://project/output_2.json": "output_2",
+            "gs://project/system_instruction_1.json": "system_instruction_1",
+            "gs://project/system_instruction_2.json": "system_instruction_2",
+        }
+        mock_gcs_utils.return_value.read_file_contents.side_effect = (
+            lambda src: contents.get(src, "")
+        )
+
+        eval_cases = [
+            vertexai_genai_types.ObservabilityEvalCase(
+                input_src="gs://project/input_1.json",
+                output_src="gs://project/output_1.json",
+                system_instruction_src="gs://project/system_instruction_1.json",
+            ),
+            vertexai_genai_types.ObservabilityEvalCase(
+                input_src="gs://project/input_2.json",
+                output_src="gs://project/output_2.json",
+                system_instruction_src="gs://project/system_instruction_2.json",
+            ),
+        ]
+        result = (
+            vertexai_genai_types.EvaluationDataset.load_from_observability_eval_cases(
+                eval_cases
+            )
+        )
+
+        # Three reads per case: input, output and system instruction.
+        assert mock_gcs_utils.return_value.read_file_contents.call_count == 6
+        assert result.eval_dataset_df is not None
+        pd.testing.assert_frame_equal(
+            result.eval_dataset_df,
+            pd.DataFrame(
+                {
+                    "format": ["observability", "observability"],
+                    "request": ["input_1", "input_2"],
+                    "response": ["output_1", "output_2"],
+                    "system_instruction": [
+                        "system_instruction_1",
+                        "system_instruction_2",
+                    ],
+                }
+            ),
+        )
diff --git a/vertexai/_genai/types.py b/vertexai/_genai/types.py
@@ -7085,6 +7085,58 @@ def _check_pandas_installed(cls, data: Any) -> Any:
                 )
         return data
 
+    @classmethod
+    def load_from_observability_eval_cases(
+        cls, cases: list["ObservabilityEvalCase"]
+    ) -> "EvaluationDataset":
+        """Fetches GenAI Observability data from GCS and parses into a DataFrame.
+
+        Each case yields one row: `request` and `response` hold the contents
+        read from `input_src` and `output_src`; `system_instruction` holds the
+        contents of `system_instruction_src`, or "" when that source is None.
+
+        Args:
+            cases: Observability eval cases holding the GCS source references.
+
+        Returns:
+            A dataset whose `eval_dataset_df` has one "observability" row per case.
+
+        Raises:
+            ImportError: If pandas is not installed.
+        """
+        # Guard only the pandas import so other ImportErrors are not mislabeled.
+        try:
+            import pandas as pd
+        except ImportError as e:
+            raise ImportError("Pandas DataFrame library is required.") from e
+
+        from . import _evals_utils
+
+        requests = []
+        responses = []
+        system_instructions = []
+        for case in cases:
+            gcs_utils = _evals_utils.GcsUtils(
+                case.api_client._api_client if case.api_client else None
+            )
+            requests.append(gcs_utils.read_file_contents(case.input_src))
+            responses.append(gcs_utils.read_file_contents(case.output_src))
+            system_instructions.append(
+                gcs_utils.read_file_contents(case.system_instruction_src)
+                if case.system_instruction_src is not None
+                else ""
+            )
+
+        eval_dataset_df = pd.DataFrame(
+            {
+                "format": ["observability"] * len(requests),
+                "request": requests,
+                "response": responses,
+                "system_instruction": system_instructions,
+            }
+        )
+        return cls(eval_dataset_df=eval_dataset_df)
+
     def show(self) -> None:
         """Shows the evaluation dataset."""
         from . import _evals_visualization
@@ -7559,6 +7611,45 @@ class EvaluateDatasetOperationDict(TypedDict, total=False):
 ]
 
 
+class ObservabilityEvalCase(_common.BaseModel):
+    """One evaluation case: GCS references to data stored in GCP Observability."""
+
+    input_src: Optional[str] = Field(
+        description="String containing the GCS reference to the GenAI input content.",
+        default=None,
+    )
+    output_src: Optional[str] = Field(
+        description="String containing the GCS reference to the GenAI response content.",
+        default=None,
+    )
+    system_instruction_src: Optional[str] = Field(
+        description="An optional string containing the GCS reference to the GenAI system instruction.",
+        default=None,
+    )
+    api_client: Optional[Any] = Field(
+        description="The underlying API client.", default=None
+    )
+
+
+class ObservabilityEvalCaseDict(TypedDict, total=False):
+    """TypedDict form of ObservabilityEvalCase; all keys are optional (total=False)."""
+
+    input_src: Optional[str]
+    """String containing the GCS reference to the GenAI input content."""
+
+    output_src: Optional[str]
+    """String containing the GCS reference to the GenAI response content."""
+
+    system_instruction_src: Optional[str]
+    """An optional string containing the GCS reference to the GenAI system instruction."""
+
+    api_client: Optional[Any]
+    """The underlying API client."""
+
+
+ObservabilityEvalCaseOrDict = Union[ObservabilityEvalCase, ObservabilityEvalCaseDict]
+
+
 class RubricGroup(_common.BaseModel):
     """A group of rubrics, used for grouping rubrics based on a metric or a version."""