diff --git a/tests/unit_tests/data_validation/test_IQROutliersBarPlot.py b/tests/unit_tests/data_validation/test_IQROutliersBarPlot.py index 23193f737..6f3bbd5f5 100644 --- a/tests/unit_tests/data_validation/test_IQROutliersBarPlot.py +++ b/tests/unit_tests/data_validation/test_IQROutliersBarPlot.py @@ -97,4 +97,4 @@ def test_boolean_dtype_excluded_from_raw_data(self): results = IQROutliersBarPlot(vm_dataset) raw_data = results[-1] - self.assertNotIn("flag", raw_data.outlier_counts_by_feature.index) + self.assertNotIn("flag", raw_data.outlier_counts_by_feature.columns) diff --git a/tests/unit_tests/data_validation/test_IQROutliersTable.py b/tests/unit_tests/data_validation/test_IQROutliersTable.py index 0174d724f..32cd3aa9e 100644 --- a/tests/unit_tests/data_validation/test_IQROutliersTable.py +++ b/tests/unit_tests/data_validation/test_IQROutliersTable.py @@ -83,3 +83,25 @@ def test_binary_exclusion(self): # Verify binary column is not in results for summary in outliers_summary: self.assertNotIn("binary", summary["Variable"]) + + def test_boolean_dtype_excluded_from_raw_data(self): + n_samples = 100 + normal_data = np.array([1.0] * 25 + [2.0] * 50 + [3.0] * 25) + data_with_outliers = normal_data.copy() + data_with_outliers[0:4] = [-15, -10, 10, 15] + df = pd.DataFrame( + { + "with_outliers": data_with_outliers, + "flag": np.random.choice([True, False], n_samples), + } + ) + vm_dataset = vm.init_dataset( + input_id="test_boolean_dataset", dataset=df, __log=False + ) + + result, raw_data = IQROutliersTable(vm_dataset) + outliers_summary = result["Summary of Outliers Detected by IQR Method"] + summary_variables = [summary["Variable"] for summary in outliers_summary] + + self.assertNotIn("flag", summary_variables) + self.assertNotIn("flag", raw_data.all_outliers) diff --git a/validmind/tests/data_validation/IQROutliersBarPlot.py b/validmind/tests/data_validation/IQROutliersBarPlot.py index e9e443826..329845984 100644 --- a/validmind/tests/data_validation/IQROutliersBarPlot.py +++ b/validmind/tests/data_validation/IQROutliersBarPlot.py @@ -5,6 +5,7 @@ from typing import Tuple +import pandas as pd import plotly.graph_objects as go from validmind import RawData, tags, tasks @@ -76,9 +77,12 @@ def IQROutliersBarPlot( """ df = dataset.df - # Exclude binary/boolean features (IQR is not meaningful and quantile fails on bool) + # Exclude boolean and binary features. The IQR is not meaningful for them and + # `quantile` raises "numpy boolean subtract" on boolean dtype columns. eligible_columns = [ - col for col in dataset.feature_columns_numeric if len(df[col].unique()) > 2 + col + for col in dataset.feature_columns_numeric + if not pd.api.types.is_bool_dtype(df[col]) and df[col].nunique() > 2 ] figures = [] diff --git a/validmind/tests/data_validation/IQROutliersTable.py b/validmind/tests/data_validation/IQROutliersTable.py index b4b6ac432..bc0b19120 100644 --- a/validmind/tests/data_validation/IQROutliersTable.py +++ b/validmind/tests/data_validation/IQROutliersTable.py @@ -5,6 +5,8 @@ from typing import Any, Dict, Tuple +import pandas as pd + from validmind import RawData, tags, tasks from validmind.vm_models import VMDataset @@ -72,8 +74,9 @@ def IQROutliersTable( all_outliers = {} for col in dataset.feature_columns_numeric: - # Skip binary features - if len(df[col].unique()) <= 2: + # Skip boolean and binary features. The IQR is not meaningful for them and + # `quantile` raises "numpy boolean subtract" on boolean dtype columns. + if pd.api.types.is_bool_dtype(df[col]) or df[col].nunique() <= 2: continue outliers = compute_outliers(df[col], threshold)