Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,4 @@ def test_boolean_dtype_excluded_from_raw_data(self):
results = IQROutliersBarPlot(vm_dataset)
raw_data = results[-1]

self.assertNotIn("flag", raw_data.outlier_counts_by_feature.index)
self.assertNotIn("flag", raw_data.outlier_counts_by_feature.columns)
22 changes: 22 additions & 0 deletions tests/unit_tests/data_validation/test_IQROutliersTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,25 @@ def test_binary_exclusion(self):
# Verify binary column is not in results
for summary in outliers_summary:
self.assertNotIn("binary", summary["Variable"])

def test_boolean_dtype_excluded_from_raw_data(self):
    """Check that a boolean-dtype column is absent from the summary and raw data."""
    n_samples = 100

    # Start from a three-valued baseline, then overwrite the first four
    # entries with extreme values on both sides.
    base_values = np.array([1.0] * 25 + [2.0] * 50 + [3.0] * 25)
    values_with_extremes = base_values.copy()
    values_with_extremes[0:4] = [-15, -10, 10, 15]

    # Boolean-dtype column that the test expects to be excluded.
    bool_flags = np.random.choice([True, False], n_samples)

    frame = pd.DataFrame(
        {"with_outliers": values_with_extremes, "flag": bool_flags}
    )
    vm_dataset = vm.init_dataset(
        input_id="test_boolean_dataset", dataset=frame, __log=False
    )

    result, raw_data = IQROutliersTable(vm_dataset)
    reported_variables = [
        row["Variable"]
        for row in result["Summary of Outliers Detected by IQR Method"]
    ]

    self.assertNotIn("flag", reported_variables)
    self.assertNotIn("flag", raw_data.all_outliers)
8 changes: 6 additions & 2 deletions validmind/tests/data_validation/IQROutliersBarPlot.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from typing import Tuple

import pandas as pd
import plotly.graph_objects as go

from validmind import RawData, tags, tasks
Expand Down Expand Up @@ -76,9 +77,12 @@ def IQROutliersBarPlot(
"""
df = dataset.df

# Exclude binary/boolean features (IQR is not meaningful and quantile fails on bool)
# Exclude boolean and binary features. The IQR is not meaningful for them and
# `quantile` raises "numpy boolean subtract" on boolean dtype columns.
eligible_columns = [
col for col in dataset.feature_columns_numeric if len(df[col].unique()) > 2
col
for col in dataset.feature_columns_numeric
if not pd.api.types.is_bool_dtype(df[col]) and df[col].nunique() > 2
]

figures = []
Expand Down
7 changes: 5 additions & 2 deletions validmind/tests/data_validation/IQROutliersTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from typing import Any, Dict, Tuple

import pandas as pd

from validmind import RawData, tags, tasks
from validmind.vm_models import VMDataset

Expand Down Expand Up @@ -72,8 +74,9 @@ def IQROutliersTable(
all_outliers = {}

for col in dataset.feature_columns_numeric:
# Skip binary features
if len(df[col].unique()) <= 2:
# Skip boolean and binary features. The IQR is not meaningful for them and
# `quantile` raises "numpy boolean subtract" on boolean dtype columns.
if pd.api.types.is_bool_dtype(df[col]) or df[col].nunique() <= 2:
continue

outliers = compute_outliers(df[col], threshold)
Expand Down
Loading