Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions tests/unit_tests/model_validation/sklearn/test_MinimumF1Score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import unittest

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import validmind as vm
from validmind.tests.model_validation.sklearn.MinimumF1Score import MinimumF1Score


def _dataset_with_predictions(input_id, y_true, y_pred):
    """Return a ``(dataset, model)`` pair whose predictions are hand-picked.

    The logistic regression is fitted solely so that a valid VMModel can be
    created. The predictions attached to the dataset come straight from
    ``y_pred`` (passed as ``prediction_values``), so the estimator's own
    ``predict`` is not consulted and the caller fully controls which labels
    appear on the true and predicted sides.

    Args:
        input_id: Identifier for the dataset; the model is registered under
            ``f"{input_id}_model"``.
        y_true: Ground-truth labels, stored in the ``target`` column.
        y_pred: Labels to attach verbatim as the model's predictions.

    Returns:
        A tuple ``(dataset, vm_model)``.
    """
    n_rows = len(y_true)
    # Two mirrored ramps are enough to give the estimator something to fit;
    # the feature values themselves play no role in the assertions.
    frame = pd.DataFrame(
        {
            "f1": np.linspace(-1.0, 1.0, n_rows),
            "f2": np.linspace(1.0, -1.0, n_rows),
            "target": y_true,
        }
    )
    vm_dataset = vm.init_dataset(
        input_id=input_id, dataset=frame, target_column="target", __log=False
    )

    features = frame[["f1", "f2"]].to_numpy()
    labels = np.array(y_true)
    estimator = LogisticRegression(max_iter=1000)
    estimator.fit(features, labels)
    vm_model = vm.init_model(
        input_id=f"{input_id}_model", model=estimator, __log=False
    )

    # Inject the caller-supplied predictions instead of computing them.
    vm_dataset.assign_predictions(model=vm_model, prediction_values=y_pred)
    return vm_dataset, vm_model


class TestMinimumF1Score(unittest.TestCase):
    """Checks how MinimumF1Score picks its F1 averaging mode."""

    def test_predicted_class_absent_from_true_labels(self):
        """A predicted label unseen in y_true must trigger macro averaging."""
        # Regression test for ZD-704. The ground truth here is binary ({0, 1})
        # while the predictions contain a third class. scikit-learn's f1_score
        # infers the target type from y_true and y_pred together, so choosing
        # the averaging mode from y_true alone would pick average="binary" and
        # fail with "Target is multiclass but average='binary'". The test under
        # scrutiny has to recognise the multiclass label space and average
        # with "macro" instead.
        true_labels = [0, 1, 1, 0, 1, 0, 1, 0]
        predicted_labels = [0, 1, 2, 0, 1, 0, 2, 0]  # 2 is absent from true_labels
        vm_dataset, vm_model = _dataset_with_predictions(
            "f1_multiclass_pred", true_labels, predicted_labels
        )

        outcome = MinimumF1Score(vm_dataset, vm_model, min_threshold=0.5)

        reported = outcome[0][0]["Score"]
        macro_f1 = f1_score(
            np.array(true_labels), np.array(predicted_labels), average="macro"
        )
        # Agreement with the macro figure shows the multiclass path ran; the
        # binary path would have raised before producing any number.
        self.assertAlmostEqual(reported, macro_f1)
        self.assertEqual(outcome[1], reported > 0.5)

    def test_binary_uses_binary_average(self):
        """A purely binary problem keeps sklearn's default binary averaging."""
        # Both the true and the predicted labels stay inside {0, 1}, so the
        # fix must leave this case on the default averaging mode.
        true_labels = [0, 1, 1, 0, 1, 0]
        predicted_labels = [0, 1, 0, 0, 1, 1]
        vm_dataset, vm_model = _dataset_with_predictions(
            "f1_binary", true_labels, predicted_labels
        )

        outcome = MinimumF1Score(vm_dataset, vm_model, min_threshold=0.5)

        reported = outcome[0][0]["Score"]
        # No ``average`` argument: f1_score falls back to its binary default.
        binary_f1 = f1_score(np.array(true_labels), np.array(predicted_labels))
        self.assertAlmostEqual(reported, binary_f1)
        self.assertEqual(outcome[1], reported > 0.5)


# Run the unittest runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
15 changes: 12 additions & 3 deletions validmind/tests/model_validation/sklearn/MinimumF1Score.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,19 @@ def MinimumF1Score(
closely with specific requirements.
"""

if len(np.unique(dataset.y)) > 2:
score = f1_score(dataset.y, dataset.y_pred(model), average="macro")
y_true = dataset.y
y_pred = dataset.y_pred(model)

# Decide the averaging method from the labels f1_score actually sees -- the
# union of y_true and y_pred. Inspecting y_true alone is not enough: a split
# whose true labels collapse to <=2 classes while the model predicts a third
# class is still multiclass to sklearn, and average="binary" would raise
# "Target is multiclass but average='binary'".
n_classes = len(np.unique(np.concatenate([y_true, y_pred])))
if n_classes > 2:
score = f1_score(y_true, y_pred, average="macro")
else:
score = f1_score(dataset.y, dataset.y_pred(model))
score = f1_score(y_true, y_pred)

return (
[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def PrecisionRecallCurve(
raise SkipTestError("Skipping PrecisionRecallCurve for Foundation models")

y_true = dataset.y
# Binary-only by design: multiclass is skipped, not handled (unlike MinimumF1Score).
if len(np.unique(y_true)) > 2:
raise SkipTestError(
"Precision Recall Curve is only supported for binary classification models"
Expand Down
1 change: 1 addition & 0 deletions validmind/tests/model_validation/sklearn/ROCCurve.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def ROCCurve(model: VMModel, dataset: VMDataset) -> Tuple[go.Figure, RawData]:
incorrect, provided that the model's ranking format is retained. This phenomenon is commonly termed the "Class
Imbalance Problem".
"""
# Binary-only by design: multiclass is skipped, not handled (unlike MinimumF1Score).
if len(np.unique(dataset.y)) > 2:
raise SkipTestError(
"ROC Curve is only supported for binary classification models"
Expand Down
Loading