diff --git a/tests/unit_tests/model_validation/sklearn/test_MinimumF1Score.py b/tests/unit_tests/model_validation/sklearn/test_MinimumF1Score.py
new file mode 100644
index 000000000..537609a6c
--- /dev/null
+++ b/tests/unit_tests/model_validation/sklearn/test_MinimumF1Score.py
@@ -0,0 +1,76 @@
+import unittest
+
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score
+
+import validmind as vm
+from validmind.tests.model_validation.sklearn.MinimumF1Score import MinimumF1Score
+
+
+def _dataset_with_predictions(input_id, y_true, y_pred):
+    """Build a VMDataset whose predictions are injected verbatim.
+
+    A fitted model is only needed to construct a valid VMModel; the predictions
+    are supplied via ``prediction_values`` so the model's ``predict`` is never
+    called and the true/predicted label sets can be controlled exactly.
+    """
+    df = pd.DataFrame(
+        {
+            "f1": np.linspace(-1.0, 1.0, len(y_true)),
+            "f2": np.linspace(1.0, -1.0, len(y_true)),
+            "target": y_true,
+        }
+    )
+    dataset = vm.init_dataset(
+        input_id=input_id, dataset=df, target_column="target", __log=False
+    )
+
+    model = LogisticRegression(max_iter=1000)
+    model.fit(df[["f1", "f2"]].to_numpy(), np.array(y_true))
+    vm_model = vm.init_model(input_id=f"{input_id}_model", model=model, __log=False)
+
+    dataset.assign_predictions(model=vm_model, prediction_values=y_pred)
+    return dataset, vm_model
+
+
+class TestMinimumF1Score(unittest.TestCase):
+    def test_predicted_class_absent_from_true_labels(self):
+        # Regression test for ZD-704. This split's true labels are binary ({0, 1}),
+        # but the model predicts a third class. scikit-learn's f1_score derives the
+        # target type from the union of y_true and y_pred, so deciding the averaging
+        # mode from y_true alone selects average="binary" and raises
+        # "Target is multiclass but average='binary'". MinimumF1Score must instead
+        # detect the multiclass label space and use macro averaging.
+        y_true = [0, 1, 1, 0, 1, 0, 1, 0]
+        y_pred = [0, 1, 2, 0, 1, 0, 2, 0]  # class 2 never appears in y_true
+        dataset, model = _dataset_with_predictions("f1_multiclass_pred", y_true, y_pred)
+
+        result = MinimumF1Score(dataset, model, min_threshold=0.5)
+
+        score = result[0][0]["Score"]
+        expected = f1_score(np.array(y_true), np.array(y_pred), average="macro")
+        # Matching the macro value confirms the multiclass branch was taken (the
+        # binary branch would have raised rather than returned a number).
+        self.assertAlmostEqual(score, expected)
+        self.assertEqual(result[1], score > 0.5)
+
+    def test_binary_uses_binary_average(self):
+        # A genuinely binary problem (true and predicted labels both within {0, 1})
+        # must still use sklearn's default binary averaging and be unaffected by the
+        # fix.
+        y_true = [0, 1, 1, 0, 1, 0]
+        y_pred = [0, 1, 0, 0, 1, 1]
+        dataset, model = _dataset_with_predictions("f1_binary", y_true, y_pred)
+
+        result = MinimumF1Score(dataset, model, min_threshold=0.5)
+
+        score = result[0][0]["Score"]
+        expected = f1_score(np.array(y_true), np.array(y_pred))  # binary default
+        self.assertAlmostEqual(score, expected)
+        self.assertEqual(result[1], score > 0.5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/validmind/tests/model_validation/sklearn/MinimumF1Score.py b/validmind/tests/model_validation/sklearn/MinimumF1Score.py
index eaeb84bad..5deb83500 100644
--- a/validmind/tests/model_validation/sklearn/MinimumF1Score.py
+++ b/validmind/tests/model_validation/sklearn/MinimumF1Score.py
@@ -58,10 +58,19 @@ def MinimumF1Score(
     closely with specific requirements.
     """
 
-    if len(np.unique(dataset.y)) > 2:
-        score = f1_score(dataset.y, dataset.y_pred(model), average="macro")
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Decide the averaging method from the labels f1_score actually sees -- the
+    # union of y_true and y_pred. Inspecting y_true alone is not enough: a split
+    # whose true labels collapse to <=2 classes while the model predicts a third
+    # class is still multiclass to sklearn, and average="binary" would raise
+    # "Target is multiclass but average='binary'".
+    n_classes = len(np.unique(np.concatenate([y_true, y_pred])))
+    if n_classes > 2:
+        score = f1_score(y_true, y_pred, average="macro")
     else:
-        score = f1_score(dataset.y, dataset.y_pred(model))
+        score = f1_score(y_true, y_pred)
 
     return (
         [
diff --git a/validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py b/validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py
index 321218404..c6b2bd76a 100644
--- a/validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py
+++ b/validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py
@@ -63,6 +63,7 @@ def PrecisionRecallCurve(
         raise SkipTestError("Skipping PrecisionRecallCurve for Foundation models")
 
     y_true = dataset.y
+    # Binary-only by design: multiclass is skipped, not handled (unlike MinimumF1Score).
     if len(np.unique(y_true)) > 2:
         raise SkipTestError(
             "Precision Recall Curve is only supported for binary classification models"
diff --git a/validmind/tests/model_validation/sklearn/ROCCurve.py b/validmind/tests/model_validation/sklearn/ROCCurve.py
index 7f8c99690..6109ca1d8 100644
--- a/validmind/tests/model_validation/sklearn/ROCCurve.py
+++ b/validmind/tests/model_validation/sklearn/ROCCurve.py
@@ -68,6 +68,7 @@ def ROCCurve(model: VMModel, dataset: VMDataset) -> Tuple[go.Figure, RawData]:
     incorrect, provided that the model's ranking format is retained. This phenomenon is commonly termed the "Class
     Imbalance Problem".
     """
+    # Binary-only by design: multiclass is skipped, not handled (unlike MinimumF1Score).
     if len(np.unique(dataset.y)) > 2:
         raise SkipTestError(
             "ROC Curve is only supported for binary classification models"