Jetzt vergleichen Sie die bisher behandelten Modelle anhand eines einzelnen Datensatzes – des **Breast Cancer Dataset**. Die Zielvariable ist die Spalte `'diagnosis'`, wobei `1` für maligne und `0` für benigne Fälle steht.

Sie wenden `GridSearchCV` auf jedes Modell an, um die besten Parameter zu finden. In dieser Aufgabe verwenden Sie **Recall** als Bewertungsmetrik, da das **Minimieren von False Negatives** entscheidend ist. Damit `GridSearchCV` die besten Parameter basierend auf dem Recall auswählt, setzen Sie `scoring='recall'`.

import unittest
import importlib


def _dynamic_test(test_case, condition, success_message, failure_message):
    """Report the outcome of a check on `test_case` under a descriptive name.

    The test case's `_testMethodName` is overwritten with the message that
    matches the outcome (presumably so the grading harness displays that
    message instead of the Python method name -- confirm against the runner).

    Args:
        test_case: The running test case; must provide `assertTrue` and `fail`.
        condition: Truthy when the check passed, falsy otherwise.
        success_message: Name/message used when the check passed.
        failure_message: Name/message used when the check failed.
    """
    passed = bool(condition)
    outcome_message = success_message if passed else failure_message

    # Rename first: `fail` is expected to raise, so nothing after it would run.
    test_case._testMethodName = outcome_message

    if not passed:
        test_case.fail(outcome_message)
        return

    # Trivially true assertion that only carries the success message.
    test_case.assertTrue(True, outcome_message)


class TestUserCode(unittest.TestCase):
    """Checks the parameter grids and `GridSearchCV` objects defined in `user_code`.

    For each of the four models (k-NN, logistic regression, decision tree,
    random forest) there are two tests:

    * `test_<model>_params_is_correct` -- the `<model>_params` dict equals the
      expected grid;
    * `test_<model>_grid_is_correct` -- `<model>_grid` is a `GridSearchCV`
      wrapping the expected estimator class with the expected `param_grid`.

    Outcomes are reported through the module-level `_dynamic_test` helper.
    """

    # Expected grids, shared by the `*_params` and `*_grid` tests so the two
    # sets of expectations cannot drift apart.
    _KNN_PARAMS = {'n_neighbors': [3, 5, 7, 12]}
    _LR_PARAMS = {'C': [0.1, 1, 10]}
    _DT_PARAMS = {'max_depth': [2, 4, 6, 10], 'min_samples_leaf': [1, 2, 4, 7]}
    _RF_PARAMS = {'max_depth': [2, 4, 6], 'n_estimators': [20, 50, 100]}

    def _check_params(self, variable, expected_value):
        """Verify that `user_code.<variable>` is a dict equal to `expected_value`.

        Args:
            variable: Name of the attribute to look up on `user_code`.
            expected_value: The dict the attribute must compare equal to.
        """
        # Imported lazily so an import problem surfaces inside the test.
        import user_code

        # A missing attribute and an attribute set to `None` are both
        # reported as "not declared".
        actual_value = getattr(user_code, variable, None)
        if actual_value is None:
            condition = False
            failure_message = f"The `{variable}` variable is not declared."
        elif isinstance(actual_value, dict):
            condition = actual_value == expected_value
            failure_message = (
                f"Expected `{variable}` to contain `{expected_value}`, "
                f"but got `{actual_value}`."
            )
        else:
            condition = False
            failure_message = f"`{variable}` is not a `dict`."

        _dynamic_test(
            self,
            condition,
            f"`{variable}` contains the correct values.",
            failure_message,
        )

    def _check_grid(self, variable, estimator_cls, estimator_name, params_name, param_grid):
        """Verify that `user_code.<variable>` is a correctly configured `GridSearchCV`.

        Args:
            variable: Name of the attribute to look up on `user_code`.
            estimator_cls: Class the search's `estimator` must be an instance of.
            estimator_name: Variable name shown for the estimator in the
                success message (e.g. `'knn'`).
            params_name: Variable name shown for the parameter grid in the
                success message (e.g. `'knn_params'`).
            param_grid: The grid the search's `param_grid` must equal.
        """
        # Imported lazily so an import problem surfaces inside the test.
        import user_code
        from sklearn.model_selection import GridSearchCV

        # NOTE(review): the exercise text asks for `scoring='recall'`, but
        # `scoring` is not verified here -- confirm whether that is intentional.
        actual_value = getattr(user_code, variable, None)
        if actual_value is None:
            condition = False
            failure_message = f"The `{variable}` variable is not declared."
        elif isinstance(actual_value, GridSearchCV):
            condition = (
                isinstance(actual_value.estimator, estimator_cls)
                and actual_value.param_grid == param_grid
            )
            failure_message = (
                f"Expected `{variable}` to be a `GridSearchCV` with "
                f"`estimator={estimator_cls.__name__}()`, `param_grid={param_grid}`, "
                f"but got `estimator={actual_value.estimator}`, "
                f"`param_grid={actual_value.param_grid}`."
            )
        else:
            condition = False
            failure_message = f"`{variable}` is not a `GridSearchCV`."

        # The success message names the grid *variable* (`<model>_params`);
        # it previously repeated the name of the search object itself.
        _dynamic_test(
            self,
            condition,
            f"`{variable}` is a `GridSearchCV` with `estimator={estimator_name}` "
            f"and `param_grid={params_name}`.",
            failure_message,
        )

    # The test methods deliberately carry no docstrings: unittest includes a
    # test's docstring in its verbose description, which could change what
    # the harness displays.

    # --- parameter grids -------------------------------------------------

    def test_knn_params_is_correct(self):
        self._check_params('knn_params', self._KNN_PARAMS)

    def test_lr_params_is_correct(self):
        self._check_params('lr_params', self._LR_PARAMS)

    def test_dt_params_is_correct(self):
        self._check_params('dt_params', self._DT_PARAMS)

    def test_rf_params_is_correct(self):
        self._check_params('rf_params', self._RF_PARAMS)

    # --- GridSearchCV objects --------------------------------------------

    def test_knn_grid_is_correct(self):
        from sklearn.neighbors import KNeighborsClassifier

        self._check_grid(
            'knn_grid', KNeighborsClassifier, 'knn', 'knn_params', self._KNN_PARAMS
        )

    def test_lr_grid_is_correct(self):
        from sklearn.linear_model import LogisticRegression

        self._check_grid(
            'lr_grid', LogisticRegression, 'lr', 'lr_params', self._LR_PARAMS
        )

    def test_dt_grid_is_correct(self):
        from sklearn.tree import DecisionTreeClassifier

        self._check_grid(
            'dt_grid', DecisionTreeClassifier, 'dt', 'dt_params', self._DT_PARAMS
        )

    def test_rf_grid_is_correct(self):
        from sklearn.ensemble import RandomForestClassifier

        self._check_grid(
            'rf_grid', RandomForestClassifier, 'rf', 'rf_params', self._RF_PARAMS
        )


# Run every test in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

test_main.py

Beherrschen Sie die grundlegenden Klassifikationsalgorithmen, die moderne Machine-Learning-Anwendungen antreiben. Erforschen Sie, wie Modelle wie k-NN, logistische Regression, Entscheidungsbäume und Random Forests Vorhersagen treffen, wie deren Genauigkeit bewertet wird und wann welches Modell eingesetzt wird. Entwickeln Sie die Fähigkeiten, Modelle zu vergleichen und das beste für Ihre Daten auszuwählen.

Erfahren Sie, wie der k-Nächste-Nachbarn-Algorithmus Vorhersagen auf Basis von Ähnlichkeiten trifft. Lernen Sie den Umgang mit mehreren Merkmalen, die Parametereinstellung und die Anwendung von Kreuzvalidierung zur Verbesserung der Genauigkeit.

Verstehen Sie, wie die logistische Regression Wahrscheinlichkeiten modelliert und Ergebnisse klassifiziert. Wenden Sie die Implementierung an, interpretieren Sie Entscheidungsgrenzen und setzen Sie Regularisierung zur Vermeidung von Overfitting ein.

Erfahren Sie, wie Entscheidungsbäume Daten anhand von Merkmalswerten in sinnvolle Gruppen unterteilen. Untersuchen Sie, wie Parameter wie Baumtiefe und minimale Stichprobengröße pro Blatt die Modellleistung und Generalisierung beeinflussen.

Erkunden Sie, wie Random Forests mehrere Entscheidungsbäume kombinieren, um Genauigkeit und Robustheit zu verbessern. Verstehen Sie die Rolle des Zufalls und wenden Sie diese Ensemble-Methode auf reale Daten an.

Bewertung von Modellen anhand von Metriken wie Genauigkeit, Präzision, Recall und F1-Score. Interpretation von Konfusionsmatrizen und Vergleich mehrerer Klassifikatoren zur Identifikation des leistungsstärksten Modells.

Herausforderung: Vergleich von Modellen

Lösung