In deze uitdaging wordt de volledige workflow toegepast die in de cursus is geleerd — van gegevensvoorbewerking tot training en modelbeoordeling.


import unittest
import pandas as pd
import numpy as np

def _dynamic_test(test_case, condition, success_message, failure_message):
    if condition:
        test_case._testMethodName = success_message
        test_case.assertTrue(True, success_message)
    else:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)

class TestPipelineWithGridSearch(unittest.TestCase):
    """Checks a learner's ``user_code`` module for the pipeline challenge.

    Each test inspects module-level names of ``user_code`` (``label_enc``,
    ``y``, ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``ct``,
    ``param_grid``, ``grid_search``, ``pipe``) and reports the outcome through
    ``_dynamic_test``. A missing name is reported as a failed check with its
    failure message rather than as an unexpected error.
    """

    @classmethod
    def setUpClass(cls):
        """Load the penguins dataset and import ``user_code`` once per class."""
        cls.df = pd.read_csv(
            'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/a65bbc96-309e-4df9-a790-a1eb8c815a1c/penguins.csv'
        )
        # Keep only rows with fewer than two missing values.
        cls.df = cls.df[cls.df.isna().sum(axis=1) < 2]
        import user_code
        # Expected names: label_enc, y, X_train, X_test, y_train, y_test,
        # ct, param_grid, grid_search, pipe
        cls.uc = user_code

    @staticmethod
    def _column_set(cols):
        """Return the column selector *cols* as a set of names, or ``None``.

        A single string is treated as one column name; any other iterable is
        converted with ``set``. Selectors that cannot be turned into a set
        (for example callables or slices) yield ``None``.
        """
        if isinstance(cols, str):
            return {cols}
        try:
            return set(cols)
        except TypeError:
            return None

    def test_target_encoded_with_labelencoder(self):
        """``label_enc`` is a fitted LabelEncoder and ``y`` is 1-D integer."""
        from sklearn.preprocessing import LabelEncoder
        uc = self.uc
        label_enc = getattr(uc, 'label_enc', None)
        y = getattr(uc, 'y', None)
        cond = False
        # Guard both names so a missing `y` fails the check instead of raising.
        if isinstance(label_enc, LabelEncoder) and y is not None:
            y_arr = np.asarray(y)
            cond = (
                hasattr(label_enc, 'classes_')  # only present after fitting
                and y_arr.ndim == 1
                and np.issubdtype(y_arr.dtype, np.integer)
            )
        _dynamic_test(
            self,
            cond,
            "Target is encoded with LabelEncoder to integer dtype",
            "Target must be encoded with LabelEncoder to integer dtype"
        )

    def test_train_test_split_33_percent(self):
        """The test split holds roughly 33% of the samples."""
        uc = self.uc
        required = ('X_train', 'X_test', 'y_train', 'y_test')
        cond = False
        if all(hasattr(uc, name) for name in required):
            n_total = len(uc.X_train) + len(uc.X_test)
            expected_test = int(round(0.33 * n_total))
            # X and y must cover the same number of samples; a tolerance of
            # two samples absorbs rounding differences in the split size.
            cond = (
                len(uc.y_train) + len(uc.y_test) == n_total
                and abs(len(uc.X_test) - expected_test) <= 2
            )
        _dynamic_test(
            self,
            cond,
            "Data split uses approximately 33% for the test set",
            "Data must be split with test_size=0.33"
        )

    def test_columntransformer_ohe_passthrough(self):
        """``ct`` one-hot encodes exactly island/sex and passes the rest."""
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OneHotEncoder
        uc = self.uc
        ct = getattr(uc, 'ct', None)
        is_ct = isinstance(ct, ColumnTransformer)
        # Column selectors are normalised through _column_set so that arrays
        # or other unhashable selectors cannot raise while being compared.
        found_ohe = is_ct and any(
            isinstance(trans, OneHotEncoder)
            and self._column_set(cols) == {'island', 'sex'}
            for _, trans, cols in ct.transformers
        )
        cond = is_ct and found_ohe and getattr(ct, 'remainder', None) == 'passthrough'
        _dynamic_test(
            self,
            cond,
            "ColumnTransformer applies OneHotEncoder to ['island', 'sex'] with remainder='passthrough'",
            "ColumnTransformer must OneHotEncode ['island', 'sex'] and set remainder='passthrough'"
        )

    def test_param_grid_values(self):
        """``param_grid['n_neighbors']`` is a non-empty run of odd integers."""
        uc = self.uc
        pg = getattr(uc, 'param_grid', None)

        cond = False
        if isinstance(pg, dict) and 'n_neighbors' in pg:
            try:
                values = list(pg['n_neighbors'])
            except TypeError:
                # A scalar (or other non-iterable) is not a grid of values.
                values = []
            # Accept NumPy integers too (e.g. from np.arange); bool is a
            # subclass of int but is not a neighbour count, so reject it.
            cond = bool(values) and all(
                isinstance(v, (int, np.integer))
                and not isinstance(v, bool)
                and v % 2 != 0
                for v in values
            )

        _dynamic_test(
            self,
            cond,
            "param_grid defines odd values for n_neighbors, allowing experimentation",
            "param_grid must be a dict and include 'n_neighbors' with a list of odd integer values"
        )

    def test_gridsearchcv_with_knn(self):
        """``grid_search`` is a GridSearchCV wrapping KNeighborsClassifier."""
        from sklearn.model_selection import GridSearchCV
        from sklearn.neighbors import KNeighborsClassifier
        uc = self.uc
        gs = getattr(uc, 'grid_search', None)
        # Only the estimator type is inspected here; the grid contents are
        # covered by test_param_grid_values.
        cond = isinstance(gs, GridSearchCV) and isinstance(getattr(gs, 'estimator', None), KNeighborsClassifier)
        _dynamic_test(
            self,
            cond,
            "GridSearchCV is initialized with KNeighborsClassifier and given param_grid",
            "GridSearchCV must be initialized with KNeighborsClassifier and given param_grid"
        )

    def test_pipeline_structure_and_order(self):
        """``pipe`` has exactly the four expected steps, in order."""
        # Expected order: columntransformer -> simpleimputer -> standardscaler -> gridsearchcv
        uc = self.uc
        pipe = getattr(uc, 'pipe', None)
        # A missing `pipe`, or one without `.steps`, yields an empty list so
        # the check fails with its message instead of raising.
        steps = getattr(pipe, 'steps', None) or []
        try:
            step_names = [name for name, _ in steps]
        except (TypeError, ValueError):
            step_names = []
        expected = ['columntransformer', 'simpleimputer', 'standardscaler', 'gridsearchcv']
        cond = step_names == expected
        _dynamic_test(
            self,
            cond,
            "Pipeline steps are in order: ColumnTransformer, SimpleImputer, StandardScaler, GridSearchCV",
            "Pipeline steps must be: ColumnTransformer, SimpleImputer, StandardScaler, GridSearchCV"
        )

    def test_fitted_on_train_and_scores_on_test(self):
        """``pipe.score`` on the test split returns a finite float."""
        uc = self.uc
        try:
            score = uc.pipe.score(uc.X_test, uc.y_test)
            cond = isinstance(score, (float, np.floating)) and np.isfinite(score)
        except Exception:
            # Deliberately broad: any error from learner code (missing names,
            # an unfitted pipeline, ...) counts as a failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Pipeline is fitted on train and computes a finite score on the test set",
            "Pipeline must be fitted on train and compute a finite score on the test set"
        )

    def test_predictions_returned_for_X_test(self):
        """``pipe.predict`` returns one prediction per row of ``X_test``."""
        uc = self.uc
        try:
            y_pred = uc.pipe.predict(uc.X_test)
            cond = isinstance(y_pred, (np.ndarray, list)) and len(y_pred) == len(uc.X_test)
        except Exception:
            # Deliberately broad: any error from learner code counts as a
            # failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Pipeline returns predictions for X_test",
            "Pipeline must return predictions for X_test"
        )

    def test_best_estimator_available(self):
        """``grid_search.best_estimator_`` exists and is not ``None``."""
        uc = self.uc
        try:
            be = uc.grid_search.best_estimator_
            cond = be is not None
        except Exception:
            # Deliberately broad: a missing grid_search or attribute counts
            # as a failed check.
            cond = False
        _dynamic_test(
            self,
            cond,
            "Best estimator is available via grid_search.best_estimator_",
            "Best estimator must be available via grid_search.best_estimator_"
        )

# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

test_code.py

Machine learning wordt tegenwoordig overal toegepast. Wil je het zelf leren? Deze cursus is een introductie tot de wereld van machine learning waarin je basisconcepten leert, werkt met Scikit-learn – de populairste bibliotheek voor ML – en je eerste machine learning-project bouwt.
Deze cursus is bedoeld voor studenten met basiskennis van Python, Pandas en Numpy.

Leer de concepten van machine learning en de workflow van een ML-project.

Preprocessing is waarschijnlijk de belangrijkste fase van een ML-project. Dit hoofdstuk behandelt de preprocessing-stappen die nodig zijn voor vrijwel elke dataset.

Een pipeline is een overzichtelijke manier om alle preprocessing-stappen en een model te combineren. Pipelines maken het veel eenvoudiger om een model te trainen en te gebruiken.

Modellering is de meest interessante fase van een ML-project. Leer het model opbouwen, verfijnen en evalueren!

Uitdaging: Alles Samenbrengen

Oplossing