Summary  
This chapter demonstrates how to use pandas methods to calculate basic descriptive statistics (mean, mode) for columns and to generate an overview of key dataset metrics with `describe()`.  

General domain of usage  
Data analysis

`pandas` biedt de handige methode `mean()` die het **gemiddelde** van alle waarden voor elke kolom berekent.
```python
df = pd.read_csv('file.csv')
mean_values = df.mean()
```

Je kunt dezelfde methode ook gebruiken om de gemiddelde waarde voor een specifieke kolom te bepalen:
```python
df = pd.read_csv('file.csv')
mean_values = df['column_name'].mean()
```

`pandas` biedt ook de methode `mode()`, die de **meest voorkomende waarde** in elke kolom identificeert.
```python
df = pd.read_csv('file.csv')
mode_values = df.mode()
```

Om de modus voor een specifieke kolom te vinden, wordt dezelfde methode gebruikt:
```python
df = pd.read_csv('file.csv')
mode_values = df['column_name'].mode()[0]
```

`.mode()` retourneert altijd een `Series`, omdat een kolom meerdere modi kan hebben. Gebruik `[0]` na `.mode()` om de eerste waarde te extraheren; zonder deze toevoeging krijg je de volledige `Series` terug.


Opmerking

Een andere nuttige methode in `pandas` is `describe()`.
```python
df = pd.read_csv('file.csv')
important_metrics = df.describe()
```

Deze methode geeft een **overzicht van verschillende statistieken** uit de dataset, waaronder:
- Aantal niet-ontbrekende waarden (`count`);
- Gemiddelde waarde;
- Standaarddeviatie;
- De minimale en maximale waarden;
- De 25e, 50e (mediaan) en 75e percentielen.

import unittest
import importlib
import pandas as pd


def _dynamic_test(test_case, condition, success_message, failure_message):
    """Record the outcome of one check on `test_case`.

    The test's `_testMethodName` is overwritten with the message matching the
    outcome (presumably so the reporting layer shows that message instead of
    the method name -- confirm against the runner), then the test is passed
    or failed with the same message.

    Args:
        test_case: The `unittest.TestCase` instance running the check.
        condition: Truthy when the check succeeded.
        success_message: Label and message used when `condition` is truthy.
        failure_message: Label and message used when `condition` is falsy.

    Raises:
        AssertionError: Via `test_case.fail` when `condition` is falsy.
    """
    passed = bool(condition)
    outcome_message = success_message if passed else failure_message
    test_case._testMethodName = outcome_message
    if passed:
        test_case.assertTrue(True, outcome_message)
    else:
        test_case.fail(outcome_message)


def get_first_differing_row(expected_df, actual_df):
    """Locate the first row position at which two DataFrames differ.

    Rows are compared by position after every cell is converted to `str`,
    so index labels, column names and dtypes are not part of the comparison.

    Args:
        expected_df: The reference DataFrame.
        actual_df: The DataFrame being checked.

    Returns:
        A tuple `(position, expected_row, actual_row)`:

        * a shared row differs: its position and both rows as value arrays;
        * one frame has more rows: the position of the first extra row, with
          `None` in place of the row the shorter frame lacks;
        * both frames have the same number of rows and no stringified row
          differs: `(None, None, None)`.
    """
    expected_rows = expected_df.astype(str).itertuples(index=False, name=None)
    actual_rows = actual_df.astype(str).itertuples(index=False, name=None)

    for position, (expected_row, actual_row) in enumerate(zip(expected_rows, actual_rows)):
        if expected_row != actual_row:
            return position, expected_df.iloc[position].values, actual_df.iloc[position].values

    expected_len, actual_len = len(expected_df), len(actual_df)
    if expected_len > actual_len:
        return actual_len, expected_df.iloc[actual_len].values, None
    if actual_len > expected_len:
        return expected_len, None, actual_df.iloc[expected_len].values

    # Same length and no differing row: indexing one past the end here would
    # raise IndexError, so report that no row could be singled out.
    return None, None, None


class TestUserCode(unittest.TestCase):
    """Checks the variables a learner's `user_code` module is expected to define.

    Each test imports `user_code` itself and reports its outcome through
    `_dynamic_test`. Reference values are recomputed from the wine CSV.

    Test methods are described with `#` comments rather than docstrings
    because unittest adds a test's docstring to its reported description.
    """

    # Dataset from which the reference values are computed.
    WINE_CSV_URL = (
        'https://codefinity-content-media.s3.eu-west-1.amazonaws.com/'
        'a43d24b6-df61-4e11-9c90-5b36552b3437/wine.csv')

    @classmethod
    def _load_wine_data(cls):
        """Read the wine dataset from `WINE_CSV_URL` into a DataFrame."""
        return pd.read_csv(cls.WINE_CSV_URL)

    @staticmethod
    def _values_match(actual, expected):
        """Return True only when `actual == expected` is unambiguously true.

        Comparing a Series, DataFrame or multi-element array with a scalar
        yields an element-wise result whose truth value raises `ValueError`;
        such values are treated as not matching instead of crashing the test.
        """
        try:
            return bool(actual == expected)
        except (ValueError, TypeError):
            return False

    def _check_declared(self, module, variable):
        """Pass when `module` has an attribute named `variable`."""
        _dynamic_test(
            self,
            hasattr(module, variable),
            f"The `{variable}` variable is declared.",
            f"Expected `{variable}` to be declared."
        )

    def _check_value(self, module, variable, expected_value):
        """Pass when `module.<variable>` equals `expected_value`."""
        try:
            actual_value = getattr(module, variable)
        except AttributeError:
            condition = False
            failure_message = f"The `{variable}` variable is not declared."
        else:
            condition = self._values_match(actual_value, expected_value)
            failure_message = f"Expected `{variable}` to be `{expected_value}`, but got `{actual_value}`."

        _dynamic_test(
            self,
            condition,
            f"`{variable}` contains the correct value.",
            failure_message
        )

    # `residual_sugar_mean` must exist in the learner's module.
    def test_residual_sugar_mean_is_declared(self):
        import user_code
        self._check_declared(user_code, 'residual_sugar_mean')

    # `residual_sugar_mean` must equal the mean of the 'residual sugar' column.
    def test_residual_sugar_mean_is_correct(self):
        import user_code

        wine_data = self._load_wine_data()
        expected_value = wine_data['residual sugar'].mean()
        self._check_value(user_code, 'residual_sugar_mean', expected_value)

    # `fixed_acidity_mode` must exist in the learner's module.
    def test_fixed_acidity_mode_is_declared(self):
        import user_code
        self._check_declared(user_code, 'fixed_acidity_mode')

    # `fixed_acidity_mode` must equal the first mode of the 'fixed acidity' column.
    def test_fixed_acidity_mode_is_correct(self):
        import user_code

        wine_data = self._load_wine_data()
        expected_value = wine_data['fixed acidity'].mode()[0]
        self._check_value(user_code, 'fixed_acidity_mode', expected_value)

    # `described_data` must exist in the learner's module.
    def test_described_data_is_declared(self):
        import user_code
        self._check_declared(user_code, 'described_data')

    # `described_data` must be a pandas DataFrame.
    def test_described_data_is_dataframe(self):
        import user_code

        try:
            described = user_code.described_data
        except AttributeError:
            condition = False
            failure_message = "The `described_data` variable is not declared."
        else:
            condition = isinstance(described, pd.DataFrame)
            failure_message = f"Expected `described_data` to be a `DataFrame`, but got `{type(described).__name__}`."

        _dynamic_test(
            self,
            condition,
            "The variable `described_data` is a `DataFrame`.",
            failure_message
        )

    # `described_data` must equal `describe()` of the wine dataset.
    def test_described_data_is_correct(self):
        import user_code

        wine_data = self._load_wine_data()
        expected_df = wine_data.describe()

        variable = 'described_data'
        actual_df = getattr(user_code, variable, None)
        condition = False
        if actual_df is None:
            failure_message = f"The `{variable}` variable is not declared."
        elif not isinstance(actual_df, pd.DataFrame):
            failure_message = f"`{variable}` is not a `DataFrame`."
        elif actual_df.empty:
            failure_message = f"`{variable}` is empty."
        elif actual_df.equals(expected_df):
            condition = True
            failure_message = None
        else:
            # The frames may differ only in column names, index labels or
            # dtypes. In that case the row comparison finds nothing to report:
            # it either signals this with a `None` position or runs past the
            # last row and raises IndexError. Both are handled here so the
            # learner still gets a readable failure instead of a crash.
            try:
                idx, expected_row, actual_row = get_first_differing_row(expected_df, actual_df)
            except IndexError:
                idx = None
            if idx is None:
                failure_message = (
                    f"The rows of `{variable}` match the expected values, but its "
                    "column names, index labels or dtypes differ from the expected summary.")
            else:
                failure_message = f"Expected `{variable}` to contain `{expected_row}` at row {idx}, but got `{actual_row}`."

        _dynamic_test(
            self,
            condition,
            f"`{variable}` contains the correct statistical summary.",
            failure_message
        )


# Run every test in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

test_main.py

Pandas is een uiterst gebruiksvriendelijke bibliotheek voor data-analyse. Het is ook ontworpen om grote datasets te verwerken, met behulp van datastructuren zoals DataFrame en Series. Dit maakt het een onmisbaar hulpmiddel voor Data Science. In deze gids maak je kennis met diverse statistische functies, waaronder het vinden van correlaties, modi, medianen en maximale en minimale waarden binnen een dataset. Je leert ook hoe je omgaat met ontbrekende waarden en specifieke waarden kunt manipuleren en verwijderen.

Verdiep je in de basisprincipes van de pandas-bibliotheek: het aanmaken en manipuleren van Series en DataFrames, het verkennen van hun structuur en het opbouwen van vertrouwen in het uitvoeren van essentiële data-operaties.

Beheers het laden en beheren van gegevens uit CSV- en TXT-bestanden. Verkrijg praktische ervaring met het importeren van datasets en het voorbereiden ervan voor verdere analyse met behulp van pandas-tools.

Leer hoe je gegevens effectief kunt verkennen, opschonen en samenvatten. Begrijp hoe je omgaat met ontbrekende waarden, inzichten verkrijgt en basisstatistische bewerkingen uitvoert met pandas.

De Data Beschrijven

Oplossing