import unittest
import user_code as uc


def _dynamic_test(test_case, condition, success_message, failure_message):
    """Pass or fail `test_case` based on `condition`, relabelling it first.

    The test's `_testMethodName` attribute is overwritten with the message
    that matches the outcome before the assertion is made (presumably so the
    runner's report shows the message instead of the method name -- confirm
    against whatever consumes the report).

    Args:
        test_case: Object exposing `assertTrue` and `fail`, e.g. a
            `unittest.TestCase` instance.
        condition: Value whose truthiness decides the outcome.
        success_message: Label applied when `condition` is truthy.
        failure_message: Label applied, and passed to `fail`, when
            `condition` is falsy.
    """
    passed = bool(condition)
    test_case._testMethodName = success_message if passed else failure_message
    if passed:
        # Nothing is left to verify; this only records a passing assertion.
        test_case.assertTrue(True)
    else:
        test_case.fail(failure_message)


class TestFeaturePipeline(unittest.TestCase):
    """Checks the `features_count` and `features_df` objects exposed by `user_code`.

    Every test computes one boolean outcome and hands it to `_dynamic_test`
    together with the message to report on success and on failure.
    """

    # NOTE: the test methods are described with comments rather than
    # docstrings because unittest reports the first docstring line as the
    # test's description, which would change the reported output.

    def test_features_count(self):
        # `features_count` must be an `int` and strictly greater than zero.
        count = uc.features_count
        is_positive_int = isinstance(count, int) and count > 0
        _dynamic_test(
            self,
            is_positive_int,
            "The `features_count` is a positive integer.",
            "Expected `features_count` to be a positive integer.",
        )

    def test_airline_vec_column_exists(self):
        # `features_df.columns` must list a column named `AIRLINE_VEC`.
        column_names = uc.features_df.columns
        _dynamic_test(
            self,
            "AIRLINE_VEC" in column_names,
            "The `features_df` contains the `AIRLINE_VEC` column.",
            "Expected `features_df` to contain the `AIRLINE_VEC` column.",
        )

    def test_features_scaled_column_exists(self):
        # `features_df.columns` must list a column named `FEATURES_SCALED`.
        column_names = uc.features_df.columns
        _dynamic_test(
            self,
            "FEATURES_SCALED" in column_names,
            "The `features_df` contains the `FEATURES_SCALED` column.",
            "Expected `features_df` to contain the `FEATURES_SCALED` column.",
        )

    def test_no_nulls_in_delay(self):
        # Count the rows whose `Delay` value is null; the test passes only
        # when there are none. Assumes `features_df` is a PySpark DataFrame
        # -- TODO confirm against `user_code`.
        from pyspark.sql.functions import col

        frame = uc.features_df
        rows_missing_delay = frame.filter(col("Delay").isNull())
        null_count = rows_missing_delay.count()
        _dynamic_test(
            self,
            null_count == 0,
            "The `features_df` has no null values in `Delay`.",
            f"Expected no nulls in `Delay`, but found {null_count}.",
        )


# Run the tests defined in this module when the file is executed directly
# (not when it is imported).
if __name__ == "__main__":
    unittest.main()

test_main.py

Master the essential techniques of feature engineering using PySpark, from handling categorical and numerical data to building robust machine learning pipelines. This course guides you through practical methods for transforming raw data into meaningful features, preparing datasets for advanced analytics and machine learning workflows.

Explore the core concepts and practical techniques of feature engineering in PySpark, including handling different data types, building feature pipelines, and preparing datasets for machine learning.

Challenge: Building a Feature Pipeline for Customer Data

解答