import unittest
import importlib
import re
import types
from nltk.tokenize import RegexpTokenizer


# Helper function to require a successful test
def _require_success(test_case, condition, failure_message):
    """Abort the running test unless ``condition`` is truthy.

    Nothing happens when ``condition`` holds. Otherwise the test is renamed to
    ``failure_message`` and failed, with the same text attached to the raised
    error so it does not read as a bare ``None``.

    Args:
        test_case: Object exposing ``_testMethodName`` and ``fail``; this file
            always passes a ``unittest.TestCase`` instance.
        condition: Value evaluated for truthiness.
        failure_message: Text describing the requirement that was not met.

    Raises:
        Whatever ``test_case.fail`` raises (``AssertionError`` for a stock
        ``unittest.TestCase``).
    """
    if condition:
        return
    # Rename the test so its reported name is the failure description.
    test_case._testMethodName = failure_message
    # Also hand the text to fail() so the error itself carries the message.
    test_case.fail(failure_message)


# Helper function to check if test is successful
def _check_success(test_case, condition, success_message, failure_message):
    """Rename the running test after its outcome and fail it when needed.

    When ``condition`` is truthy the test is renamed to ``success_message`` and
    left to pass. Otherwise it is renamed to ``failure_message`` and failed,
    with the same text attached to the raised error.

    Args:
        test_case: Object exposing ``_testMethodName`` and ``fail``; this file
            always passes a ``unittest.TestCase`` instance.
        condition: Value evaluated for truthiness.
        success_message: Name given to the test when the check passes.
        failure_message: Name given to the test, and message of the raised
            error, when the check does not pass.

    Raises:
        Whatever ``test_case.fail`` raises (``AssertionError`` for a stock
        ``unittest.TestCase``).
    """
    if condition:
        # Rename the test for better readability in test reports.
        test_case._testMethodName = success_message
        return
    # Rename the test so its reported name is the failure description.
    test_case._testMethodName = failure_message
    # Also hand the text to fail() so the error itself carries the message.
    test_case.fail(failure_message)

class TestUserCode(unittest.TestCase):
    """Checks the learner's ``user_code`` module for the regex tokenization task.

    Every test reports its outcome through ``_require_success`` /
    ``_check_success``, which rename the running test so that the reported
    test name reads as a success or failure message.

    The per-test descriptions below are deliberately ``#`` comments rather
    than docstrings: ``unittest`` uses a test method's docstring as its short
    description, which would change what the runner prints.
    """

    # Test that everything is imported correctly
    def test_imports(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "RegexpTokenizer"),
            "`RegexpTokenizer` is not declared"
        )
        # Identity is the intended check: the name must be bound to the very
        # class object exported by nltk.tokenize.
        _require_success(
            self,
            user_code.RegexpTokenizer is RegexpTokenizer,
            "`RegexpTokenizer` is declared, but it's not `nltk.tokenize.RegexpTokenizer`"
        )
        self._testMethodName = "imports are correct"

    # Test that message is declared and has the correct value
    def test_message(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message"),
            "`message` is not declared"
        )
        expected = "Amazing event at #NLPConference_20! Over 1000 attendees from 20+ countries. #Networking #Tech"
        _check_success(
            self,
            user_code.message == expected,
            "`message` contains correct value",
            "`message` shouldn't be modified"
        )

    # Test that message_lower is declared and is lowercase
    def test_message_lower(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message"),
            "`message` is not declared"
        )
        # Guard the .lower() call below against a non-string `message`.
        _require_success(
            self,
            isinstance(user_code.message, str),
            "`message` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "message_lower"),
            "`message_lower` is not declared"
        )
        _check_success(
            self,
            user_code.message_lower == user_code.message.lower(),
            "`message_lower` is computed correctly",
            "`message_lower` is not the lowercase of `message`"
        )

    # Test that word_tokenizer is declared correctly with the right pattern
    def test_word_tokenizer(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "word_tokenizer"),
            "`word_tokenizer` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.word_tokenizer, RegexpTokenizer),
            "`word_tokenizer` is not an instance of `RegexpTokenizer`"
        )
        # NOTE(review): relies on the tokenizer's non-public `_pattern`
        # attribute holding the raw pattern string - confirm against the
        # installed NLTK version.
        _check_success(
            self,
            user_code.word_tokenizer._pattern == r"\w+",
            "`word_tokenizer` is defined correctly",
            "`word_tokenizer`'s pattern is incorrect"
        )

    # Test that words is declared and correctly tokenizes the lowercase message
    def test_words(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message_lower"),
            "`message_lower` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.message_lower, str),
            "`message_lower` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "word_tokenizer"),
            "`word_tokenizer` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.word_tokenizer, RegexpTokenizer),
            "`word_tokenizer` is not an instance of `RegexpTokenizer`"
        )
        _require_success(
            self,
            hasattr(user_code, "words"),
            "`words` is not declared"
        )
        # Compare against what the learner's own tokenizer yields for their
        # own lowercase message.
        _check_success(
            self,
            user_code.words == user_code.word_tokenizer.tokenize(user_code.message_lower),
            "`words` is computed correctly",
            "`message_lower` is not correctly tokenized into words"
        )

    # Test that the print statement is present and unmodified
    def test_print(self):
        # Read as UTF-8 (Python's default source encoding) instead of the
        # platform-dependent default text encoding.
        with open("user_code.py", "r", encoding="utf-8") as f:
            user_code_text = f.read()
        # The file is closed before the check runs, so a failing check never
        # fires while the handle is still open.
        _check_success(
            self,
            re.search(r"print *\( *words *\)", user_code_text) is not None,
            "print statement is correct",
            "print statement shouldn't be modified"
        )


# Run every test in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

test_main.py

Explore the fundamentals of Natural Language Processing (NLP) by learning essential text preprocessing techniques and methods for representing text data. Gain practical experience with the tools used to clean, analyze, and interpret textual information. Develop the skills needed to transform raw language into structured insights, laying a strong foundation for advanced applications in artificial intelligence and machine learning.

Dive into the fundamentals of text preprocessing to prepare raw text for analysis. Learn how to tokenize text, filter out stop words, and customize tokenization with regular expressions.

Discover how words can be reduced to their base forms using stemming and lemmatization. Master part-of-speech tagging to enrich text with grammatical context and apply POS-aware lemmatization.

Learn how text can be represented with numbers using vector space models. Get hands-on experience by implementing and customizing two popular vector space models: bag of words and TF-IDF.

Gain a solid understanding of word embeddings and how they capture semantic meaning. Explore the CBOW (Continuous Bag of Words) and Skip-gram architectures used in Word2Vec, and implement them on your own.

Challenge: Tokenization with Regex

Solution

Awesome!

Challenge: Tokenization with Regex

Solution

Awesome!