import unittest
import importlib
import re
import types
from nltk.tokenize import RegexpTokenizer


# Helper function to require a successful test
def _require_success(test_case, condition, failure_message):
    """Fail the running test unless a precondition holds.

    When ``condition`` is truthy nothing happens and the caller carries on
    with its remaining checks. Otherwise the test is renamed to
    ``failure_message`` and failed with that same message.

    Args:
        test_case: The running test. It must expose a writable
            ``_testMethodName`` attribute and a ``fail()`` method
            (a ``unittest.TestCase`` in this file).
        condition: Value whose truthiness decides whether the test continues.
        failure_message: Description used both as the new test name and as
            the failure message.

    Raises:
        Whatever ``test_case.fail`` raises (``AssertionError`` by default
        for ``unittest.TestCase``).
    """
    if condition:
        return
    # Rename the test with a failure description so the report shows it
    # in place of the method name.
    test_case._testMethodName = failure_message
    # Also pass the description to fail(), so the raised error is not an
    # anonymous "None" when it is read from a traceback.
    test_case.fail(failure_message)


# Helper function to check if test is successful
def _check_success(test_case, condition, success_message, failure_message):
    """Record the final outcome of a test under a readable name.

    The test is renamed to ``success_message`` when ``condition`` is truthy.
    Otherwise it is renamed to ``failure_message`` and failed with that same
    message.

    Args:
        test_case: The running test. It must expose a writable
            ``_testMethodName`` attribute and a ``fail()`` method
            (a ``unittest.TestCase`` in this file).
        condition: Value whose truthiness decides the outcome.
        success_message: Test name to use when the check passes.
        failure_message: Test name and failure message to use when the
            check does not pass.

    Raises:
        Whatever ``test_case.fail`` raises (``AssertionError`` by default
        for ``unittest.TestCase``).
    """
    if condition:
        # Rename the test for better readability in test reports.
        test_case._testMethodName = success_message
        return
    # Rename the test with a failure description.
    test_case._testMethodName = failure_message
    # Also pass the description to fail(), so the raised error is not an
    # anonymous "None" when it is read from a traceback.
    test_case.fail(failure_message)

class TestUserCode(unittest.TestCase):
    """Checks the learner's ``user_code`` module for the tokenization task.

    Each test imports ``user_code`` on its own and reports through the
    ``_require_success`` / ``_check_success`` helpers, which rename the
    running test so that the report shows a readable outcome.

    The per-test descriptions are kept as ``#`` comments on purpose:
    unittest uses a test method's docstring as its short description, which
    could alter what the report displays.
    """

    # Test that everything is imported correctly
    def test_imports(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "RegexpTokenizer"),
            "`RegexpTokenizer` is not declared"
        )
        # Identity check: the name must be bound to the very same class
        # object that nltk exports, not merely something equal to it.
        _check_success(
            self,
            user_code.RegexpTokenizer is RegexpTokenizer,
            "imports are correct",
            "`RegexpTokenizer` is declared, but it's not `nltk.tokenize.RegexpTokenizer`"
        )

    # Test that message is declared and has the correct value
    def test_message(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message"),
            "`message` is not declared"
        )
        expected = "Amazing event at #NLPConference_20! Over 1000 attendees from 20+ countries. #Networking #Tech"
        _check_success(
            self,
            user_code.message == expected,
            "`message` contains correct value",
            "`message` shouldn't be modified"
        )

    # Test that message_lower is declared and is lowercase
    def test_message_lower(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message"),
            "`message` is not declared"
        )
        # Guard the `.lower()` call below against a non-string `message`.
        _require_success(
            self,
            isinstance(user_code.message, str),
            "`message` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "message_lower"),
            "`message_lower` is not declared"
        )
        _check_success(
            self,
            user_code.message_lower == user_code.message.lower(),
            "`message_lower` is computed correctly",
            "`message_lower` is not the lowercase of `message`"
        )

    # Test that word_tokenizer is declared correctly with the right pattern
    def test_word_tokenizer(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "word_tokenizer"),
            "`word_tokenizer` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.word_tokenizer, RegexpTokenizer),
            "`word_tokenizer` is not an instance of `RegexpTokenizer`"
        )
        # NOTE(review): `_pattern` is a private attribute of the tokenizer;
        # this check relies on it holding the pattern string - confirm
        # against the installed nltk version.
        _check_success(
            self,
            user_code.word_tokenizer._pattern == r"\w+",
            "`word_tokenizer` is defined correctly",
            "`word_tokenizer`'s pattern is incorrect"
        )

    # Test that words is declared and correctly tokenizes the lowercase message
    def test_words(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message_lower"),
            "`message_lower` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.message_lower, str),
            "`message_lower` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "word_tokenizer"),
            "`word_tokenizer` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.word_tokenizer, RegexpTokenizer),
            "`word_tokenizer` is not an instance of `RegexpTokenizer`"
        )
        _require_success(
            self,
            hasattr(user_code, "words"),
            "`words` is not declared"
        )
        # The expected value is recomputed with the learner's own tokenizer
        # and lowercase message, both validated by the checks above.
        expected_words = user_code.word_tokenizer.tokenize(user_code.message_lower)
        _check_success(
            self,
            user_code.words == expected_words,
            "`words` is computed correctly",
            "`message_lower` is not correctly tokenized into words"
        )

    # Test that the print statement is present and unmodified
    def test_print(self):
        # Python source files are UTF-8 by default, so read the file as
        # UTF-8 instead of depending on the locale's preferred encoding.
        with open("user_code.py", "r", encoding="utf-8") as f:
            user_code_text = f.read()
        # The file is already closed here; only its text is inspected.
        _check_success(
            self,
            re.search(r"print *\( *words *\)", user_code_text),
            "print statement is correct",
            "print statement shouldn't be modified"
        )


# Run the test suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    unittest.main()

test_main.py

Explora los fundamentos del Procesamiento de Lenguaje Natural (NLP) aprendiendo técnicas esenciales de preprocesamiento de texto y métodos para representar datos textuales. Adquiere experiencia práctica con las herramientas utilizadas para limpiar, analizar e interpretar información textual. Desarrolla las habilidades necesarias para transformar lenguaje en bruto en información estructurada, estableciendo una base sólida para aplicaciones avanzadas en inteligencia artificial y aprendizaje automático.

Explora los fundamentos del preprocesamiento de texto para preparar texto sin procesar para su análisis. Aprende a tokenizar texto, filtrar palabras vacías y personalizar la tokenización con expresiones regulares.

Descubra cómo las palabras pueden reducirse a sus formas base mediante stemming y lematización. Domine el etiquetado de partes del discurso para enriquecer el texto con contexto gramatical y aplique la lematización sensible al POS.

Aprenda cómo el texto puede representarse con números utilizando modelos de espacio vectorial. Adquiera experiencia práctica implementando y personalizando dos modelos populares de espacio vectorial: bolsa de palabras y TF-IDF.

Adquiera una comprensión sólida de los embeddings de palabras y cómo capturan el significado semántico. Explore las arquitecturas CBoW y Skip-gram utilizadas en Word2Vec, e impleméntelas por su cuenta.

Desafío: Tokenización con Expresiones Regulares

Solución