import unittest
import importlib
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk


# Helper function to require a successful test
def _require_success(test_case, condition, failure_message):
    """Fail `test_case` when a precondition does not hold.

    Nothing happens when `condition` is truthy. Otherwise the test is renamed
    to `failure_message` (so the report shows the description instead of the
    method name) and then failed with that same message.

    Args:
        test_case: The running test; must expose `_testMethodName` and `fail`.
        condition: Any value; its truthiness decides the outcome.
        failure_message: Description used as the new test name and as the
            failure message.
    """
    if not condition:
        # Rename the test with a failure description
        test_case._testMethodName = failure_message
        # Pass the message along so the assertion error is self-explanatory
        test_case.fail(failure_message)


# Helper function to check if test is successful
def _check_success(test_case, condition, success_message, failure_message):
    """Record the final outcome of a test under a readable name.

    When `condition` is truthy the test is renamed to `success_message` and
    the function returns. Otherwise the test is renamed to `failure_message`
    and failed with that same message.

    Args:
        test_case: The running test; must expose `_testMethodName` and `fail`.
        condition: Any value; its truthiness decides the outcome.
        success_message: Test name to show when the check passes.
        failure_message: Test name and failure message when the check fails.
    """
    if condition:
        # Rename the test for better readability in test reports
        test_case._testMethodName = success_message
        return
    # Rename the test with a failure description
    test_case._testMethodName = failure_message
    # Pass the message along so the assertion error is self-explanatory
    test_case.fail(failure_message)


class TestUserCode(unittest.TestCase):
    """Checks of the learner's `user_code` module for the stop-words task.

    Every test imports `user_code` itself and reports its outcome by renaming
    the test (`_testMethodName`), either directly or through the module-level
    `_require_success` / `_check_success` helpers.
    """

    # Test that everything imported correctly
    def test_imports(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "stopwords"),
            "`stopwords` is not declared"
        )
        _require_success(
            self,
            user_code.stopwords == stopwords,
            "`stopwords` is declared, but it's not `nltk.corpus.stopwords`"
        )
        _require_success(
            self,
            hasattr(user_code, "word_tokenize"),
            "`word_tokenize` is not declared"
        )
        _require_success(
            self,
            user_code.word_tokenize == word_tokenize,
            "`word_tokenize` is declared, but it's not `nltk.tokenize.word_tokenize`"
        )
        _require_success(
            self,
            hasattr(user_code, "nltk"),
            "`nltk` is not declared"
        )
        _require_success(
            self,
            user_code.nltk == nltk,
            "`nltk` is declared, but it's not `nltk` library"
        )
        self._testMethodName = "imports are correct"

    # Test that required NLTK resources are downloaded
    # (checked by searching the source text of user_code.py for the calls)
    def test_nltk_downloads(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "nltk"),
            "`nltk` is not declared"
        )
        _require_success(
            self,
            user_code.nltk == nltk,
            "`nltk` is declared, but it's not `nltk` library"
        )
        # Read the whole file first so it is closed before any check can fail
        with open("user_code.py", "r", encoding="utf-8") as f:
            user_code_text = f.read()
        _require_success(
            self,
            re.search(r"nltk *\. *download *\( *'punkt_tab' *\)", user_code_text) is not None,
            "Code for downloading `'punkt_tab'` tokenizer shouldn't be modified"
        )
        _require_success(
            self,
            re.search(r"nltk *\. *download *\( *'stopwords' *\)", user_code_text) is not None,
            "Code for downloading `'stopwords'` shouldn't be modified"
        )
        self._testMethodName = "NLTK resources are downloaded"

    # Test that text is declared and unmodified
    def test_text(self):
        import user_code
        # Compared character for character, including the leading/trailing
        # newlines and the spacing inside the lines
        expected = """
The Natural Language Toolkit, or NLTK for short, provides a suite of libraries and programs for symbolic and statistical natural language processing in Python. It includes  a diverse set of linguistic data and tools to work with, making it easier for developers and researchers to build 
NLP applications without having to worry about data gathering and preprocessing.
"""
        _require_success(
            self,
            hasattr(user_code, "text"),
            "`text` is not declared"
        )
        _check_success(
            self,
            user_code.text == expected,
            "`text` contains correct value",
            "`text` shouldn't be modified"
        )

    # Test that text_lower is declared and lowercase
    def test_text_lower(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "text"),
            "`text` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.text, str),
            "`text` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "text_lower"),
            "`text_lower` is not declared"
        )
        _check_success(
            self,
            user_code.text_lower == user_code.text.lower(),
            "`text_lower` is computed correctly",
            "`text_lower` is not the lowercase of `text`"
        )

    # Test that stop_words variable is declared and correct
    def test_stop_words(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "stop_words"),
            "`stop_words` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.stop_words, set),
            "`stop_words` is not a set"
        )
        _check_success(
            self,
            user_code.stop_words == set(stopwords.words('english')),
            "`stop_words` contains correct English stopwords set",
            "`stop_words` doesn't contain English stopwords"
        )

    # Test that tokens is declared and tokenized correctly
    def test_tokens(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "text_lower"),
            "`text_lower` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.text_lower, str),
            "`text_lower` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "tokens"),
            "`tokens` is not declared"
        )
        _check_success(
            self,
            user_code.tokens == word_tokenize(user_code.text_lower),
            "`tokens` is computed correctly",
            "`text_lower` is not correctly tokenized into words"
        )

    # Test that tokens_clean is declared and filtered correctly
    def test_tokens_clean(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "tokens"),
            "`tokens` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.tokens, list),
            "`tokens` is not a list"
        )
        _require_success(
            self,
            hasattr(user_code, "stop_words"),
            "`stop_words` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.stop_words, set),
            "`stop_words` is not a set"
        )
        _require_success(
            self,
            hasattr(user_code, "tokens_clean"),
            "`tokens_clean` is not declared"
        )
        # Built from the learner's own `tokens` and `stop_words`, so only the
        # filtering step is judged here
        expected = [tok for tok in user_code.tokens if tok not in user_code.stop_words]
        _check_success(
            self,
            user_code.tokens_clean == expected,
            "`tokens_clean` is computed correctly",
            "stop words in `tokens` are not correctly filtered out"
        )

    # Test that the print statement is present and unmodified
    def test_print(self):
        with open("user_code.py", "r", encoding="utf-8") as f:
            user_code_text = f.read()
        # Single-quoted raw string so the double quotes around the label need
        # no escaping inside the pattern
        _check_success(
            self,
            re.search(r'print *\( *"Filtered Tokens:" *, *tokens_clean *\)', user_code_text) is not None,
            "print statement is correct",
            "print statement shouldn't be modified"
        )


# When run as a script: download the 'punkt_tab' and 'stopwords' NLTK
# resources first, then hand control to the unittest runner
if __name__ == '__main__':
    for resource_name in ('punkt_tab', 'stopwords'):
        nltk.download(resource_name)
    unittest.main()

test_main.py

Lassen Sie uns die Grundlagen der Verarbeitung natürlicher Sprache (NLP) erkunden, während Sie sich mit Techniken zur Textvorverarbeitung und verschiedenen Textmodellen befassen, die zur Darstellung von Textdaten verwendet werden. Sie werden praxisnahe Einblicke und praktische Erfahrungen mit den Werkzeugen und Methoden gewinnen, die für die effektive Analyse und Interpretation von Textdaten unerlässlich sind. Dieser Kurs stattet Sie mit den Fähigkeiten aus, Rohtext in aussagekräftige Informationen zu verwandeln und den Weg für fortgeschrittene Anwendungen in KI und maschinellem Lernen zu ebnen.

Wir beginnen unsere Reise mit dem Erlernen und Implementieren der gängigsten Textvorverarbeitungstechniken, die in der NLP verwendet werden, um den ursprünglichen Rohtext in eine saubere, standardisierte Form zu bringen.

Lassen Sie uns ohne weitere Umschweife Stemming und Lemmatisierung erkunden. Diese Techniken können die Effizienz und Effektivität einiger NLP-Aufgaben verbessern, insbesondere bei der Arbeit mit großen Textkorpora und der Behandlung verschiedener Wortformen als dasselbe Wort.

Vorverarbeiteter Text sollte dann in eine numerische Darstellung umgewandelt werden, um in maschinellen Lern- oder Deep-Learning-Modellen für verschiedene Aufgaben wie Vorhersage, Klassifikation oder Clustering verwendet zu werden. Hier werden wir lernen, die grundlegendsten, aber dennoch beliebten Textmodelle zu implementieren, die Textdaten in Zahlen umwandeln.

Es ist an der Zeit, die Kraft der Wort-Einbettungen freizusetzen und fortgeschrittene Techniken zur Erfassung semantischer Beziehungen zwischen Wörtern zu erkunden. Wir werden verschiedene Einbettungsmodelle wie Word2Vec, GloVe und FastText untersuchen, mit einem besonderen Fokus auf das Word2Vec-Modell und dessen Implementierung.

Challenge: Stop Words

Lösung

Awesome!

Awesome!

Challenge: Stop Words

Lösung

Awesome!