import unittest
import re
import nltk
from gensim.models import Word2Vec
import pandas as pd
from scipy.sparse import csr_matrix


# Helper functions
def _require_success(test_case, condition, failure_message):
    """Fail `test_case` with `failure_message` unless `condition` is truthy.

    On failure the test's `_testMethodName` is overwritten with
    `failure_message` (presumably so the runner reports the message as the
    test's name -- confirm against the consuming platform) and the same text
    is passed to `fail()`, so the raised assertion error carries the reason
    instead of an empty message.

    Nothing happens when `condition` is truthy.
    """
    if condition:
        return
    test_case._testMethodName = failure_message
    test_case.fail(failure_message)

def _check_success(test_case, condition, success_message, failure_message):
    """Record the outcome of a final check on `test_case`.

    When `condition` is truthy the test's `_testMethodName` is replaced with
    `success_message`. Otherwise it is replaced with `failure_message` and the
    test is failed with that same message, so the raised assertion error
    carries the reason instead of an empty message.
    """
    if condition:
        test_case._testMethodName = success_message
        return
    test_case._testMethodName = failure_message
    test_case.fail(failure_message)

class TestUserCode(unittest.TestCase):
    # Test that everything is imported correctly
    def test_imports(self):
        import user_code
        _require_success(
            self, 
            hasattr(user_code, "pd"), 
            "`pd` is not declared"
        )
        _require_success(
            self,
            user_code.pd == pd,
            "`pd` is declared, but it's not `pandas` library"
        )
        _require_success(
            self, 
            hasattr(user_code, "Word2Vec"), 
            "`Word2Vec` is not declared"
        )
        _require_success(
            self,
            user_code.Word2Vec == Word2Vec,
            "`Word2Vec` is declared, but it's not `gensim.models.Word2Vec`"
        )
        self._testMethodName = "imports are correct"

    # Test original corpus
    def test_corpus(self):
        import user_code
        expected = pd.read_csv(
    'https://staging-content-media-cdn.codefinity.com/courses/c68c1f2e-2c90-4d5d-8db9-1e97ca89d15e/section_3/chapter_4/example_corpus.csv')
        _require_success(
            self, 
            hasattr(user_code, "corpus"), 
            "`corpus` is not declared"
        )
        _check_success(
            self,
            expected.equals(user_code.corpus),
            "`corpus` contains correct value",
            "`corpus` shouldn't be modified"
        )

    # Test that sentences are correctly computed
    def test_sentences(self):
        import user_code
        _require_success(
            self, 
            hasattr(user_code, "corpus"), 
            "`corpus` is not declared"
        )
        _require_success(
            self, 
            isinstance(user_code.corpus, pd.DataFrame), 
            "`corpus` is not a DataFrame"
        )
        _require_success(
            self, 
            "Document" in user_code.corpus.columns, 
            "`corpus` doesn't contain `'Document'` column"
        )
        _require_success(
            self, 
            hasattr(user_code, "sentences"), 
            "`sentences` is not declared"
        )
        sentences = user_code.corpus['Document'].str.split()
        _check_success(
            self,
            sentences.equals(user_code.sentences),
            "`sentences` is computed correctly",
            "`sentences` is not computed correctly"
        )
  
    # Test that model is declared correctly
    def test_model(self):
        import user_code
        _require_success(
            self, 
            hasattr(user_code, "sentences"), 
            "`sentences` is not declared"
        )
        _require_success(
            self, 
            isinstance(user_code.sentences, pd.Series), 
            "`sentences` is not a `Series` object"
        )
        model = Word2Vec(user_code.sentences, vector_size=50, window=2, min_count=1, sg=1)
        _require_success(
            self, 
            hasattr(user_code, "model"), 
            "`model` is not declared"
        )
        _require_success(
            self, 
            isinstance(user_code.model, Word2Vec), 
            "`model` is not an instance of `Word2Vec`"
        )
        _require_success(
            self, 
            user_code.model.corpus_count == model.corpus_count and user_code.model.corpus_total_words == model.corpus_total_words, 
            "`model` should be initialized with `sentences`"
        )
        _require_success(
            self, 
            user_code.model.vector_size == model.vector_size, 
            "`model` should be initialized with `vector_size=50`"
        )
        _require_success(
            self, 
            user_code.model.window == model.window, 
            "`model` should be initialized with `window=2`"
        )
        _require_success(
            self, 
            user_code.model.min_count == model.min_count, 
            "`model` should be initialized with `min_count=1`"
        )
        _require_success(
            self, 
            user_code.model.sg == model.sg, 
            "`model` should be initialized with `sg=1`"
        )
        self._testMethodName = "`model` is defined correctly"

    # Test print statement
    def test_print(self):
        with open("user_code.py", "r") as f:
            user_code_text = f.read()
        _check_success(
            self,
            re.search(r"print *\( *model *\. *wv *\. *most_similar *\( *(?P<quote>[\\"'])bowl(?P=quote) *, *topn *= *3 *\) *\)", user_code_text),
            "print statement is correct",
            "print statement is not correct"
        )


# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

test_main.py

Explore the fundamentals of Natural Language Processing (NLP) by learning essential text preprocessing techniques and methods for representing text data. Gain practical experience with the tools used to clean, analyze, and interpret textual information. Develop the skills needed to transform raw language into structured insights, laying a strong foundation for advanced applications in artificial intelligence and machine learning.

Dive into the fundamentals of text preprocessing to prepare raw text for analysis. Learn how to tokenize text, filter out stop words, and customize tokenization with regular expressions.

Discover how words can be reduced to their base forms using stemming and lemmatization. Master part-of-speech tagging to enrich text with grammatical context and apply POS-aware lemmatization.

Learn how text can be represented with numbers using vector space models. Get hands-on experience by implementing and customizing two popular vector space models: bag of words and TF-IDF.

Gain a solid understanding of word embeddings and how they capture semantic meaning. Explore CBoW and Skip-gram architectures used in Word2Vec, and implement them on your own.

Challenge: Creating Word Embeddings

Solution

Awesome!

Awesome!

Challenge: Creating Word Embeddings

Solution

Awesome!