import unittest
import importlib
import re
import types
from nltk.tokenize import RegexpTokenizer


# Helper function to require a successful test
def _require_success(test_case, condition, failure_message):
    """Fail ``test_case`` unless ``condition`` is truthy.

    When the requirement is not met, the test is renamed to
    ``failure_message`` and the same text is passed to ``fail`` so it is also
    carried by the raised failure. When the requirement is met nothing is
    touched, so further checks can follow in the same test.

    Args:
        test_case: Object exposing ``_testMethodName`` and ``fail``; used in
            this file with ``unittest.TestCase`` instances.
        condition: Value evaluated for truthiness.
        failure_message: Description of the unmet requirement.

    Raises:
        Whatever ``test_case.fail`` raises when ``condition`` is falsy.
    """
    if condition:
        return
    # Rename the test with a failure description before failing it
    test_case._testMethodName = failure_message
    test_case.fail(failure_message)


# Helper function to check if test is successful
def _check_success(test_case, condition, success_message, failure_message):
    """Record the outcome of a final check by renaming ``test_case``.

    A truthy ``condition`` renames the test to ``success_message`` (for better
    readability in test reports) and returns normally. A falsy ``condition``
    renames it to ``failure_message`` and fails the test, passing the same
    text to ``fail`` so it is also carried by the raised failure.

    Args:
        test_case: Object exposing ``_testMethodName`` and ``fail``; used in
            this file with ``unittest.TestCase`` instances.
        condition: Value evaluated for truthiness.
        success_message: Name given to the test when the check passes.
        failure_message: Name given to the test when the check fails.

    Raises:
        Whatever ``test_case.fail`` raises when ``condition`` is falsy.
    """
    if condition:
        test_case._testMethodName = success_message
        return
    test_case._testMethodName = failure_message
    test_case.fail(failure_message)

class TestUserCode(unittest.TestCase):
    """Checks of the ``user_code`` module and of the ``user_code.py`` file.

    Outcomes are reported by renaming the running test through
    ``_require_success`` / ``_check_success``. Per-test descriptions are kept
    as ``#`` comments rather than docstrings: unittest echoes a test method's
    docstring in verbose reports, which would sit alongside the renamed test
    names.
    """

    # Test that everything is imported correctly
    def test_imports(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "RegexpTokenizer"),
            "`RegexpTokenizer` is not declared"
        )
        _require_success(
            self,
            # Identity check: the name must be bound to the very same class
            user_code.RegexpTokenizer is RegexpTokenizer,
            "`RegexpTokenizer` is declared, but it's not `nltk.tokenize.RegexpTokenizer`"
        )
        self._testMethodName = "imports are correct"

    # Test that message is declared and has the correct value
    def test_message(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message"),
            "`message` is not declared"
        )
        expected = "Amazing event at #NLPConference_20! Over 1000 attendees from 20+ countries. #Networking #Tech"
        _check_success(
            self,
            user_code.message == expected,
            "`message` contains correct value",
            "`message` shouldn't be modified"
        )

    # Test that message_lower is declared and is lowercase
    def test_message_lower(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message"),
            "`message` is not declared"
        )
        # Guard the `.lower()` call below against a non-string `message`
        _require_success(
            self,
            isinstance(user_code.message, str),
            "`message` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "message_lower"),
            "`message_lower` is not declared"
        )
        _check_success(
            self,
            user_code.message_lower == user_code.message.lower(),
            "`message_lower` is computed correctly",
            "`message_lower` is not the lowercase of `message`"
        )

    # Test that word_tokenizer is declared correctly with the right pattern
    def test_word_tokenizer(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "word_tokenizer"),
            "`word_tokenizer` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.word_tokenizer, RegexpTokenizer),
            "`word_tokenizer` is not an instance of `RegexpTokenizer`"
        )
        _check_success(
            self,
            # Reads the tokenizer's private `_pattern` attribute
            user_code.word_tokenizer._pattern == r"\w+",
            "`word_tokenizer` is defined correctly",
            "`word_tokenizer`'s pattern is incorrect"
        )

    # Test that words is declared and correctly tokenizes the lowercase message
    def test_words(self):
        import user_code
        _require_success(
            self,
            hasattr(user_code, "message_lower"),
            "`message_lower` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.message_lower, str),
            "`message_lower` is not a string"
        )
        _require_success(
            self,
            hasattr(user_code, "word_tokenizer"),
            "`word_tokenizer` is not declared"
        )
        _require_success(
            self,
            isinstance(user_code.word_tokenizer, RegexpTokenizer),
            "`word_tokenizer` is not an instance of `RegexpTokenizer`"
        )
        _require_success(
            self,
            hasattr(user_code, "words"),
            "`words` is not declared"
        )
        # The expected value is recomputed with the learner's own tokenizer and text
        _check_success(
            self,
            user_code.words == user_code.word_tokenizer.tokenize(user_code.message_lower),
            "`words` is computed correctly",
            "`message_lower` is not correctly tokenized into words"
        )

    # Test that the print statement is present and unmodified
    def test_print(self):
        # UTF-8 is Python's default source encoding; being explicit keeps the
        # read independent of the platform locale.
        with open("user_code.py", "r", encoding="utf-8") as f:
            user_code_text = f.read()
        # The file is closed before asserting so a failure never leaves it open
        _check_success(
            self,
            re.search(r"print *\( *words *\)", user_code_text),
            "print statement is correct",
            "print statement shouldn't be modified"
        )


# Run the tests through unittest's command-line entry point when executed as a script
if __name__ == '__main__':
    unittest.main()

test_main.py

Ознайомтеся з основами обробки природної мови (NLP), вивчаючи ключові методи попередньої обробки тексту та способи представлення текстових даних. Отримайте практичний досвід роботи з інструментами для очищення, аналізу та інтерпретації текстової інформації. Розвивайте навички перетворення неструктурованої мови у структуровані інсайти, закладаючи міцну основу для подальших застосувань у штучному інтелекті та машинному навчанні.

Занурення в основи попередньої обробки тексту для підготовки необробленого тексту до аналізу. Вивчення методів токенізації тексту, фільтрації стоп-слів та налаштування токенізації за допомогою регулярних виразів.

Дізнайтеся, як слова можна звести до їхніх базових форм за допомогою стемінгу та лематизації. Опануйте тегування частин мови для збагачення тексту граматичним контекстом і застосовуйте лематизацію з урахуванням частин мови.

Дізнайтеся, як текст може бути представлений числами за допомогою моделей векторного простору. Отримайте практичний досвід шляхом реалізації та налаштування двох популярних моделей векторного простору: мішок слів та TF-IDF.

Отримайте ґрунтовне розуміння векторних подань слів та їх здатності відображати семантичне значення. Дослідіть архітектури CBoW і Skip-gram, що використовуються у Word2Vec, та реалізуйте їх самостійно.

Завдання: Токенізація з використанням регулярних виразів

Рішення