Automating **GC content calculation** for a batch of DNA sequences is a common requirement in genomics research. Rather than processing each sequence individually, you can use Python to efficiently handle a collection of sequences, ensuring that your analysis is both scalable and reproducible. In this challenge, you will implement a function, `gc_content_batch`, that accepts a list of DNA sequence strings and returns a list of their GC content percentages as floats, in the same order as the input. This function should robustly handle sequences of varying lengths, treat uppercase and lowercase bases alike, and ignore any characters that are not one of the standard DNA bases (`A`, `T`, `G`, or `C`). A sequence that contains no valid bases (including an empty string) has a GC content of `0.0`.


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib

class TestTask(unittest.TestCase):
    """Checks for ``user_code.gc_content_batch``.

    Each test reloads ``user_code`` and calls ``gc_content_batch`` with a list
    of sequence strings. Outcomes are reported through ``_dynamic_test`` so
    the success/failure message is what shows up as the test name.
    """

    # NOTE: the test methods are documented with ``#`` comments rather than
    # docstrings on purpose. unittest adds the first docstring line to a
    # test's verbose description, which would change the reported output.

    def _run_batch(self, test_input):
        """Reload ``user_code`` and return ``gc_content_batch(test_input)``."""
        import user_code
        importlib.reload(user_code)
        return user_code.gc_content_batch(test_input)

    def _check_batch(self, test_input, expected, success_prefix):
        """Compare every returned percentage with its expected value.

        Args:
            test_input: Sequences passed to ``gc_content_batch``.
            expected: Expected percentage for each sequence, in order.
            success_prefix: Start of the success message; the sequence is
                appended after a single space.
        """
        result = list(self._run_batch(test_input))
        # zip() stops at the shorter iterable, so a result that is too short
        # (e.g. an empty list) would otherwise pass without a single
        # comparison being made. Fail explicitly on a length mismatch.
        if len(result) != len(expected):
            _dynamic_test(
                self,
                False,
                "",
                f"Expected {len(expected)} results for {test_input}, got {result}",
            )
        for seq, res, exp in zip(test_input, result, expected):
            _dynamic_test(
                self,
                abs(res - exp) < 1e-6,
                f"{success_prefix} {seq}",
                f"Expected {exp} for sequence {seq}, got {res}",
            )

    def test_returns_list_of_floats(self):
        # The result must be a list of floats, one per input sequence.
        test_input = ["ATGC", "GGCC", "AATT", "NNNN", ""]
        result = self._run_batch(test_input)
        _dynamic_test(
            self,
            isinstance(result, list) and all(isinstance(x, float) for x in result) and len(result) == len(test_input),
            "Returns a list of floats with length equal to input list.",
            f"Expected list of floats of length {len(test_input)}, got {result}",
        )

    def test_gc_content_mixed_bases(self):
        # Sequences made only of valid uppercase bases.
        self._check_batch(
            ["ATGC", "GCGC", "ATAT", "AACCGGTT"],
            [50.0, 100.0, 0.0, 50.0],
            "Correct GC content for sequence",
        )

    def test_ignores_invalid_characters(self):
        # Characters other than A/T/G/C must not count toward the total.
        self._check_batch(
            ["ATGCNNNN", "NNNN", "GCATXYZ"],
            [50.0, 0.0, 50.0],
            "Ignores invalid characters in sequence",
        )

    def test_empty_and_no_valid_bases(self):
        # With no valid bases at all the expected percentage is 0.0.
        self._check_batch(
            ["", "NNNN", "12345", "-.-"],
            [0.0, 0.0, 0.0, 0.0],
            "Returns 0.0 for no valid bases in sequence",
        )

    def test_case_insensitivity(self):
        # Lowercase and mixed-case bases are expected to count like uppercase.
        self._check_batch(
            ["atgc", "gCgC", "AtAt", "aaccggtt"],
            [50.0, 100.0, 0.0, 50.0],
            "Handles upper and lower case for sequence",
        )

def _dynamic_test(test_case, condition, success_message, failure_message):
    if condition:
        test_case._testMethodName = success_message
        test_case.assertTrue(True, success_message)
    else:
        test_case._testMethodName = failure_message
        test_case.fail(failure_message)

def normalize_text(text):
    """Return *text* lowercased with whitespace and punctuation spacing normalized.

    Runs of two or more whitespace characters become a single space, and each
    comma, colon or question mark is attached to the preceding text and
    followed by exactly one space. Leading and trailing whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    # Collapse runs of two or more whitespace characters into one space.
    text = re.sub(r"\s{2,}", " ", text)
    # Drop whitespace around , : ? and re-emit the mark followed by one space.
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite top-level ``var_name = ...`` assignments in *code*.

    Every module-level assignment statement that has *var_name* as a plain
    name target is replaced by the single line ``var_name = value``. Only the
    lines belonging to the assignment itself are replaced, so comments, blank
    lines and decorators that follow it are left in place.

    Args:
        code: Python source text. It is parsed with ``ast.parse``, so invalid
            source raises ``SyntaxError``.
        var_name: Name of the variable whose assignments are rewritten.
        value: Source text inserted verbatim as the new right-hand side.

    Returns:
        *code* unchanged when no matching assignment exists; otherwise the
        rewritten source with its lines joined by newline characters (a
        trailing newline in *code* is not preserved).
    """
    tree = ast.parse(code)
    matches = [
        node
        for node in tree.body
        if isinstance(node, ast.Assign)
        and any(isinstance(target, ast.Name) and target.id == var_name for target in node.targets)
    ]
    if not matches:
        return code

    lines = code.splitlines()
    # Work from the last match to the first so replacing a multi-line
    # assignment does not shift the line indices of earlier matches.
    for node in reversed(matches):
        first = node.lineno - 1
        # end_lineno is the 1-based last line of the statement, which makes it
        # the exclusive end of the 0-based slice to replace.
        last = getattr(node, "end_lineno", None) or node.lineno
        line = lines[first]
        indent = ' ' * (len(line) - len(line.lstrip()))
        lines[first:last] = [f"{indent}{var_name} = {value}"]

    return '\n'.join(lines)

if __name__ == "__main__":
    unittest.main()


test_main.py

Learn how Python is used in biology for analyzing DNA sequences, processing biological data, and visualizing research results. Includes hands-on examples with bioinformatics libraries.

Explore how Python can be used to analyze DNA and other biological sequences, including searching for motifs, calculating GC content, and basic sequence manipulations.

Delve into protein sequences, amino acid composition, and basic protein analysis using Python.

Learn how to visualize biological data using Python, including plotting sequence statistics and creating informative charts for research.

Challenge: Calculate GC Content for Multiple Sequences

Solution

Challenge: Calculate GC Content for Multiple Sequences

Solution