In proteomics, analyzing the **amino acid composition** of protein sequences is crucial for understanding protein properties and comparing different proteins. Manually calculating these values for many sequences is tedious and error-prone. Automating this process with Python ensures consistency and efficiency, especially when working with large datasets or proteome-wide analyses. In this challenge, you will automate the calculation of amino acid composition for multiple protein sequences, preparing you for high-throughput protein analysis tasks.


import unittest
import user_code
import ast
import re   
import importlib
import csv
import unittest
import importlib

class TestTask(unittest.TestCase):
    """Checks for ``user_code.amino_acid_compositions``.

    Every check is reported through ``_dynamic_test``.  A result with an
    unexpected shape (not a list, an empty list, a first element that is not
    a dictionary) is reported as an ordinary test failure with a descriptive
    message, rather than as a ``TypeError``/``IndexError`` raised while the
    result was being indexed or the failure message was being formatted.
    """

    @staticmethod
    def _compositions(seqs):
        """Reload ``user_code`` and return ``amino_acid_compositions(seqs)``."""
        import user_code

        # Re-execute the module before each call so every test starts from
        # freshly initialised module-level state.
        importlib.reload(user_code)
        return user_code.amino_acid_compositions(seqs)

    def _first_composition(self, seqs):
        """Return the first dictionary of the result for *seqs*.

        Fails the test when the result is not a non-empty list whose first
        element is a dictionary, so callers can index and query it safely.
        """
        result = self._compositions(seqs)
        _dynamic_test(
            self,
            isinstance(result, list)
            and len(result) > 0
            and isinstance(result[0], dict),
            "Returns a list of dictionaries",
            f"Expected a list of dicts, got {result}",
        )
        return result[0]

    def _expect_empty_first(self, seqs, success_message, description):
        """Check that the first element of the result for *seqs* equals ``{}``."""
        result = self._compositions(seqs)
        has_first = isinstance(result, list) and len(result) > 0
        # Only index the result for the message when that is known to be safe.
        shown = result[0] if has_first else result
        _dynamic_test(
            self,
            has_first and result[0] == {},
            success_message,
            f"Expected empty dict for {description}, got {shown}",
        )

    def test_output_length_matches_input(self):
        """One output entry is produced per input sequence."""
        seqs = ["ABC", "DEF", ""]
        result = self._compositions(seqs)
        is_list = isinstance(result, list)
        # Build the message without calling len() on a non-list result.
        if is_list:
            failure = f"Expected result length {len(seqs)}, got {len(result)}"
        else:
            failure = f"Expected a list of dicts, got {result}"
        _dynamic_test(
            self,
            is_list and len(result) == len(seqs),
            "Returns a list of dictionaries matching input length",
            failure,
        )

    def test_only_valid_amino_acids(self):
        """Every key of the composition is one of the 20 standard letters."""
        composition = self._first_composition(["ACDZXY"])
        valid_aas = set("ACDEFGHIKLMNPQRSTVWY")
        for key in composition:
            _dynamic_test(
                self,
                key in valid_aas,
                f"Dictionary contains only valid amino acid keys: {key}",
                f"Invalid amino acid found in dictionary: {key}",
            )

    def test_ignores_invalid_characters(self):
        """Invalid letters are excluded from both the keys and the total."""
        res = self._first_composition(["MXXKZZ"])
        _dynamic_test(
            self,
            "M" in res and "K" in res and len(res) == 2,
            "Ignores invalid characters (X, Z)",
            f"Expected only valid amino acids in dictionary, got {res}",
        )
        total = res.get("M", 0) + res.get("K", 0)
        _dynamic_test(
            self,
            abs(total - 100.0) < 0.001,
            "Percentages sum to 100 for valid amino acids only",
            f"Sum of percentages is not 100, got {total}",
        )

    def test_computes_percentages_correctly(self):
        """``"AAAC"`` yields 75 % A and 25 % C."""
        res = self._first_composition(["AAAC"])
        _dynamic_test(
            self,
            abs(res.get("A", 0) - 75.0) < 0.01 and abs(res.get("C", 0) - 25.0) < 0.01,
            "Calculates percentages correctly for each amino acid",
            f"Expected 75.0 for 'A', 25.0 for 'C', got {res}",
        )

    def test_empty_sequence_returns_empty_dict(self):
        """An empty sequence maps to an empty dictionary."""
        self._expect_empty_first(
            [""],
            "Returns empty dictionary for sequence with no valid amino acids",
            "empty sequence",
        )

    def test_sequence_with_no_valid_amino_acids(self):
        """A sequence made only of invalid letters maps to an empty dictionary."""
        self._expect_empty_first(
            ["ZZZZXXX"],
            "Returns empty dictionary for sequence with only invalid characters",
            "sequence with only invalid characters",
        )

    def test_mixed_case_sequence(self):
        """Lower-case letters are counted under their upper-case key."""
        res = self._first_composition(["aCdE"])
        _dynamic_test(
            self,
            set(res.keys()) == set("ACDE"),
            "Handles mixed case amino acid letters",
            f"Expected keys {{'A','C','D','E'}}, got {res.keys()}",
        )
        for v in res.values():
            _dynamic_test(
                self,
                abs(v - 25.0) < 0.01,
                "Each amino acid percentage is 25.0",
                f"Expected 25.0 for each, got {res}",
            )

def _dynamic_test(test_case, condition, success_message, failure_message):
    """Record the outcome of one check on *test_case*.

    The test case's ``_testMethodName`` is overwritten with the message that
    matches the outcome (presumably so the reporting layer shows that text
    instead of the method name -- confirm against the runner).  A falsy
    *condition* then fails the test with *failure_message*; a truthy one
    performs a trivially passing assertion.
    """
    passed = bool(condition)
    message = success_message if passed else failure_message
    test_case._testMethodName = message
    if not passed:
        test_case.fail(message)
        return
    test_case.assertTrue(True, message)

def normalize_text(text):
    """Return *text* lower-cased with whitespace and punctuation spacing normalised.

    Steps, in order:
      1. lower-case the text;
      2. collapse every run of two or more whitespace characters into one space;
      3. rewrite each ``,``, ``:`` or ``?`` so it has no whitespace before it
         and exactly one space after it;
      4. strip leading and trailing whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Single backslashes inside the raw strings: ``\s`` is the whitespace
    # class and ``\1`` the captured punctuation mark.
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s*([,:?])\s*", r"\1 ", text)
    return text.strip()

def change_var(code: str, var_name: str, value: str) -> str:
    """Rewrite every top-level ``var_name = ...`` assignment in *code*.

    Each module-level assignment statement that has a plain-name target equal
    to *var_name* is replaced by the single line ``var_name = value``; a
    right-hand side spanning several lines is collapsed into that one line.
    Only statements directly in the module body are considered, so
    assignments nested inside functions, classes or other compound
    statements are left alone.  Comments, blank lines and decorators that
    follow a replaced assignment are preserved.

    Args:
        code: Python source text; it must be parseable by ``ast.parse``.
        var_name: Name of the variable whose assignments are replaced.
        value: Source text placed verbatim on the right-hand side.

    Returns:
        The rewritten source, its lines joined with newline characters, or
        *code* itself when no matching assignment exists.

    Raises:
        SyntaxError: If *code* is not valid Python (raised by ``ast.parse``).
    """
    tree = ast.parse(code)
    lines = code.splitlines()

    matches = [
        (index, node)
        for index, node in enumerate(tree.body)
        if isinstance(node, ast.Assign)
        and any(
            isinstance(target, ast.Name) and target.id == var_name
            for target in node.targets
        )
    ]
    if not matches:
        return code

    # Work bottom-up so that collapsing a multi-line assignment never shifts
    # the line numbers of the assignments that are still to be processed.
    for index, node in reversed(matches):
        start = node.lineno - 1
        end = getattr(node, "end_lineno", None)
        if end is None:
            # No end positions available (Python < 3.8): fall back to the
            # line on which the next top-level statement starts.
            end = next(
                (
                    later.lineno - 1
                    for later in tree.body[index + 1:]
                    if hasattr(later, "lineno")
                ),
                len(lines),
            )
            # Always replace at least the assignment's own first line.
            end = max(end, start + 1)
        first_line = lines[start]
        indent = " " * (len(first_line) - len(first_line.lstrip()))
        lines[start:end] = [f"{indent}{var_name} = {value}"]

    return "\n".join(lines)

# Run the test suite only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    unittest.main()


test_main.py

Learn how Python is used in biology for analyzing DNA sequences, processing biological data, and visualizing research results. Includes hands-on examples with bioinformatics libraries.

Explore how Python can be used to analyze DNA and other biological sequences, including searching for motifs, calculating GC content, and basic sequence manipulations.

Delve into protein sequences, amino acid composition, and basic protein analysis using Python.

Learn how to visualize biological data using Python, including plotting sequence statistics and creating informative charts for research.

Challenge: Amino Acid Composition for Multiple Proteins

Solution

Challenge: Amino Acid Composition for Multiple Proteins

Solution