Source code for tidytcells.junction._standardize

import logging
import re
from tidytcells import aa, _utils
from typing import Literal, Optional
from tidytcells._utils.parameter import Parameter
from tidytcells._utils.conserved_aa_lookup import (
    get_conserved_aa_for_j_symbol_for_species,
)

logger = logging.getLogger(__name__)


def standardize(
    seq: str,
    j_symbol: Optional[str] = None,
    species: Optional[str] = None,
    allow_uncertain_118: Optional[bool] = None,
    fix_missing_conserved: Optional[bool] = None,
    on_fail: Optional[Literal["reject", "keep"]] = None,
    log_failures: Optional[bool] = None,
    j_strict: Optional[bool] = None,
    strict: Optional[bool] = None,
    suppress_warnings: Optional[bool] = None,
) -> Optional[str]:
    """
    Ensures that a string value looks like a valid junction (CDR3) amino acid sequence.

    A valid junction sequence must:

    1. Be a valid amino acid sequence
    2. Begin with a cysteine (C)
    3. End with a phenylalanine (F), tryptophan (W) or cysteine (C) in a way
       consistent with `j_symbol` if supplied

    :param seq:
        The junction sequence.
    :type seq:
        str
    :param j_symbol:
        The TR/IG J symbol used to determine the correct conserved trailing amino
        acid at position 118 (F / W / C).
        If the symbol does not resolve to a single allele but all productive
        alleles consistent with the symbol have the same conserved residue, this
        will be set as the expected ending residue.
        If the supplied symbol does not map to any (group of) known J alleles, the
        function will raise a ``ValueError``.
    :type j_symbol:
        str
    :param species:
        The species that produced the underlying receptor.
        Defaults to ``homosapiens``.
    :type species:
        str
    :param allow_uncertain_118:
        If ``False``, standardization immediately fails if the expected conserved
        trailing amino acid at position 118 cannot be determined with certainty
        using `j_symbol`, or if `j_symbol` is not supplied.
        If ``True``, in the event of an uncertain residue at position 118, either
        F or W is accepted, and if a trailing residue must be appended (see
        parameter `fix_missing_conserved`), an F will be added.
        Defaults to ``True``.
    :type allow_uncertain_118:
        bool
    :param fix_missing_conserved:
        If ``False``, standardization immediately fails for any input sequence
        that does not start and end with the expected conserved residues.
        If ``True``, any inputs that are valid amino acid sequences but do not
        start and end as expected are corrected by adding a C at the beginning
        and the expected trailing residue (see `allow_uncertain_118`) at the end.
        Defaults to ``True``.
    :type fix_missing_conserved:
        bool
    :param on_fail:
        Behaviour when standardization fails.
        If set to ``"reject"``, returns ``None`` on failure.
        If set to ``"keep"``, returns the original input.
        Defaults to ``"reject"``.
    :type on_fail:
        str
    :param log_failures:
        Report standardisation failures through logging (at level ``WARNING``).
        Defaults to ``True``.
    :type log_failures:
        bool

    :param j_strict:
        Inverse setting to `allow_uncertain_118`.
        Deprecated in favor of `allow_uncertain_118`.
    :type j_strict:
        bool
    :param strict:
        Inverse setting to `fix_missing_conserved`.
        Deprecated in favor of `fix_missing_conserved`.
    :type strict:
        bool
    :param suppress_warnings:
        Disable warnings that are usually logged when standardisation fails.
        Deprecated in favour of `log_failures`.
    :type suppress_warnings:
        bool

    :return:
        If possible, a standardized version of the input string is returned.
        If the input string cannot be standardized, the function follows the
        behaviour as set by `on_fail`.
    :rtype:
        Optional[str]

    .. topic:: Example usage

        Strings that look like junction sequences will be accepted, and returned
        in capitalised form.

        >>> tt.junction.standardize("csadaf")
        'CSADAF'

        Strings that are valid amino acid sequences but do not start and end with
        the appropriate residues will have a C and the appropriate conserved
        trailing residue at position 118 (defaults to F) appended to its beginning
        and end as required.

        >>> tt.junction.standardize("sada")
        'CSADAF'

        The conserved trailing residue can be intelligently inferred if `j_symbol`
        is supplied.

        >>> tt.junction.standardize("sada", j_symbol="TRAJ38*01")
        'CSADAW'

        Furthermore, setting `fix_missing_conserved` to ``False`` will cause these
        cases to be rejected.

        >>> result = tt.junction.standardize("sada", fix_missing_conserved=False)
        Failed to standardize sada: not a valid junction sequence.
        >>> print(result)
        None

    .. topic:: Decision Logic

        To provide an easy way to gauge the scope and limitations of
        standardization, below is a simplified overview of the decision logic
        employed when attempting to standardize a junction sequence.
        For more detail, please refer to the
        `source code <https://github.com/yutanagano/tidytcells>`_.

        .. code-block:: none

            IF input sequence contains non-amino acid symbols:
                set standardization status to failed
                skip rest of standardization

            // inferred using J symbol if supplied
            IF expected trailing residue at position 118 uncertain: {
                IF allow_uncertain_118:
                    accept either F or W
                ELSE:
                    set standardization status to failed
                    skip rest of standardization
            }

            IF input sequence starts (C) and ends (F / W / C) as expected:
                set standardization status to successful
            ELSE: {
                IF fix_missing_conserved:
                    append expected starting and ending residues
                    set standardization status to successful
                ELSE:
                    set standardization status to failed
            }

            IF standardization status is set to successful:
                RETURN standardized sequence
            ELSE:
                IF on_fail is set to "reject":
                    RETURN None
                IF on_fail is set to "keep":
                    RETURN original sequence
    """
    # --- Parameter validation and resolution of deprecated aliases ---------
    seq = Parameter(seq, "seq").throw_error_if_not_of_type(str).value
    j_symbol = (
        Parameter(j_symbol, "j_symbol")
        .throw_error_if_not_of_type(str, optional=True)
        .value
    )
    species = (
        Parameter(species, "species")
        .set_default("homosapiens")
        .throw_error_if_not_of_type(str)
        .value
    )

    # The deprecated flags carry the opposite meaning of their replacements,
    # so they are negated before being offered as aliases. ``None`` (i.e. "not
    # supplied") must stay ``None`` so that it is not mistaken for a value.
    j_strict_inverted = not j_strict if j_strict is not None else None
    allow_uncertain_118 = (
        Parameter(allow_uncertain_118, "allow_uncertain_118")
        .set_default(True)
        .resolve_with_alias(j_strict_inverted, "j_strict")
        .throw_error_if_not_of_type(bool)
        .value
    )

    strict_inverted = not strict if strict is not None else None
    fix_missing_conserved = (
        Parameter(fix_missing_conserved, "fix_missing_conserved")
        .set_default(True)
        .resolve_with_alias(strict_inverted, "strict")
        .throw_error_if_not_of_type(bool)
        .value
    )

    on_fail = (
        Parameter(on_fail, "on_fail")
        .set_default("reject")
        .throw_error_if_not_of_type(str)
        .value
    )

    suppress_warnings_inverted = (
        not suppress_warnings if suppress_warnings is not None else None
    )
    log_failures = (
        Parameter(log_failures, "log_failures")
        .set_default(True)
        .resolve_with_alias(suppress_warnings_inverted, "suppress_warnings")
        .throw_error_if_not_of_type(bool)
        .value
    )

    # --- Standardization ---------------------------------------------------
    original_input = seq

    # Value handed back on every failure path. Only the exact string "reject"
    # yields None; any other ``on_fail`` value behaves like "keep".
    failure_result = None if on_fail == "reject" else original_input

    # Step 1: the input must be a valid amino acid sequence at all.
    seq = aa.standardize(seq=seq, on_fail="reject", log_failures=log_failures)
    if seq is None:
        return failure_result

    # Step 2: work out which residue is expected at position 118. Without a
    # J symbol the target falls back to F and is treated as uncertain.
    aa_118_target = "F"
    aa_118_certain = False

    if j_symbol:
        species = _utils.clean_and_lowercase(species)
        aa_118_target, aa_118_certain = get_conserved_aa_for_j_symbol_for_species(
            j_symbol, species, log_failures=log_failures
        )

    if aa_118_certain:
        junction_matching_regex = re.compile(rf"^C[A-Z]*{aa_118_target}$")
    elif allow_uncertain_118:
        logger.info(
            f"Unclear residue at position 118 (j_symbol = {j_symbol}), accepting either F or W."
        )
        junction_matching_regex = re.compile(r"^C[A-Z]*[FW]$")
    else:
        # Report this failure like every other one instead of failing
        # silently when the caller asked for failures to be logged.
        if log_failures:
            logger.warning(
                f"Failed to standardize {original_input}: could not determine "
                "the conserved residue at position 118 with certainty."
            )
        return failure_result

    # Step 3: check the conserved leading / trailing residues.
    if junction_matching_regex.match(seq):
        return seq

    if not fix_missing_conserved:
        if log_failures:
            logger.warning(
                f"Failed to standardize {original_input}: not a valid junction sequence."
            )
        return failure_result

    # Both conserved residues are added unconditionally: the input is taken
    # to be the sequence without either conserved end, even if it happens to
    # start with C or end with the target residue already.
    return "C" + seq + aa_118_target
def standardise(*args, **kwargs) -> Optional[str]:
    """
    Alias for :py:func:`tidytcells.junction.standardize`.

    All positional and keyword arguments are forwarded unchanged, and the
    result of that call is returned as-is.
    """
    standardized = standardize(*args, **kwargs)
    return standardized