# Source code for tidytcells.tr._standardize

import logging
from tidytcells import _utils
from tidytcells.result import ReceptorGene
from tidytcells._utils import Parameter
from tidytcells._standardized_gene_symbol import (
    ReceptorGeneSymbolStandardizer,
    HomoSapiensTrSymbolStandardizer,
    MusMusculusTrSymbolStandardizer,
)
from typing import Dict, Optional, Type

logger = logging.getLogger(__name__)


# Registry mapping each supported species key to the standardizer class that
# handles its TR symbols. Insertion order matters: ``standardize`` iterates
# over this dict when ``species="any"``, so Homo sapiens is attempted before
# Mus musculus.
SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS: Dict[
    str, Type[ReceptorGeneSymbolStandardizer]
] = {
    "homosapiens": HomoSapiensTrSymbolStandardizer,
    "musmusculus": MusMusculusTrSymbolStandardizer,
}


def standardize(
    symbol: Optional[str] = None,
    species: Optional[str] = None,
    enforce_functional: Optional[bool] = None,
    allow_subgroup: Optional[bool] = None,
    log_failures: Optional[bool] = None,
    gene: Optional[str] = None,
    suppress_warnings: Optional[bool] = None,
) -> ReceptorGene:
    """
    Attempt to standardize a TR gene / allele symbol to be IMGT-compliant.

    .. topic:: Supported species

        - ``"homosapiens"``
        - ``"musmusculus"``

    :param symbol:
        Potentially non-standardized TR gene / allele symbol.
    :type symbol:
        str
    :param species:
        Can be specified to standardize to a TR symbol that is known to be valid for that species (see above for supported species).
        If set to ``"any"``, then first attempts standardization for *Homo sapiens*, then *Mus musculus*.
        Defaults to ``"homosapiens"``.

        .. note::
            From version 3, the default behaviour will change to ``"any"``.
    :type species:
        str
    :param enforce_functional:
        If ``True``, disallows TR genes / alleles that are recognised by IMGT but are marked as non-functional (ORF or pseudogene).
        Defaults to ``False``.
    :type enforce_functional:
        bool
    :param allow_subgroup:
        If ``True``, allows valid subgroups (as well as more specific gene/allele symbols) to pass standardization.
        If ``False``, the supplied symbol must point to at least a specific gene.
        Defaults to ``False``.
    :type allow_subgroup:
        bool
    :param log_failures:
        Report standardization failures through logging (at level ``WARNING``).
        Defaults to ``True``.
    :type log_failures:
        bool
    :param gene:
        Alias for the parameter `symbol`.
    :type gene:
        str
    :param suppress_warnings:
        Disable warnings that are usually logged when standardization fails.
        Deprecated in favour of `log_failures`.
    :type suppress_warnings:
        bool

    :return:
        A standardized receptor gene wrapped in a :py:class:`~tidytcells.result.ReceptorGene` object.
        For details on how to use this output, please refer to the class documentation.
    :rtype:
        :py:class:`~tidytcells.result.ReceptorGene`

    .. topic:: Example usage

        TR standardized results will be returned as a :py:class:`~tidytcells.result.ReceptorGene`.
        When standardization is a success, attributes 'allele', 'gene' and 'subgroup' can be used to retrieve the corrected information.

        >>> result = tt.tr.standardize("TRAV1-1*01")
        >>> result.is_standardized
        True
        >>> result.allele
        'TRAV1-1*01'
        >>> result.gene
        'TRAV1-1'
        >>> result.subgroup
        'TRAV1'

        Attributes 'allele', 'gene' and 'subgroup' only return a result if the symbol could be standardized up to that level.
        Attribute 'symbol' is never None for a successful standardization, and always returns the most detailed available result between 'allele', 'gene' and 'subgroup'.

        >>> tt.tr.standardize("TRAV1-1*01").symbol
        'TRAV1-1*01'
        >>> tt.tr.standardize("TRAV1-1").allele
        None
        >>> tt.tr.standardize("TRAV1-1").gene
        'TRAV1-1'
        >>> tt.tr.standardize("TRAV1-1").symbol
        'TRAV1-1'

        Non-standardized input strings will intelligently be corrected to IMGT-compliant gene / allele symbols.

        >>> tt.tr.standardize("aj1").gene
        'TRAJ1'

        The `enforce_functional` setting will cause non-functional genes or alleles to be rejected.
        For failed standardizations, the 'error' attribute explains why the standardization failed, and
        the 'attempted_fix' attribute contains the best attempted result found during standardization.

        >>> result = tt.tr.standardize("tcrBV1", enforce_functional=True)
        >>> result.is_standardized
        False
        >>> result.error
        'Gene has no functional alleles'
        >>> result.attempted_fix
        'TRBV1'

        Known synonyms are included in the standardization

        >>> tt.tr.standardize("V4P").symbol
        'TRGV11'

        *Mus musculus* is a supported species.

        >>> tt.tr.standardize("TRBV1", species="musmusculus").gene
        'TRBV1'

        Other available properties are 'original_input', 'species', 'receptor_type', 'locus' and 'gene_type'.

        >>> result = tt.tr.standardize("TRAV01-01")
        >>> result.symbol
        'TRAV1-1'
        >>> result.original_input
        'TRAV01-01'
        >>> result.species
        'homosapiens'
        >>> result.receptor_type
        'TR'
        >>> result.locus
        'TRA'
        >>> result.gene_type
        'V'

        Utility method 'get_all_alleles' can be used to retrieve all (functional) alleles for a given symbol.

        >>> result = tt.tr.standardize("TRAV1-1")
        >>> result.get_all_alleles()
        ['TRAV1-1*01', 'TRAV1-1*02']
        >>> result = tt.tr.standardize("TRAV15")
        >>> result.get_all_alleles(enforce_functional=True)
        []
        >>> result.get_all_alleles(enforce_functional=False)
        ['TRAV15*02', 'TRAV15*03', 'TRAV15*01']

        Utility method 'get_aa_sequences' can be used to retrieve known amino acid sequences per allele.
        Using sequence_type 'ALL' shows all available sequence data for each allele.

        >>> result = tt.tr.standardize("TRAV1-1*01")
        >>> result.get_aa_sequences(sequence_type="CDR1")
        {'TRAV1-1*01': 'TSGFYG'}
        >>> result.get_aa_sequences(sequence_type="CDR2")
        {'TRAV1-1*01': 'NALDGL'}
        >>> result = tt.tr.standardize("TRAJ15")
        >>> result.get_aa_sequences(sequence_type="ALL")
        {
            'TRAJ15*01': {
                'J-MOTIF': 'FGKG',
                'J-REGION': 'NQAGTALIFGKGTTLSVSS',
                'functionality': 'F'
            },
            'TRAJ15*02': {
                'J-MOTIF': 'FGKG',
                'J-REGION': 'NQAGTALIFGKGTHLSVSS',
                'functionality': 'F'
            }
        }

    .. topic:: Decision Logic

        To provide an easy way to gauge the scope and limitations of standardization, below is a simplified overview of the decision logic employed when attempting to standardize a TR symbol.
        For more detail, please refer to the `source code <https://github.com/yutanagano/tidytcells>`_.

        .. code-block:: none

            0. sanity-check input
            Skip standardization if invalid parameters are passed (e.g. unsupported species)

            1. attempt standardization
            IF symbol is already in IMGT-compliant form:
                set standardization status as successful, skip to step 2

            IF symbol is a known deprecated symbol:
                overwrite symbol with current IMGT-compliant symbol
                set standardization status as successful, skip to step 2

            replace "TCR" with "TR"                         //e.g. TCRAV1-1 -> TRAV1-1
            replace "S" with "-"                            //e.g. TRAV1S1 -> TRAV1-1
            replace "." with "-"                            //e.g. TRAV1.1 -> TRAV1-1
            add back any missing slashes                    //e.g. TRAV14DV4 -> TRAV14/DV4
            remove any unnecessary leading zeros            //e.g. TRAV1-01 -> TRAV1-1
            IF symbol is now in IMGT-compliant form:
                set standardization status as successful, skip to step 2

            add "TR" to the beginning of the symbol if necessary    //e.g. AV1-1 -> TRAV1-1
            IF symbol is now in IMGT-compliant form:
                set standardization status as successful, skip to step 2

            resolve compound TRAV/TRDV designation if necessary     //e.g. TRDV4 -> TRAV14/DV4 or TRAV14 -> TRAV14/DV4
            IF symbol is now in IMGT-compliant form:
                set standardization status as successful, skip to step 2

            try removing "-1" from the end of the symbol            //e.g. TRAV1-1 -> TRAV1
            IF symbol is now a valid IMGT-compliant *gene* (do not correct to subgroup):
                set standardization status as successful, skip to step 2

            set standardization status as failed

            2. finalisation
            IF standardization has not failed:
                consider standardization a success

            RETURN :py:class:`~tidytcells.result.ReceptorGene`
    """
    # Resolve every argument through Parameter so that aliases, defaults and
    # type validation are applied before any standardization work starts.
    symbol = (
        Parameter(symbol, "symbol")
        .resolve_with_alias(gene, "gene")
        .throw_error_if_not_of_type(str)
        .value
    )
    species = (
        Parameter(species, "species")
        .set_default("homosapiens")
        .throw_error_if_not_of_type(str)
        .value
    )
    enforce_functional = (
        Parameter(enforce_functional, "enforce_functional")
        .set_default(False)
        .throw_error_if_not_of_type(bool)
        .value
    )
    allow_subgroup = (
        Parameter(allow_subgroup, "allow_subgroup")
        .set_default(False)
        .throw_error_if_not_of_type(bool)
        .value
    )
    # The deprecated `suppress_warnings` flag has the opposite polarity to
    # `log_failures`, so it is inverted before being used as an alias. ``None``
    # is kept as-is so that "not supplied" stays distinguishable.
    suppress_warnings_inverted = (
        not suppress_warnings if suppress_warnings is not None else None
    )
    log_failures = (
        Parameter(log_failures, "log_failures")
        .set_default(True)
        .resolve_with_alias(suppress_warnings_inverted, "suppress_warnings")
        .throw_error_if_not_of_type(bool)
        .value
    )

    species = _utils.clean_and_lowercase(species)

    if species == "any":
        # Try each supported species in registry order and return the first
        # success. If all fail, report the Homo sapiens attempt.
        best_attempt_result = ReceptorGene(symbol, "Failed with any species")

        # A dedicated loop variable is used so the `species` argument is not
        # silently rebound while iterating.
        for (
            candidate_species,
            standardizer_cls,
        ) in SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS.items():
            tr_standardizer = standardizer_cls(
                symbol,
                enforce_functional=enforce_functional,
                allow_subgroup=allow_subgroup,
            )

            if tr_standardizer.result.is_standardized:
                return tr_standardizer.result

            if candidate_species == "homosapiens":
                best_attempt_result = tr_standardizer.result

        if log_failures:
            _utils.warn_result_failure(
                result=best_attempt_result,
                logger=logger,
            )

        return best_attempt_result

    if species not in SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS:
        if log_failures:
            _utils.warn_unsupported_species(species, "TR", logger)
        return ReceptorGene(symbol, f'Unsupported species: {species}')

    standardizer_cls = SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS[species]
    tr_standardizer = standardizer_cls(
        symbol,
        enforce_functional=enforce_functional,
        allow_subgroup=allow_subgroup,
    )

    if (not tr_standardizer.result.is_standardized) and log_failures:
        _utils.warn_result_failure(
            result=tr_standardizer.result,
            logger=logger,
        )

    return tr_standardizer.result
def standardise(*args, **kwargs) -> ReceptorGene:
    """
    British-English spelling alias for :py:func:`tidytcells.tr.standardize`.

    All positional and keyword arguments are forwarded unchanged, and the result of the call is returned as-is.

    :rtype:
        :py:class:`~tidytcells.result.ReceptorGene`
    """
    return standardize(*args, **kwargs)