Source code for tidytcells.tr._standardize

from typing import Dict, Optional, Type

from tidytcells import _utils
from tidytcells._utils import Parameter
from tidytcells._standardized_gene_symbol import (
    StandardizedGeneSymbol,
    StandardizedHomoSapiensTrSymbol,
    StandardizedMusMusculusTrSymbol,
)


SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS: Dict[str, Type[StandardizedGeneSymbol]] = {
    "homosapiens": StandardizedHomoSapiensTrSymbol,
    "musmusculus": StandardizedMusMusculusTrSymbol,
}


[docs] def standardize( gene: Optional[str] = None, species: str = "homosapiens", enforce_functional: bool = False, precision: str = "allele", on_fail: str = "reject", suppress_warnings: bool = False, ) -> str: """ Attempt to standardize a TR gene name to be IMGT-compliant. .. topic:: Supported species - ``"homosapiens"`` - ``"musmusculus"`` :param gene: Potentially non-standardized TR gene name. :type gene: str :param species: Species to which the TR gene belongs (see above for supported species). Defaults to ``"homosapiens"``. :type species: str :param enforce_functional: If ``True``, disallows TR genes that are recognised by IMGT but are marked as non-functional (ORF or pseudogene). Defaults to ``False``. :type enforce_functional: bool :param precision: The maximum level of precision to standardize to. ``"allele"`` standardizes to the maximum precision possible. ``"gene"`` standardizes only to the level of the gene. Defaults to ``"allele"``. :type precision: str :param on_fail: Behaviour when standardization fails. If set to ``"reject"``, returns ``None`` on failure. If set to ``"keep"``, returns the original input. Defaults to ``"reject"``. :type on_fail: str :param suppress_warnings: Disable warnings that are usually emitted when standardisation fails. Defaults to ``False``. :type suppress_warnings: bool :return: If the specified ``species`` is supported, and ``gene`` could be standardized, then return the standardized gene name. If ``species`` is unsupported, then the function does not attempt to standardize , and returns the unaltered ``gene`` string. Else follows the behaviour as set by ``on_fail``. :rtype: Union[str, None] .. topic:: Example usage Input strings will intelligently be corrected to IMGT-compliant gene symbols. >>> tt.tr.standardize("aj1") 'TRAJ1' The ``precision`` setting can truncate unnecessary information. >>> tt.tr.standardize("TRBV6-4*01", precision="gene") 'TRBV6-4' The ``enforce_functional`` setting will cause non-functional genes or alleles to be rejected. >>> result = tt.tr.standardize("TRBV1", enforce_functional=True) UserWarning: Failed to standardize "TRBV1" for species homosapiens: gene has no functional alleles. Attempted fix "TRBV1". >>> print(result) None *Mus musculus* is a supported species. >>> tt.tr.standardize("TCRBV22S1A2N1T", species="musmusculus") 'TRBV2' .. topic:: Decision Logic To provide an easy way to gauge the scope and limitations of standardization, below is a simplified overview of the decision logic employed when attempting to standardize a TR symbol. For more detail, please refer to the `source code <https://github.com/yutanagano/tidytcells>`_. .. code-block:: none IF the specified species is not supported for standardization: RETURN original gene symbol without modification ELSE: // attempt standardization { IF gene symbol is already in IMGT-compliant form: set standardization status as successful skip rest of standardization IF gene symbol is a known deprecated symbol: overwrite gene symbol with current IMGT-compliant symbol set standardization status as successful skip rest of standardization replace "TCR" with "TR" //e.g. TCRAV1-1 -> TRAV1-1 replace "S" with "-" //e.g. TRAV1S1 -> TRAV1-1 replace "." with "-" //e.g. TRAV1.1 -> TRAV1-1 add back any missing backslashes //e.g. TRAV14DV4 -> TRAV14/DV4 remove any unnecessary trailing zeros //e.g. TRAV1-01 -> TRAV1-1 IF gene symbol is now in IMGT-compliant form: set standardization status as successful skip rest of standardization add "TR" to the beginning of the gene symbol if necessary //e.g. AV1-1 -> TRAV1-1 IF gene symbol is now in IMGT-compliant form: set standardization status as successful skip rest of standardization resolve compound TRAV/TRDV designation if necessary //e.g. TRDV4 -> TRAV14/DV4 or TRAV14 -> TRAV14/DV4 IF gene symbol is now in IMGT-compliant form: set standardization as successful skip rest of standardization try adding or removing "-1" from the end of the gene symbol //e.g. TRAV1 -> TRAV1-1 IF gene symbol is now in IMGT-compliant form: set standardization status as successful skip rest of standardization set standardization status as failed } IF standardization status is set to successful: RETURN standardized gene symbol ELSE: IF on_fail is set to "reject": RETURN None IF on_fail is set to "keep": RETURN original gene symbol without modification """ Parameter(gene, "gene").throw_error_if_not_of_type(str) Parameter(species, "species").throw_error_if_not_of_type(str) Parameter(enforce_functional, "enforce_functional").throw_error_if_not_of_type(bool) Parameter(precision, "precision").throw_error_if_not_one_of("allele", "gene") Parameter(on_fail, "on_fail").throw_error_if_not_one_of("reject", "keep") Parameter(suppress_warnings, "suppress_warnings").throw_error_if_not_of_type(bool) species = _utils.clean_and_lowercase(species) species_is_supported = species in SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS if not species_is_supported: if not suppress_warnings: _utils.warn_unsupported_species(species, "TR") return gene StandardizedTrSymbolClass = SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS[species] standardized_tr_symbol = StandardizedTrSymbolClass(gene) invalid_reason = standardized_tr_symbol.get_reason_why_invalid(enforce_functional) if invalid_reason is not None: if not suppress_warnings: _utils.warn_failure( reason_for_failure=invalid_reason, original_input=gene, attempted_fix=standardized_tr_symbol.compile("allele"), species=species, ) if on_fail == "reject": return None return gene return standardized_tr_symbol.compile(precision)
[docs] def standardise(*args, **kwargs): """ Alias for :py:func:`tidytcells.tr.standardize`. """ return standardize(*args, **kwargs)