Source code for tidytcells.mh._standardize

import logging
from tidytcells import _utils
from tidytcells._utils import Parameter
from tidytcells._standardized_gene_symbol import (
    HlaSymbolStandardizer,
    MusMusculusMhSymbolStandardizer,
)
from typing import Dict, Optional, Type, Union

from tidytcells.result._mh_gene import MhGene

logger = logging.getLogger(__name__)


# Maps each supported (cleaned, lower-cased) species key to the standardizer
# class that handles MH symbols for that species.
# Insertion order matters: ``standardize(..., species="any")`` iterates this
# mapping in order, so *Homo sapiens* is attempted before *Mus musculus*.
SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS: Dict[
    str,
    Type[Union[HlaSymbolStandardizer, MusMusculusMhSymbolStandardizer]],
] = {
    "homosapiens": HlaSymbolStandardizer,
    "musmusculus": MusMusculusMhSymbolStandardizer,
}


def standardize(
    symbol: Optional[str] = None,
    species: Optional[str] = None,
    database: Optional[str] = None,
    log_failures: Optional[bool] = None,
    gene: Optional[str] = None,
    suppress_warnings: Optional[bool] = None,
) -> MhGene:
    """
    Attempt to standardize an MH gene / allele symbol to be IMGT-compliant.

    .. TODO: update docs once MRO mapping available

    .. topic:: Supported species

        - ``"homosapiens"``
        - ``"musmusculus"``

    .. note::
        This function will only verify the validity of an MH gene/allele up to the level of the protein.
        Any further precise allele designations will not be verified, apart from the requirement that the format (colon-separated numbers) look valid.
        The reasons for this are firstly because new alleles at that level are added to the IMGT list quite often and so accurate verification is difficult, secondly because people rarely need verification to such a precise level, and finally because such verification costs more computational effort with diminishing returns.

    :param symbol:
        Potentially non-standardized MH gene / allele symbol.
    :type symbol:
        str
    :param species:
        Can be specified to standardize to an MH symbol that is known to be valid for that species (see above for supported species).
        If set to ``"any"``, then first attempts standardization for *Homo sapiens*, then *Mus musculus*.
        Defaults to ``"homosapiens"``.

        .. note::
            From version 3, the default behaviour will change to ``"any"``.
    :type species:
        str
    :param database:
        Which gene database to use.
        Defaults to ``"MRO"``, alternatively, ``"IMGT"`` can be selected.
        Note that IMGT uses a non-standard representation of mouse MH genes, and using MRO is therefore recommended.
        See also: https://github.com/IEDB/MRO
    :type database:
        str
    :param log_failures:
        Report standardization failures through logging (at level ``WARNING``).
        Defaults to ``True``.
    :type log_failures:
        bool
    :param gene:
        Alias for `symbol`.
    :type gene:
        str
    :param suppress_warnings:
        Disable warnings that are usually logged when standardization fails.
        Deprecated in favour of `log_failures`.
    :type suppress_warnings:
        bool

    :return:
        A standardized MHC gene wrapped in a :py:class:`~tidytcells.result.MhGene` object.
        For details on how to use this output, please refer to the class documentation.
    :rtype:
        :py:class:`~tidytcells.result.MhGene`

    .. topic:: Example usage

        MH standardized results will be returned as a :py:class:`~tidytcells.result.MhGene` (or :py:class:`~tidytcells.result.HLAGene` for human genes).
        When standardization is a success, attributes 'allele', 'protein' and 'gene' can be used to retrieve the corrected information.

        >>> result = tt.mh.standardize("HLA-DRB3*01:01:02:01")
        >>> result.is_standardized
        True
        >>> result.allele
        'HLA-DRB3*01:01:02:01'
        >>> result.protein
        'HLA-DRB3*01:01'
        >>> result.gene
        'HLA-DRB3'

        Attributes 'allele', 'protein' and 'gene' only return a result if the symbol could be standardized up to that level.
        Attribute 'symbol' is never None for a successful standardization, and always returns the most detailed available result between 'allele', 'protein' and 'gene'.

        >>> tt.mh.standardize("HLA-DRB3*01:01:02:01").symbol
        'HLA-DRB3*01:01:02:01'
        >>> tt.mh.standardize("HLA-DRB3").allele
        None
        >>> tt.mh.standardize("HLA-DRB3").gene
        'HLA-DRB3'
        >>> tt.mh.standardize("HLA-DRB3").symbol
        'HLA-DRB3'

        Non-standardized input strings will intelligently be corrected to IMGT-compliant symbols.

        >>> tt.mh.standardize("A1").allele
        'HLA-A*01'

        *Mus musculus* is a supported species.

        .. TODO: update example with MRO

        >>> tt.mh.standardize("CRW2", species="musmusculus").gene
        'MH1-M5'

        For failed standardizations, the 'error' attribute explains why the standardization failed, and
        the 'attempted_fix' attribute contains the best attempted result found during standardization.

        >>> result = tt.mh.standardize(symbol="HLA-DRB365")
        >>> result.is_standardized
        False
        >>> result.error
        'Nonexistent allele for recognized gene'
        >>> result.attempted_fix
        'HLA-DRB3*65'

        Other available properties are 'original_input', 'species'.

        >>> result.original_input
        'HLA-DRB365'
        >>> result.species
        'homosapiens'

    .. topic:: Decision Logic

        .. TODO: update decision logic once mouse is updated with MRO mapping

        To provide an easy way to gauge the scope and limitations of standardization, below is a simplified overview of the decision logic employed when attempting to standardize an MH symbol.
        For more detail, please refer to the `source code <https://github.com/yutanagano/tidytcells>`_.

        .. code-block:: none

            IF the specified species is not supported for standardization:
                RETURN original symbol without modification

            ELSE:
                // attempt standardization
                {
                    IF symbol is already in IMGT-compliant form:
                        set standardization status as successful
                        skip rest of standardization

                    IF symbol is a known deprecated symbol:
                        overwrite symbol with current IMGT-compliant symbol
                        set standardization status as successful
                        skip rest of standardization

                    // the rest is only applicable when species is set to homo sapiens
                    add "HLA-" to the beginning of the symbol if necessary //e.g. A -> HLA-A
                    replace "Cw" with "C" //e.g. HLA-Cw -> HLA-C
                    add back forgotten asterisks if necessary //e.g. HLA-A01 -> HLA-A*01
                    add back forgotten colons if necessary //e.g. HLA-A*0101 -> HLA-A*01:01

                    If symbol is now in IMGT-compliant form:
                        set standardization status as successful
                        skip rest of standardization

                    try adding or subtracting leading zeros from allele designation numbers //e.g. HLA-A*001 -> HLA-A*01

                    If symbol is now in IMGT-compliant form:
                        set standardization status as successful
                        skip rest of standardization

                    set standardization status as failed
                }

                RETURN :py:class:`~tidytcells.result.MhGene`
    """
    # NOTE(review): the ``Parameter`` validators below presumably raise on
    # invalid input (wrong type / value outside the allowed set); the exact
    # exception types are defined in ``tidytcells._utils`` -- confirm there.

    # Resolve the ``gene`` alias and type-check exactly once.
    symbol = (
        Parameter(symbol, "symbol")
        .resolve_with_alias(gene, "gene")
        .throw_error_if_not_of_type(str)
        .value
    )
    species = (
        Parameter(species, "species")
        .set_default("homosapiens")
        .throw_error_if_not_of_type(str)
        .value
    )
    # Label the parameter with its real name so that validation errors point
    # at ``database`` rather than ``species``.
    # NOTE(review): the validated value is not read anywhere below in this
    # function; it is currently only checked -- confirm whether the
    # standardizers are meant to receive it.
    database = (
        Parameter(database, "database")
        .set_default("MRO")
        .throw_error_if_not_one_of("MRO", "IMGT")
        .value
    )
    # ``suppress_warnings`` is the deprecated inverse of ``log_failures``;
    # flip it (when given) so it can be used as an alias value.
    suppress_warnings_inverted = (
        not suppress_warnings if suppress_warnings is not None else None
    )
    log_failures = (
        Parameter(log_failures, "log_failures")
        .set_default(True)
        .resolve_with_alias(suppress_warnings_inverted, "suppress_warnings")
        .throw_error_if_not_of_type(bool)
        .value
    )

    species = _utils.clean_and_lowercase(species)

    if species == "any":
        # Fallback result in case no supported species yields an attempt.
        best_attempt_result = MhGene(symbol, "Failed with any species")

        # Use a distinct loop name so the ``species`` argument is not shadowed.
        for (
            candidate_species,
            standardizer_cls,
        ) in SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS.items():
            mh_standardizer = standardizer_cls(symbol)

            if mh_standardizer.result.is_standardized:
                return mh_standardizer.result

            # On overall failure, report the Homo sapiens attempt.
            if candidate_species == "homosapiens":
                best_attempt_result = mh_standardizer.result

        if log_failures:
            _utils.warn_result_failure(
                result=best_attempt_result,
                logger=logger,
            )

        return best_attempt_result

    if species not in SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS:
        if log_failures:
            _utils.warn_unsupported_species(species, "MH", logger)

        return MhGene(symbol, f"Unsupported species: {species}")

    standardizer_cls = SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS[species]
    mh_standardizer = standardizer_cls(symbol)

    if (not mh_standardizer.result.is_standardized) and log_failures:
        _utils.warn_result_failure(
            result=mh_standardizer.result,
            logger=logger,
        )

    return mh_standardizer.result
def standardise(*args, **kwargs) -> MhGene:
    """
    Alias for :py:func:`tidytcells.mh.standardize`.

    All positional and keyword arguments are forwarded unchanged, and the
    result of that call is returned as-is.

    :rtype:
        :py:class:`~tidytcells.result.MhGene`
    """
    forwarded_result = standardize(*args, **kwargs)
    return forwarded_result