import logging
from tidytcells import _utils
from tidytcells._utils import Parameter
from tidytcells._standardized_gene_symbol import (
StandardizedSymbol,
StandardizedHomoSapiensIgSymbol,
)
from typing import Dict, Optional, Type, Literal
# Module-level logger; it is handed to the _utils warning helpers whenever a failure is reported.
logger = logging.getLogger(__name__)
# Registry mapping a species name (as produced by _utils.clean_and_lowercase) to the
# StandardizedSymbol subclass used to standardize IG symbols for that species.
SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS: Dict[str, Type[StandardizedSymbol]] = {
"homosapiens": StandardizedHomoSapiensIgSymbol,
}
[docs]
def standardize(
    symbol: Optional[str] = None,
    species: Optional[str] = None,
    enforce_functional: Optional[bool] = None,
    precision: Optional[Literal["allele", "gene"]] = None,
    on_fail: Optional[Literal["reject", "keep"]] = None,
    log_failures: Optional[bool] = None,
    gene: Optional[str] = None,
    suppress_warnings: Optional[bool] = None,
) -> Optional[str]:
    """
    Attempt to standardize an IG gene / allele symbol to be IMGT-compliant.

    .. topic:: Supported species

        - ``"homosapiens"``

    :param symbol:
        Potentially non-standardized IG gene / allele symbol.
    :type symbol:
        str
    :param species:
        Can be specified to standardise to an IG symbol that is known to be valid for that species (see above for supported species).
        Currently, only *Homo sapiens* is supported, but this parameter has been kept to keep the interface compatible with that of its sister function in :py:mod:`tidytcells.tr`.
        Defaults to ``"homosapiens"``.
    :type species:
        str
    :param enforce_functional:
        If ``True``, disallows IG genes / alleles that are recognised by IMGT but are marked as non-functional (ORF or pseudogene).
        Defaults to ``False``.
    :type enforce_functional:
        bool
    :param precision:
        The maximum level of precision to standardize to.
        ``"allele"`` standardizes to the maximum precision possible.
        ``"gene"`` standardizes only to the level of the gene.
        Defaults to ``"allele"``.
    :type precision:
        str
    :param on_fail:
        Behaviour when standardization fails.
        If set to ``"reject"``, returns ``None`` on failure.
        If set to ``"keep"``, returns the original input.
        Defaults to ``"reject"``.
    :type on_fail:
        str
    :param log_failures:
        Report standardisation failures through logging (at level ``WARNING``).
        Defaults to ``True``.
    :type log_failures:
        bool
    :param gene:
        Alias for the parameter `symbol`.
    :type gene:
        str
    :param suppress_warnings:
        Disable warnings that are usually logged when standardisation fails.
        Deprecated in favour of `log_failures`.
    :type suppress_warnings:
        bool

    :return:
        If the specified `species` is supported, and `symbol` could be standardized, then return the standardized symbol name.
        If `species` is unsupported, then the function does not attempt to standardize, and returns the unaltered `symbol` string.
        Else follows the behaviour as set by `on_fail`.
    :rtype:
        Optional[str]

    .. topic:: Example usage

        Input strings will intelligently be corrected to IMGT-compliant gene / allele symbols.

        >>> tt.ig.standardize("lj1")
        'IGLJ1'

        The `precision` setting can truncate unnecessary information.

        >>> tt.ig.standardize("IGHV1-18*02", precision="gene")
        'IGHV1-18'

        The `enforce_functional` setting will cause non-functional genes or alleles to be rejected.

        >>> result = tt.ig.standardize("IGHV1-12", enforce_functional=True)
        Failed to standardize "IGHV1-12" for species homosapiens: gene has no functional alleles. Attempted fix "IGHV1-12".
        >>> print(result)
        None

    .. topic:: Decision Logic

        To provide an easy way to gauge the scope and limitations of standardization, below is a simplified overview of the decision logic employed when attempting to standardize an IG symbol.
        For more detail, please refer to the `source code <https://github.com/yutanagano/tidytcells>`_.

        .. code-block:: none

            IF the specified species is not supported for standardization:
                RETURN original symbol without modification

            ELSE:
                // attempt standardization
                {
                    IF symbol is already in IMGT-compliant form:
                        set standardization status as successful
                        skip rest of standardization

                    IF symbol is a known deprecated symbol:
                        overwrite symbol with current IMGT-compliant symbol
                        set standardization status as successful
                        skip rest of standardization

                    replace "." with "-" //e.g. IGHV1.2 -> IGHV1-2
                    add back any missing forward slashes //e.g. IGHV1OR15-1 -> IGHV1/OR15-1
                    remove any unnecessary trailing zeros //e.g. IGHV1-02 -> IGHV1-2
                    IF symbol is now in IMGT-compliant form:
                        set standardization status as successful
                        skip rest of standardization

                    add "IG" to the beginning of the symbol if necessary //e.g. HV1-18 -> IGHV1-18
                    IF symbol is now in IMGT-compliant form:
                        set standardization status as successful
                        skip rest of standardization

                    try adding or removing "-1" from the end of the symbol //e.g. IGHV6 -> IGHV6-1
                    IF symbol is now in IMGT-compliant form:
                        set standardization status as successful
                        skip rest of standardization

                    set standardization status as failed
                }

                IF standardization status is set to successful:
                    RETURN standardized symbol

                ELSE:
                    IF on_fail is set to "reject":
                        RETURN None
                    IF on_fail is set to "keep":
                        RETURN original symbol without modification
    """
    # --- Resolve aliases, apply defaults and validate every argument. ---
    symbol = (
        Parameter(symbol, "symbol")
        .resolve_with_alias(gene, "gene")
        .throw_error_if_not_of_type(str)
        .value
    )
    species = (
        Parameter(species, "species")
        .set_default("homosapiens")
        .throw_error_if_not_of_type(str)
        .value
    )
    enforce_functional = (
        Parameter(enforce_functional, "enforce_functional")
        .set_default(False)
        .throw_error_if_not_of_type(bool)
        .value
    )
    precision = (
        Parameter(precision, "precision")
        .set_default("allele")
        .throw_error_if_not_one_of("allele", "gene")
        .value
    )
    on_fail = (
        Parameter(on_fail, "on_fail")
        .set_default("reject")
        .throw_error_if_not_one_of("reject", "keep")
        .value
    )

    # `suppress_warnings` is the deprecated, logically inverted form of
    # `log_failures`; keep None as None so "not supplied" stays distinguishable.
    suppress_warnings_inverted = (
        not suppress_warnings if suppress_warnings is not None else None
    )
    # NOTE(review): the default is applied *before* the alias is resolved. If
    # `set_default` fills in the value, the alias may never take effect when
    # only `suppress_warnings` is passed -- confirm against Parameter.
    log_failures = (
        Parameter(log_failures, "log_failures")
        .set_default(True)
        .resolve_with_alias(suppress_warnings_inverted, "suppress_warnings")
        .throw_error_if_not_of_type(bool)
        .value
    )

    species = _utils.clean_and_lowercase(species)

    if species == "any":
        # Try every registered species in turn and return the first success.
        best_attempt_invalid_reason = None
        best_attempt_standardised_symbol = None
        best_attempt_species = None

        # A dedicated loop variable is used so that the `species` argument is
        # not rebound while iterating.
        for (
            candidate_species,
            StandardizedIgSymbolClass,
        ) in SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS.items():
            standardized_ig_symbol = StandardizedIgSymbolClass(symbol)
            invalid_reason = standardized_ig_symbol.get_reason_why_invalid(
                enforce_functional
            )

            # A reason of None means the symbol is valid for this species.
            if invalid_reason is None:
                return standardized_ig_symbol.compile(precision)

            # The Homo sapiens attempt is preferred for failure reporting. The
            # first failed attempt is kept as a fallback so that reporting does
            # not dereference None should "homosapiens" be absent from the
            # registry.
            if (
                candidate_species == "homosapiens"
                or best_attempt_standardised_symbol is None
            ):
                best_attempt_invalid_reason = invalid_reason
                best_attempt_standardised_symbol = standardized_ig_symbol
                best_attempt_species = candidate_species

        if log_failures:
            _utils.warn_failure(
                reason_for_failure=best_attempt_invalid_reason,
                original_input=symbol,
                attempted_fix=best_attempt_standardised_symbol.compile("allele"),
                species=best_attempt_species,
                logger=logger,
            )

        if on_fail == "reject":
            return None

        return symbol

    if species not in SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS:
        # Unsupported species: no standardization is attempted and the input
        # is handed back unchanged, regardless of `on_fail`.
        if log_failures:
            _utils.warn_unsupported_species(species, "IG", logger)
        return symbol

    StandardizedIgSymbolClass = SUPPORTED_SPECIES_AND_THEIR_STANDARDIZERS[species]
    standardized_ig_symbol = StandardizedIgSymbolClass(symbol)

    invalid_reason = standardized_ig_symbol.get_reason_why_invalid(enforce_functional)

    if invalid_reason is None:
        return standardized_ig_symbol.compile(precision)

    if log_failures:
        # The attempted fix is always reported at allele precision,
        # independently of the requested `precision`.
        _utils.warn_failure(
            reason_for_failure=invalid_reason,
            original_input=symbol,
            attempted_fix=standardized_ig_symbol.compile("allele"),
            species=species,
            logger=logger,
        )

    if on_fail == "reject":
        return None

    return symbol
[docs]
def standardise(*args, **kwargs) -> Optional[str]:
    """
    British-English spelling of :py:func:`tidytcells.ig.standardize`.

    Every positional and keyword argument is forwarded unchanged, and the result of that call is returned as-is.
    """
    outcome = standardize(*args, **kwargs)
    return outcome