# Source code for tidytcells.junction._standardize
import logging
import re
from tidytcells import aa
from typing import Literal, Optional
from tidytcells._utils.parameter import Parameter
# Module-level logger; standardize() reports rejected junction sequences through it.
logger = logging.getLogger(__name__)

# Matches a sequence of upper-case letters that starts with C and ends with F or W.
# Written as a raw string (the original was an f-string without placeholders);
# the pattern text itself is unchanged.
JUNCTION_MATCHING_REGEX = re.compile(r"^C[A-Z]*[FW]$")
# [docs]
def standardize(
    seq: str,
    strict: Optional[bool] = None,
    on_fail: Optional[Literal["reject", "keep"]] = None,
    log_failures: Optional[bool] = None,
    suppress_warnings: Optional[bool] = None,
) -> Optional[str]:
    """
    Ensures that a string value looks like a valid junction (CDR3) amino acid sequence.
    This function is a special variant of :py:func:`tidytcells.aa.standardize`.

    A valid junction sequence must:

    1. Be a valid amino acid sequence
    2. Begin with a cysteine (C)
    3. End with a phenylalanine (F) or a tryptophan (W)

    :param seq:
        String value representing a junction sequence.
    :type seq:
        str
    :param strict:
        If ``True``, any string that does not look like a junction sequence is rejected.
        If ``False``, any inputs that are valid amino acid sequences but do not start with C and end with F/W are not rejected and instead are corrected by having a C prepended to the beginning and an F appended at the end, as required.
        Defaults to ``False``.
    :type strict:
        bool
    :param on_fail:
        Behaviour when standardization fails.
        If set to ``"reject"``, returns ``None`` on failure.
        If set to ``"keep"``, returns the original input.
        Defaults to ``"reject"``.
    :type on_fail:
        str
    :param log_failures:
        Report standardisation failures through logging (at level ``WARNING``).
        Defaults to ``True``.
    :type log_failures:
        bool
    :param suppress_warnings:
        Disable warnings that are usually logged when standardisation fails.
        Deprecated in favour of `log_failures`.
    :type suppress_warnings:
        bool

    :return:
        If possible, a standardized version of the input string is returned.
        If the input string cannot be standardized, the function follows the behaviour as set by `on_fail`.
    :rtype:
        Union[str, None]

    :raises ValueError:
        If `on_fail` is a string other than ``"reject"`` or ``"keep"``.

    .. topic:: Example usage

        Strings that look like junction sequences will be accepted, and returned in capitalised form.

        >>> tt.junction.standardize("csadaf")
        'CSADAF'

        Strings that are valid amino acid sequences but do not start and end with the appropriate residues will have a C and an F added to the beginning and end as required.

        >>> tt.junction.standardize("sada")
        'CSADAF'

        However, setting `strict` to ``True`` will cause these cases to be rejected.

        >>> result = tt.junction.standardize("sada", strict=True)
        Failed to standardize sada: not a valid junction sequence.
        >>> print(result)
        None

    .. topic:: Decision Logic

        To provide an easy way to gauge the scope and limitations of standardization, below is a simplified overview of the decision logic employed when attempting to standardize a junction sequence.
        For more detail, please refer to the `source code <https://github.com/yutanagano/tidytcells>`_.

        .. code-block:: none

            IF on_fail is neither "reject" nor "keep":
                RAISE error

            IF input sequence contains non-amino acid symbols:
                set standardization status to failed

            IF input sequence does not start with C and end with W / F:
                IF strict is set to True:
                    set standardization status to failed
                ELSE:
                    add C to the beginning and F to the end of the input sequence as required
                    set standardization status to successful
            ELSE:
                set standardization status to successful

            IF standardization status is set to successful:
                RETURN standardized sequence
            ELSE:
                IF on_fail is set to "reject":
                    RETURN None
                IF on_fail is set to "keep":
                    RETURN original sequence
    """
    # Argument defaults and type checks are delegated to the Parameter helper.
    seq = Parameter(seq, "seq").throw_error_if_not_of_type(str).value
    strict = (
        Parameter(strict, "strict")
        .set_default(False)
        .throw_error_if_not_of_type(bool)
        .value
    )
    on_fail = (
        Parameter(on_fail, "on_fail")
        .set_default("reject")
        .throw_error_if_not_of_type(str)
        .value
    )
    # The type check above accepts any string. Without this guard a misspelt
    # value (e.g. "Reject") would silently behave like "keep".
    if on_fail not in ("reject", "keep"):
        raise ValueError(
            f'on_fail must be one of "reject" or "keep", got {on_fail!r}.'
        )

    # suppress_warnings is the deprecated, inverted spelling of log_failures;
    # None is preserved so that "not supplied" stays distinguishable.
    suppress_warnings_inverted = (
        not suppress_warnings if suppress_warnings is not None else None
    )
    log_failures = (
        Parameter(log_failures, "log_failures")
        .set_default(True)
        .resolve_with_alias(suppress_warnings_inverted, "suppress_warnings")
        .throw_error_if_not_of_type(bool)
        .value
    )

    original_input = seq

    # First pass: generic amino acid standardisation. A None result means the
    # input was rejected there (on_fail="reject" is requested for this call).
    seq = aa.standardize(seq=seq, on_fail="reject", log_failures=log_failures)

    if seq is None:
        return None if on_fail == "reject" else original_input

    if JUNCTION_MATCHING_REGEX.match(seq):
        return seq

    if strict:
        if log_failures:
            logger.warning(
                f"Failed to standardize {original_input}: not a valid junction sequence."
            )
        return None if on_fail == "reject" else original_input

    # Lenient mode: add the missing C, then re-check the pattern and add an F
    # if the sequence still does not match.
    if not seq.startswith("C"):
        seq = "C" + seq
    if not JUNCTION_MATCHING_REGEX.match(seq):
        seq = seq + "F"

    return seq
# [docs]
def standardise(*args, **kwargs):
    """
    British-spelling alias of :py:func:`tidytcells.junction.standardize`.

    All positional and keyword arguments are forwarded unchanged, and the value produced by that call is returned as-is.
    """
    result = standardize(*args, **kwargs)
    return result