Skip to content

Commit fd56919

Browse files
committed
[ancestral] optional validation of aa vs nuc reconstructions
There's been a tension between the two ways we handle translations in augur: the "old" way of inferring ancestral seqs then translating them vs the "new" way of using nextclade to get translated tip-sequences and reconstructing nucleotide & aa seqs independently across the tree. The new way is arguable better, as nextclade has some alignment heuristics around CDSs to produce better translations, but opens the door to having mismatches between the inferred nuc seq and the corresponding AA residue. These differences would be surfaced in Auspice when looking at branch mutations.This commit adds an optional warning to check for such mismatches. A trial n=5.5k H3N2 HA dataset had the following differences: - SigPep: 2 terminal nodes. Median residue mismatch count: 13 (range: 1 - 13) - HA1: 2 terminal nodes and 4 internal nodes. Median residue mismatch count: 3 (range: 1 - 5) - HA2: 1 terminal node and 1 internal node were different. Median residue mismatch count: 2 (range: 1 - 2)
1 parent d7a21d5 commit fd56919

File tree

1 file changed

+57
-1
lines changed

1 file changed

+57
-1
lines changed

augur/ancestral.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from Bio.Align import MultipleSeqAlignment
3333
from Bio.Seq import Seq
3434
from Bio.SeqRecord import SeqRecord
35+
from .translate import safe_translate
3536
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_augur_json, get_json_name, \
3637
genome_features_to_auspice_annotation
3738
from .io.file import open_file
@@ -42,6 +43,7 @@
4243
from .util_support.node_data_file import NodeDataObject
4344
from typing import cast, Any, Callable, TypedDict
4445
from typing_extensions import NotRequired
46+
from math import floor
4547

4648
VCF_Alignment = dict[str, dict[int, str]]
4749

@@ -360,6 +362,9 @@ def register_parser(parent_subparsers):
360362
"Currently only supported for FASTA-input. Specify the file name via a "
361363
"template like 'aa_sequences_%%GENE.fasta' where %%GENE will be replaced "
362364
"by the gene name.")
365+
amino_acid_options_group.add_argument('--report-inconsistent-translation', action="store_true",
366+
help='Report where amino acid reconstruction differed from a translation of the reconstructed nuc sequence')
367+
363368

364369
# ----------------------------- OUTPUTS -----------------------------
365370
output_group = parser.add_argument_group(
@@ -463,6 +468,53 @@ def _to_ancestral_json(anc: Ancestral_Reconstruction) -> Ancestral_JSON:
463468
j['mask'] = "".join(['1' if x else '0' for x in anc["mutations"]["mask"]])
464469
return j
465470

471+
def _validate_translated_consistency(anc_seqs, aa_result, feat, gene, T):
472+
"""Compare the independently reconstructed AA sequences with translations
473+
of the reconstructed nucleotide sequences for a given gene. Reports any
474+
inconsistencies as warnings.
475+
476+
Parameters
477+
----------
478+
anc_seqs : Ancestral_JSON
479+
Contains reconstructed nucleotide sequences for all nodes.
480+
aa_result : Ancestral_Reconstruction
481+
The independent AA reconstruction for this gene.
482+
feat : Bio.SeqFeature.SeqFeature
483+
The gene feature, used to extract the nucleotide region.
484+
gene : str
485+
Gene name, for reporting.
486+
T : Bio.Phylo.BaseTree.Tree
487+
The tree (used to iterate over all nodes).
488+
"""
489+
nodes_with_differences = {'tips': 0, 'internal': 0}
490+
difference_counts: list[int] = []
491+
492+
for node in T.find_clades():
493+
nuc_seq = anc_seqs['nodes'].get(node.name, {}).get('sequence')
494+
assert nuc_seq is not None
495+
nuc_translated = safe_translate(str(feat.extract(Seq(nuc_seq))))
496+
# re-reconstruct the inferred protein sequence since we don't store them on anc_seqs
497+
aa_seq = aa_result['tt'].sequence(node, as_string=True, reconstructed=True)
498+
assert len(aa_seq)==len(nuc_translated)
499+
500+
if nuc_translated != aa_seq:
501+
nodes_with_differences["tips" if node.is_terminal() else "internal"] += 1
502+
difference_counts.append(sum(c1!=c2 for c1,c2 in zip(nuc_translated, aa_seq)))
503+
504+
if len(difference_counts) > 0:
505+
difference_counts.sort()
506+
median = difference_counts[floor(len(difference_counts)/2)]
507+
print(
508+
f"WARNING: gene {gene!r} has differences between the independently reconstructed"
509+
f" amino acid sequence and a translation of the reconstructed nucleotide sequence."
510+
f" {nodes_with_differences['tips']} terminal nodes and"
511+
f" {nodes_with_differences['internal']} internal nodes were different."
512+
f" Median residue mismatch count: {median:,}"
513+
f" (range: {difference_counts[0]:,} - {difference_counts[-1]:,})",
514+
file=sys.stderr,
515+
)
516+
517+
466518
def reconstruct_translations(
467519
anc_seqs: Ancestral_JSON,
468520
ref: str|None,
@@ -475,6 +527,7 @@ def reconstruct_translations(
475527
marginal: bool,
476528
rng_seed: int,
477529
output_fname_pattern: str|None,
530+
report_inconsistent_translation: bool,
478531
):
479532
genes = parse_genes_argument(genes_arg)
480533
if genes is None or not len(genes):
@@ -521,6 +574,9 @@ def reconstruct_translations(
521574
node["aa_sequences"] = {}
522575

523576
node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)
577+
578+
if report_inconsistent_translation:
579+
_validate_translated_consistency(anc_seqs, aa_result, feat, gene, T)
524580

525581
anc_seqs['reference'][gene] = aa_result['root_seq']
526582
anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, annotation_fname))
@@ -568,7 +624,7 @@ def run(args: argparse.Namespace):
568624
assert not is_vcf # guaranteed by validate_arguments() but good to double check
569625
reconstruct_translations(anc_seqs, ref, T, args.genes, args.annotation, args.translations,
570626
infer_ambiguous, fill_overhangs, marginal_inference, rng_seed,
571-
args.output_translations)
627+
args.output_translations, args.report_inconsistent_translation)
572628

573629

574630
out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')

0 commit comments

Comments
 (0)