Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,13 @@ sub fetch_input {
$var_dba->dbc->do(qq/DELETE pfp.* FROM protein_function_predictions pfp, attrib a WHERE pfp.analysis_attrib_id = a.attrib_id AND a.value = 'cadd'/);
}
if ($dbnsfp_run_type == FULL ) {
$var_dba->dbc->do(qq/DELETE pfp.* FROM protein_function_predictions pfp, attrib a WHERE pfp.analysis_attrib_id = a.attrib_id AND a.value IN ('dbnsfp_cadd', 'dbnsfp_meta_lr', 'dbnsfp_mutation_assessor', 'dbnsfp_revel')/);
$var_dba->dbc->do(qq/DELETE pfp.* FROM protein_function_predictions pfp, attrib a WHERE pfp.analysis_attrib_id = a.attrib_id AND a.value IN (
'dbnsfp_cadd',
'dbnsfp_meta_lr',
'dbnsfp_mutation_assessor',
'dbnsfp_revel',
'dbnsfp_alphamissense',
'dbnsfp_esm1b')/);
}
# Also truncate the protein_function_prediction + _attrib tables if in sift FULL mode
if ($sift_run_type == FULL) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,12 @@ sub default_options {
dbnsfp_max_workers => 50,
dbnsfp_working => $self->o('species_dir').'/dbnsfp_working',
dbnsfp_annotation => { GRCh37 =>
{ file => $self->o('variation_data') . '/dbNSFP/4.9a/dbNSFP4.9a_grch37.gz',
version => '4.9a',
{ file => $self->o('variation_data') . '/dbNSFP/5.2a/dbNSFP5.2a_grch37.gz',
version => '5.2a',
},
GRCh38 =>
{ file => $self->o('variation_data') . '/dbNSFP/4.9a/dbNSFP4.9a_grch38.gz',
version => '4.9a',
{ file => $self->o('variation_data') . '/dbNSFP/5.2a/dbNSFP5.2a_grch38.gz',
version => '5.2a',
}
},
cadd_run_type => NONE,
Expand Down
19 changes: 18 additions & 1 deletion modules/Bio/EnsEMBL/Variation/ProteinFunctionPredictionMatrix.pm
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,17 @@ my $PREDICTION_TO_VAL = {
'likely disease causing' => 0,
'likely benign' => 1,
},
dbnsfp_alphamissense => {
'likely benign' => 0,
'benign' => 1,
'ambiguous' => 2,
'likely pathogenic' => 3,
'pathogenic' => 4,
},
dbnsfp_esm1b => {
'tolerated' => 0,
'deleterious' => 1,
},
dbnsfp_meta_lr => {
'tolerated' => 0,
'damaging' => 1,
Expand Down Expand Up @@ -519,8 +530,11 @@ sub prediction_to_short {
# this value

my $val = $prob;
if ($self->{analysis} eq 'dbnsfp_esm1b') {
$val = ($val + 50) / 100; # convert to 0-1 scale
}
if ($self->{analysis} ne 'cadd') {
$val = $prob * 1000;
$val = $val * 1000;
}

# we store the prediction in the top $NUM_PRED_BITS bits
Expand Down Expand Up @@ -578,6 +592,9 @@ sub prediction_from_short {
if ($self->{analysis} ne 'cadd') {
$prob = $prob / 1000;
}
if ($self->{analysis} eq 'dbnsfp_esm1b') {
$prob = ($prob * 100) - 50; # convert to (-50)-50 scale
}

printf("pfs: 0x%04x => $pred ($prob)\n", $val) if $DEBUG;

Expand Down
64 changes: 64 additions & 0 deletions modules/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,38 @@ sub dbnsfp_revel_prediction {
return $self->_prediction('dbnsfp_revel_prediction', $dbnsfp_revel_prediction);
}

=head2 dbnsfp_alphamissense_prediction

Description: Return the qualitative AlphaMissense prediction for the effect of this allele.
(Note that we currently only have predictions for variants that
result in single amino acid substitutions in human)
Returntype : string (one of 'likely benign', 'benign', 'ambiguous', 'likely pathogenic', 'pathogenic').
Exceptions : none
Status : At Risk

=cut

sub dbnsfp_alphamissense_prediction {
my ($self, $dbnsfp_alphamissense_prediction) = @_;
return $self->_prediction('dbnsfp_alphamissense_prediction', $dbnsfp_alphamissense_prediction);
}

=head2 dbnsfp_esm1b_prediction

Description: Return the qualitative ESM1b prediction for the effect of this allele.
(Note that we currently only have predictions for variants that
result in single amino acid substitutions in human)
Returntype : string (one of 'tolerated' or 'deleterious').
Exceptions : none
Status : At Risk

=cut

sub dbnsfp_esm1b_prediction {
my ($self, $dbnsfp_esm1b_prediction) = @_;
return $self->_prediction('dbnsfp_esm1b_prediction', $dbnsfp_esm1b_prediction);
}

=head2 dbnsfp_meta_lr_prediction

Description: Return the qualitative MetaLR prediction for the effect of this allele.
Expand Down Expand Up @@ -1171,6 +1203,38 @@ sub dbnsfp_revel_score {
return $self->_score('dbnsfp_revel_score');
}

=head2 dbnsfp_alphamissense_score

Description: Return the AlphaMissense score for this allele. The score is retrieved from dbNSFP. (We only
have predictions for variants that result in single amino acid substitutions in human)
Returntype : float if this is a missense change and a prediction is
available, undef otherwise
Exceptions : none
Status : At Risk

=cut

sub dbnsfp_alphamissense_score {
my ($self, $dbnsfp_alphamissense_score) = @_;
return $self->_score('dbnsfp_alphamissense_score', $dbnsfp_alphamissense_score);
}

=head2 dbnsfp_esm1b_score

Description: Return the ESM1b score for this allele. The score is retrieved from dbNSFP. (We only
have predictions for variants that result in single amino acid substitutions in human)
Returntype : float if this is a missense change and a prediction is
available, undef otherwise
Exceptions : none
Status : At Risk

=cut

sub dbnsfp_esm1b_score {
my ($self, $dbnsfp_esm1b_score) = @_;
return $self->_score('dbnsfp_esm1b_score', $dbnsfp_esm1b_score);
}

=head2 dbnsfp_meta_lr_score

Description: Return the MetaLR score for this allele. The score is retrieved from dbNSFP. (We only
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ sub run {

my $all_triplets = $self->get_triplets($translation_stable_id);

$self->load_predictions_for_triplets($all_triplets);
$self->load_predictions_for_triplets($all_triplets, $transcript);

$self->store_protein_matrix($translation_stable_id, $translation_md5) if ($self->{'pipeline_mode'});

Expand Down Expand Up @@ -161,7 +161,7 @@ sub amino_acids {

=head2 analysis
Arg 1 : Arrayref of string analysis (optional)
Example : $self->analysis([qw/dbnsfp_revel dbnsfp_meta_lr dbnsfp_mutation_assessor/]);
Example : $self->analysis([qw/dbnsfp_revel dbnsfp_alphamissense dbnsfp_esm1b/]);
Description: Set and get available analysis.
Returntype :
Exceptions : None
Expand Down
129 changes: 123 additions & 6 deletions modules/Bio/EnsEMBL/Variation/Utils/DbNSFPProteinFunctionAnnotation.pm
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ limitations under the License.

Module is used in protein function prediction pipeline for
annotating all possible amino acid substitutions in a translation
with dbNSFP (revel, meta_lr and mutation_assessor) scores and predictions.
with dbNSFP (until 4.9a - revel, meta_lr and mutation_assessor and
after 4.9a - revel, alphamissense and esm1b) scores and predictions.

=cut

Expand All @@ -43,6 +44,8 @@ package Bio::EnsEMBL::Variation::Utils::DbNSFPProteinFunctionAnnotation;
use Bio::EnsEMBL::Variation::Utils::BaseProteinFunctionAnnotation;
our @ISA = ('Bio::EnsEMBL::Variation::Utils::BaseProteinFunctionAnnotation');

use List::MoreUtils qw(firstidx);

my $REVEL_CUTOFF = 0.5;

=head2 new
Expand All @@ -69,7 +72,7 @@ sub new {

my $self = $class->SUPER::new(@_);

my @versions = ('3.5a', '4.0a', '4.1a', '4.2a', '4.3a', '4.4a', '4.5c', '4.6c', '4.7c', '4.8c', '4.9c', '4.9a');
my @versions = ('3.5a', '4.0a', '4.1a', '4.2a', '4.3a', '4.4a', '4.5c', '4.6c', '4.7c', '4.8c', '4.9c', '4.9a', '5.2c', '5.2a');
if (! grep {$_ eq $self->annotation_file_version} @versions) {
die "dbNSFP version " . $self->annotation_file_version . " is not supported.";
}
Expand All @@ -79,13 +82,30 @@ sub new {
# include extra scores if using academic licenced file
@analysis = qw/dbnsfp_revel/;
}
push @analysis, qw/dbnsfp_meta_lr dbnsfp_mutation_assessor/;
if ($self->annotation_file_version =~ /^[3|4]/) {
# for dbNSFP versions 3 and 4, include meta lr and mutation assessor
push @analysis, qw/dbnsfp_meta_lr dbnsfp_mutation_assessor/;
}
else {
push @analysis, qw/dbnsfp_alphamissense dbnsfp_esm1b/;
}
$self->analysis(\@analysis);

return $self;
}

my $predictions = {
dbnsfp_alphamissense => {
LB => 'likely benign',
B => 'benign',
A => 'ambiguous',
LP => 'likely pathogenic',
P => 'pathogenic',
},
dbnsfp_esm1b => {
T => 'tolerated',
D => 'deleterious',
},
dbnsfp_meta_lr => {
T => 'tolerated',
D => 'damaging',
Expand Down Expand Up @@ -380,11 +400,61 @@ my $column_names = {
},
},
},
'5.2c' => {
assembly_unspecific => {
chr => '#chr',
ref => 'ref',
refcodon => 'refcodon',
alt => 'alt',
aaalt => 'aaalt',
aaref => 'aaref',
transcripts => 'Ensembl_transcriptid',
revel_score => 'undef', # it is in the file, but not used in the pipeline
alphamissense_score => 'AlphaMissense_score',
alphamissense_pred => 'AlphaMissense_pred',
esm1b_score => 'ESM1b_score',
esm1b_pred => 'ESM1b_pred',
},
'assembly_specific' => {
'GRCh37' => {
pos => 'hg19_pos(1-based)'
},
'GRCh38' => {
pos => 'pos(1-based)'
},
},
},
'5.2a' => {
assembly_unspecific => {
chr => '#chr',
ref => 'ref',
refcodon => 'refcodon',
alt => 'alt',
aaalt => 'aaalt',
aaref => 'aaref',
transcripts => 'Ensembl_transcriptid',
revel_score => 'REVEL_score',
alphamissense_score => 'AlphaMissense_score',
alphamissense_pred => 'AlphaMissense_pred',
esm1b_score => 'ESM1b_score',
esm1b_pred => 'ESM1b_pred',
},
'assembly_specific' => {
'GRCh37' => {
pos => 'hg19_pos(1-based)'
},
'GRCh38' => {
pos => 'pos(1-based)'
},
},
},
};

sub load_predictions_for_triplets {
my $self = shift;
my $triplets = shift;
my $triplets = shift;
my $transcript = shift;

foreach my $entry (@$triplets) {
my $aa = $entry->{aa};
$self->amino_acids($aa);
Expand All @@ -401,6 +471,7 @@ sub load_predictions_for_triplets {
next if (!defined $iter);
while (my $line = $iter->next) {
my $data = $self->get_dbNSFP_row($line);
$self->pick_transcript_specific_data($data, $transcript) if $transcript;
my $chr = $data->{'chr'};
my $pos = $data->{'pos'};
my $ref = $data->{'ref'};
Expand All @@ -425,11 +496,20 @@ sub add_predictions {
my $prediction = ($data->{revel_score} >= $REVEL_CUTOFF) ? 'likely disease causing' : 'likely benign';
$self->add_prediction($i, $mutated_aa, 'dbnsfp_revel', $data->{revel_score}, $prediction);
}
if ($data->{meta_lr_score} ne '.') {
if (defined $data->{alphamissense_score} && $data->{alphamissense_score} ne '.') {
my $prediction = $predictions->{dbnsfp_alphamissense}->{$data->{alphamissense_pred}};
$self->add_prediction($i, $mutated_aa, 'dbnsfp_alphamissense', $data->{alphamissense_score}, $prediction);
}
if (defined $data->{esm1b_score} && $data->{esm1b_score} ne '.') {
my $prediction = $predictions->{dbnsfp_esm1b}->{$data->{esm1b_pred}};
my $score = sprintf '%.1f', $data->{esm1b_score}; # round up so that we can have accuracy upto 1 decimal place
$self->add_prediction($i, $mutated_aa, 'dbnsfp_esm1b', $score, $prediction);
}
if (defined $data->{meta_lr_score} && $data->{meta_lr_score} ne '.') {
my $prediction = $predictions->{dbnsfp_meta_lr}->{$data->{meta_lr_pred}};
$self->add_prediction($i, $mutated_aa, 'dbnsfp_meta_lr', $data->{meta_lr_score}, $prediction);
}
if ($data->{mutation_assessor_score} ne '.') {
if (defined $data->{mutation_assessor_score} && $data->{mutation_assessor_score} ne '.') {
my $prediction;
if ($self->annotation_file_version eq '3.5a') {
$prediction = $predictions->{dbnsfp_mutation_assessor}->{$data->{mutation_assessor_pred}};
Expand All @@ -453,6 +533,43 @@ sub add_predictions {
}
}

=head2 pick_transcript_specific_data

Arg 1 : Hashref $data from parser
Arg 2 : Bio::EnsEMBL::Transcript $transcript
Description: - Check if the data is for multiple transcripts
- If not, return the data as is
- If yes, split data and pick the right value for the specific transcript
Returntype : Hashref mapping header column to single value from multiple ; delimited values
Exceptions : None
Caller : load_predictions_for_triplets()
Status :
=cut
sub pick_transcript_specific_data {
my ($self, $data, $transcript) = @_;

# speedy return in case nothing to do
return unless grep(defined && /;/, values %$data);

return unless defined $data->{transcripts} && $data->{transcripts} =~ /;/;

# determing the target transcript index in row data value
my $target_transcript = $transcript->stable_id;
my @transcripts_in_data = split(/;/, $data->{transcripts});
my $transcript_index = firstidx {defined && $_ eq $target_transcript} @transcripts_in_data;

# return the specific data for the target transcript and undef if data for the transcript is not available
foreach my $key (keys %$data) {
next unless defined $data->{$key} && $data->{$key} =~ /;/;

my @values = split(/;/, $data->{$key});
$data->{$key} = $transcript_index <= $#values ? $values[$transcript_index] : undef;

$data->{$key} = undef if $transcript_index == -1; # transcript not found in the data
}
return $data;
}

=head2 get_dbNSFP_row

Arg 1 : String $line from parser
Expand Down
Loading