Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ baseline commit does not change.
When `--details` is provided, the following additional SAM tags are output for
mapped reads.

`na`: Number of NAMs (seeds) found
`na`: Number of chains found
`mr`: Number of times mate rescue was attempted (local alignment of a mate to
the expected region given its mate)
`al`: Number of times an attempt was made to extend a seed (by gapped or ungapped alignment)
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,7 @@ number of anchors and h is a constant set at 50 by default.
Several command line options allow fine tuning of the chaining behavior: `-H`
controls the chaining look-back window heuristic, `--gd` sets the diagonal gap cost,
`--gl` sets the gap length cost, `--vp` determines the best-chain score threshold,
and `--sg` controls the maximum skip distance on the reference. The previous NAM method
remains available via the `--nams` flag.
and `--sg` controls the maximum skip distance on the reference.

## Changelog

Expand Down
93 changes: 46 additions & 47 deletions src/nam.rs → src/chain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,30 +7,29 @@ use log::trace;

use crate::chainer::Anchor;
use crate::chainer::Chainer;
use crate::details::NamDetails;
use crate::details::ChainingDetails;
use crate::index::StrobemerIndex;
use crate::io::fasta::RefSequence;
use crate::mcsstrategy::McsStrategy;
use crate::read::Read;
use crate::seeding::randstrobes_query;

/// Non-overlapping approximate match
#[derive(Clone, Debug)]
pub struct Nam {
pub nam_id: usize,
pub struct Chain {
pub id: usize,
pub ref_start: usize,
pub ref_end: usize,
pub query_start: usize,
pub query_end: usize,
pub n_matches: usize,
pub n_anchors: usize,
pub matching_bases: usize,
pub ref_id: usize,
pub score: f32,
pub is_revcomp: bool,
pub anchors: Vec<Anchor>,
}

impl Nam {
impl Chain {
pub fn ref_span(&self) -> usize {
self.ref_end - self.ref_start
}
Expand All @@ -43,7 +42,7 @@ impl Nam {
self.ref_start.saturating_sub(self.query_start)
}

/// Returns whether a NAM represents a consistent match between read and
/// Returns whether a chain represents a consistent match between read and
/// reference by comparing the nucleotide sequences of the first and last
/// strobe (taking orientation into account).
pub fn is_consistent(&self, read: &Read, references: &[RefSequence], k: usize) -> bool {
Expand All @@ -62,11 +61,11 @@ impl Nam {
}
}

impl Display for Nam {
impl Display for Chain {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Nam(ref_id={}, query: {}..{}, ref: {}..{}, rc={}, score={})",
"Chain(ref_id={}, query: {}..{}, ref: {}..{}, rc={}, score={})",
self.ref_id,
self.query_start,
self.query_end,
Expand All @@ -79,30 +78,30 @@ impl Display for Nam {
}
}

/// Determine whether the NAM represents a match to the forward or
/// Determine whether the chain represents a match to the forward or
/// reverse-complemented sequence by checking in which orientation the
/// first and last strobe in the NAM match
/// first and last strobe in the chain match
///
/// - If first and last strobe match in forward orientation, return true.
/// - If first and last strobe match in reverse orientation, update the NAM
/// - If first and last strobe match in reverse orientation, update the chain
/// in place and return true.
/// - If first and last strobe do not match consistently, return false.
pub fn reverse_nam_if_needed(
nam: &mut Nam,
pub fn reverse_chain_if_needed(
chain: &mut Chain,
read: &Read,
references: &[RefSequence],
k: usize,
) -> bool {
let ref_start_kmer = &references[nam.ref_id].sequence[nam.ref_start..nam.ref_start + k];
let ref_end_kmer = &references[nam.ref_id].sequence[nam.ref_end - k..nam.ref_end];
let ref_start_kmer = &references[chain.ref_id].sequence[chain.ref_start..chain.ref_start + k];
let ref_end_kmer = &references[chain.ref_id].sequence[chain.ref_end - k..chain.ref_end];

let (seq, seq_rc) = if nam.is_revcomp {
let (seq, seq_rc) = if chain.is_revcomp {
(read.rc(), read.seq())
} else {
(read.seq(), read.rc())
};
let read_start_kmer = &seq[nam.query_start..nam.query_start + k];
let read_end_kmer = &seq[nam.query_end - k..nam.query_end];
let read_start_kmer = &seq[chain.query_start..chain.query_start + k];
let read_end_kmer = &seq[chain.query_end - k..chain.query_end];
if ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer {
return true;
}
Expand All @@ -111,32 +110,32 @@ pub fn reverse_nam_if_needed(
// we need two extra checks for this - hopefully this will remove all the false matches we see
// (true hash collisions should be very few)
let read_len = read.len();
let q_start_tmp = read_len - nam.query_end;
let q_end_tmp = read_len - nam.query_start;
// false reverse match, change coordinates in nam to forward
let q_start_tmp = read_len - chain.query_end;
let q_end_tmp = read_len - chain.query_start;
// false reverse match, change coordinates to forward
let read_start_kmer = &seq_rc[q_start_tmp..q_start_tmp + k];
let read_end_kmer = &seq_rc[q_end_tmp - k..q_end_tmp];
if ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer {
nam.is_revcomp = !nam.is_revcomp;
nam.query_start = q_start_tmp;
nam.query_end = q_end_tmp;
chain.is_revcomp = !chain.is_revcomp;
chain.query_start = q_start_tmp;
chain.query_end = q_end_tmp;
true
} else {
false
}
}

/// Obtain NAMs for a sequence record, doing rescue if needed.
/// Obtain chains for a sequence record, doing rescue if needed.
///
/// NAMs are returned sorted by decreasing score
pub fn get_nams_by_chaining(
/// The chains are returned sorted by decreasing score
pub fn get_chains(
sequence: &[u8],
index: &StrobemerIndex,
chainer: &Chainer,
rescue_distance: usize,
mcs_strategy: McsStrategy,
rng: &mut Rng,
) -> (NamDetails, Vec<Nam>) {
) -> (ChainingDetails, Vec<Chain>) {
let timer = Instant::now();
let query_randstrobes = randstrobes_query(sequence, &index.parameters);
let time_randstrobes = timer.elapsed().as_secs_f64();
Expand All @@ -147,44 +146,44 @@ pub fn get_nams_by_chaining(
query_randstrobes[1].len()
);

let (mut nam_details, mut nams) =
let (mut chain_details, mut chains) =
chainer.get_chains(&query_randstrobes, index, rescue_distance, mcs_strategy);

let timer = Instant::now();

nams.sort_by(|a, b| b.score.total_cmp(&a.score));
shuffle_top_nams(&mut nams, rng);
nam_details.time_sort_nams = timer.elapsed().as_secs_f64();
nam_details.time_randstrobes = time_randstrobes;
chains.sort_by(|a, b| b.score.total_cmp(&a.score));
shuffle_top_chains(&mut chains, rng);
chain_details.time_sort_chains = timer.elapsed().as_secs_f64();
chain_details.time_randstrobes = time_randstrobes;

if log::log_enabled!(Trace) {
trace!("Found {} NAMs", nams.len());
trace!("Found {} NAMs", chains.len());
let mut printed = 0;
for nam in &nams {
if nam.n_matches > 1 || printed < 10 {
trace!("- {}", nam);
for chain in &chains {
if chain.n_anchors > 1 || printed < 10 {
trace!("- {}", chain);
printed += 1;
}
}
if printed < nams.len() {
trace!("+ {} single-anchor chains", nams.len() - printed);
if printed < chains.len() {
trace!("+ {} single-anchor chains", chains.len() - printed);
}
}

(nam_details, nams)
(chain_details, chains)
}

/// Shuffle the top-scoring NAMs. Input must be sorted by score.
/// Shuffle the top-scoring chains. Input must be sorted by score.
/// This helps to ensure we pick a random location in case there are multiple
/// equally good ones.
fn shuffle_top_nams(nams: &mut [Nam], rng: &mut Rng) {
if let Some(best) = nams.first() {
fn shuffle_top_chains(chains: &mut [Chain], rng: &mut Rng) {
if let Some(best) = chains.first() {
let best_score = best.score;

let pos = nams.iter().position(|nam| nam.score != best_score);
let end = pos.unwrap_or(nams.len());
let pos = chains.iter().position(|chain| chain.score != best_score);
let end = pos.unwrap_or(chains.len());
if end > 1 {
rng.shuffle(&mut nams[0..end]);
rng.shuffle(&mut chains[0..end]);
}
}
}
25 changes: 12 additions & 13 deletions src/chainer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ use std::time::Instant;

use log::trace;

use crate::details::NamDetails;
use crate::chain::Chain;
use crate::details::ChainingDetails;
use crate::hit::{Hit, HitsDetails, find_hits};
use crate::index::StrobemerIndex;
use crate::mcsstrategy::McsStrategy;
use crate::nam::Nam;
use crate::seeding::QueryRandstrobe;

const N_PRECOMPUTED: usize = 1024;
Expand Down Expand Up @@ -155,7 +155,7 @@ impl Chainer {
index: &StrobemerIndex,
rescue_distance: usize,
mcs_strategy: McsStrategy,
) -> (NamDetails, Vec<Nam>) {
) -> (ChainingDetails, Vec<Chain>) {
let hits_timer = Instant::now();

let mut hits = [vec![], vec![]];
Expand Down Expand Up @@ -230,17 +230,17 @@ impl Chainer {
}
let mut hits_details12 = hits_details[0].clone();
hits_details12 += hits_details[1].clone();
let details = NamDetails {
let details = ChainingDetails {
hits: hits_details12,
n_reads: 1,
n_randstrobes: query_randstrobes[0].len() + query_randstrobes[1].len(),
n_anchors,
n_nams: chains.len(),
n_chains: chains.len(),
time_randstrobes: 0.0,
time_find_hits,
time_chaining,
time_rescue: 0.0,
time_sort_nams: 0f64,
time_sort_chains: 0f64,
};

(details, chains)
Expand Down Expand Up @@ -366,7 +366,7 @@ fn extract_chains_from_dp(
best_score: f32,
k: usize,
is_revcomp: bool,
chains: &mut Vec<Nam>,
chains: &mut Vec<Chain>,
chaining_parameters: &ChainingParameters,
) {
let n = anchors.len();
Expand All @@ -388,7 +388,6 @@ fn extract_chains_from_dp(
}

let mut j = i;
let mut c = 1;
let mut overlaps = false;
let mut chain_anchors = vec![anchors[i]];

Expand All @@ -403,7 +402,6 @@ fn extract_chains_from_dp(
}
chain_anchors.push(anchors[j]);
used[j] = true;
c += 1;

matching_bases += ref_coverage.saturating_sub(anchors[j].ref_start).min(k);
ref_coverage = anchors[j].ref_start;
Expand All @@ -416,16 +414,17 @@ fn extract_chains_from_dp(
let first = &anchors[j];
let last = &anchors[i];

chains.push(Nam {
nam_id: chains.len(),
let n_anchors = chain_anchors.len();
chains.push(Chain {
id: chains.len(),
query_start: first.query_start,
query_end: last.query_start + k,
ref_start: first.ref_start,
ref_end: last.ref_start + k,
n_matches: c,
n_anchors,
matching_bases,
ref_id: last.ref_id,
score: score + c as f32 * chaining_parameters.matches_weight,
score: score + n_anchors as f32 * chaining_parameters.matches_weight,
is_revcomp,
anchors: chain_anchors,
});
Expand Down
30 changes: 15 additions & 15 deletions src/details.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::ops;
use crate::hit::HitsDetails;

#[derive(Default, Debug, Clone)]
pub struct NamDetails {
pub struct ChainingDetails {
// TODO should be moved out of here and into Details
pub hits: HitsDetails,

Expand All @@ -15,37 +15,37 @@ pub struct NamDetails {

pub n_anchors: usize,

/// Number of NAMs found
pub n_nams: usize,
/// Number of chains found
pub n_chains: usize,

pub time_randstrobes: f64,
pub time_find_hits: f64,
pub time_chaining: f64,
pub time_rescue: f64,
pub time_sort_nams: f64,
pub time_sort_chains: f64,
}

impl ops::AddAssign<NamDetails> for NamDetails {
fn add_assign(&mut self, rhs: NamDetails) {
impl ops::AddAssign<ChainingDetails> for ChainingDetails {
fn add_assign(&mut self, rhs: ChainingDetails) {
self.hits += rhs.hits;
self.n_reads += rhs.n_reads;
self.n_randstrobes += rhs.n_randstrobes;
self.n_anchors += rhs.n_anchors;
self.n_nams += rhs.n_nams;
self.n_chains += rhs.n_chains;
self.time_randstrobes += rhs.time_randstrobes;
self.time_find_hits += rhs.time_find_hits;
self.time_chaining += rhs.time_chaining;
self.time_rescue += rhs.time_rescue;
self.time_sort_nams += rhs.time_sort_nams;
self.time_sort_chains += rhs.time_sort_chains;
}
}

/// Details about aligning a single read
#[derive(Default, Debug, Clone)]
pub struct Details {
pub nam: NamDetails,
pub chain: ChainingDetails,

pub inconsistent_nams: usize,
pub inconsistent_chains: usize,

/// No. of times rescue by local alignment was attempted
pub mate_rescue: usize,
Expand All @@ -64,8 +64,8 @@ pub struct Details {

impl ops::AddAssign<Details> for Details {
fn add_assign(&mut self, rhs: Details) {
self.nam += rhs.nam;
self.inconsistent_nams += rhs.inconsistent_nams;
self.chain += rhs.chain;
self.inconsistent_chains += rhs.inconsistent_chains;
self.mate_rescue += rhs.mate_rescue;
self.tried_alignment += rhs.tried_alignment;
self.gapped += rhs.gapped;
Expand All @@ -80,10 +80,10 @@ impl Details {
}
}

impl From<NamDetails> for Details {
fn from(nam_details: NamDetails) -> Self {
impl From<ChainingDetails> for Details {
fn from(chain_details: ChainingDetails) -> Self {
Details {
nam: nam_details,
chain: chain_details,
..Details::default()
}
}
Expand Down
Loading
Loading