Skip to content

Commit c1003a2

Browse files
committed
new merge noisy regs: merge_dis <- min{max{ref1, read1}, max{ref2, read2}}
1 parent d425467 commit c1003a2

File tree

7 files changed

+506
-381
lines changed

7 files changed

+506
-381
lines changed

src/align.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1550,7 +1550,8 @@ int collect_noisy_reg_msa_cons(const call_var_opt_t *opt, bam_chunk_t *chunk, ht
15501550

15511551
sort_by_full_cover_and_length(n_noisy_reg_reads, noisy_reads, lens, seqs, fully_covers, names, haps, phase_sets);
15521552
// at least 5 reads for each hap
1553-
hts_pos_t ps_with_both_haps = collect_phase_set_with_both_haps(n_noisy_reg_reads, haps, phase_sets, fully_covers, 5);
1553+
int min_hap_full_read_count = opt->min_hap_full_reads, min_no_hap_full_read_count = opt->min_no_hap_full_reads;
1554+
hts_pos_t ps_with_both_haps = collect_phase_set_with_both_haps(n_noisy_reg_reads, haps, phase_sets, fully_covers, min_hap_full_read_count);
15541555
int n_full_reads = 0;
15551556
for (int i = 0; i < n_noisy_reg_reads; ++i) if (fully_covers[i] == 3) n_full_reads++;
15561557
int n_cons = 0;
@@ -1562,7 +1563,7 @@ int collect_noisy_reg_msa_cons(const call_var_opt_t *opt, bam_chunk_t *chunk, ht
15621563
ref_seq, ref_seq_len,
15631564
cons_lens, cons_seqs, clu_n_seqs, clu_read_ids, msa_len, msa_seqs);
15641565
// } else if (n_full_cover_reads > n_noisy_reg_reads * 0.75 && reg_end - reg_beg + 1 <= 10000) { // de novo consensus calling using all reads, up to 2 consensus sequences
1565-
} else if (ps_with_both_haps <= 0 && n_full_reads >= 10) { // XXX >= 10
1566+
} else if (ps_with_both_haps <= 0 && n_full_reads >= min_no_hap_full_read_count) {
15661567
// fprintf(stderr, "De nove calling region: %s:%ld-%ld %ld %d (all)\n", chunk->tname, reg_beg, reg_end, reg_end-reg_beg+1, n_noisy_reg_reads);
15671568
n_cons = collect_noisy_msa_cons_no_ps_hap(opt, n_noisy_reg_reads, noisy_reads, lens, seqs, names, fully_covers, ref_seq, ref_seq_len,
15681569
cons_lens, cons_seqs, clu_n_seqs, clu_read_ids, msa_len, msa_seqs);

src/assign_hap.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ int update_var_aln_hap1(int target_read_i, int cur_hap, bam_chunk_t *chunk, rea
255255
if (var->var_type == BAM_CDIFF) var_weight = 2;
256256
else var_weight = 1;
257257
} else if (var_i_to_cate[var_i] == LONGCALLD_CLEAN_HET_VAR) {
258-
if (var->var_type == BAM_CDIFF) var_weight = 100;
258+
if (var->var_type == BAM_CDIFF) var_weight = 4;
259259
else var_weight = 3;
260260
}
261261

src/bam_utils.c

Lines changed: 106 additions & 83 deletions
Large diffs are not rendered by default.

src/call_var_main.c

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -32,18 +32,25 @@ const struct option call_var_opt [] = {
3232
{ "out-vcf", 1, NULL, 'o'},
3333
{ "out-bam", 1, NULL, 'b' },
3434
{ "no-vcf-header", 0, NULL, 'H'},
35+
{ "gap-aln", 1, NULL, 'g'},
3536
// { "out-bam", 1, NULL, 'b' },
3637
{ "sample-name", 1, NULL, 'n'},
38+
3739
{ "min-depth", 1, NULL, 'd' },
40+
{ "min-allele-freq", 1, NULL, 'a' },
3841
{ "min-alt-depth", 1, NULL, 'D' },
39-
{ "max-ploidy", 1, NULL, 'p' },
42+
// { "max-ploidy", 1, NULL, 'p' },
43+
4044
{ "max-xgap", 1, NULL, 'x' },
4145
{ "win-size", 1, NULL, 'w'},
46+
{ "noisy-rat", 1, NULL, 'j' },
4247
{ "noisy-flank", 1, NULL, 'f' },
4348
{ "end-clip", 1, NULL, 'c' },
4449
{ "clip-flank", 1, NULL, 'F' },
45-
{ "gap-aln", 1, NULL, 'g'},
50+
{ "hap-reads", 1, NULL, 'p' },
51+
{ "full-reads", 1, NULL, 'f' },
4652
{ "no-re-aln", 1, NULL, 'N'},
53+
4754
{ "threads", 1, NULL, 't' },
4855
{ "help", 0, NULL, 'h' },
4956
{ "version", 0, NULL, 'v' },
@@ -87,9 +94,9 @@ call_var_opt_t *call_var_init_para(void) {
8794
// opt->noisy_reg_merge_win = LONGCALLD_NOISY_REG_MERGE_WIN;
8895
opt->noisy_reg_flank_len = LONGCALLD_NOISY_REG_FLANK_LEN;
8996

90-
opt->dens_reg_max_xgaps = LONGCALLD_NOISY_REG_MAX_XGAPS;
91-
opt->dens_reg_slide_win = LONGCALLD_NOISY_REG_SLIDE_WIN;
92-
opt->dens_reg_flank_win = LONGCALLD_NOISY_FLANK_WIN;
97+
opt->noisy_reg_max_xgaps = LONGCALLD_NOISY_REG_MAX_XGAPS;
98+
opt->noisy_reg_slide_win = LONGCALLD_NOISY_REG_SLIDE_WIN;
99+
opt->noisy_reg_flank_win = LONGCALLD_NOISY_FLANK_WIN;
93100
opt->indel_flank_win = LONGCALLD_INDEL_FLANK_WIN;
94101

95102
opt->end_clip_reg = LONGCALLD_NOISY_END_CLIP;
@@ -99,11 +106,13 @@ call_var_opt_t *call_var_init_para(void) {
99106
opt->max_noisy_reg_len = LONGCALLD_MAX_NOISY_REG_LEN;
100107
opt->min_noisy_reg_reads = LONGCALLD_NOISY_REG_READS;
101108
opt->min_noisy_reg_ratio = LONGCALLD_NOISY_REG_RATIO;
109+
opt->max_noisy_frac_per_read = LONGCALLD_MAX_NOISY_FRAC_PER_READ;
110+
opt->min_hap_full_reads = LONGCALLD_MIN_HAP_FULL_READS;
111+
opt->min_no_hap_full_reads = LONGCALLD_MIN_NO_HAP_FULL_READS;
102112

103113
opt->min_somatic_af = LONGCALLD_MIN_SOMATIC_AF;
104114
opt->min_af = LONGCALLD_MIN_CAND_AF;
105115
opt->max_af = LONGCALLD_MAX_CAND_AF;
106-
opt->max_low_qual_frac = LONGCALLD_MAX_LOW_QUAL_FRAC;
107116

108117
opt->match = LONGCALLD_MATCH_SCORE;
109118
opt->mismatch = LONGCALLD_MISMATCH_SCORE;
@@ -385,28 +394,33 @@ static void call_var_usage(void) {//main usage
385394
fprintf(stderr, " -H --no-vcf-header do NOT output VCF header\n");
386395
fprintf(stderr, " --amb-base output variant with ambiguous base [False]\n");
387396
fprintf(stderr, " -b --out-bam STR output phased BAM file [NULL]\n");
397+
fprintf(stderr, " -g --gap-aln STR put gap on the \'left\' or \'right\' side in alignment [left/l]\n");
398+
fprintf(stderr, " \'left\': ATTTG\n");
399+
fprintf(stderr, " | |||\n");
400+
fprintf(stderr, " A-TTG\n");
401+
fprintf(stderr, " \'right\': ATTTG\n");
402+
fprintf(stderr, " ||| |\n");
403+
fprintf(stderr, " ATT-G\n");
388404
// fprintf(stderr, "\n");
389-
fprintf(stderr, " Variant:\n");
405+
fprintf(stderr, " Variant calling:\n");
390406
fprintf(stderr, " -d --min-depth INT min. depth to call a variant [%d]\n", LONGCALLD_MIN_CAND_DP);
391407
fprintf(stderr, " -D --alt-depth INT min. alt. depth to call a variant [%d]\n", LONGCALLD_MIN_ALT_DP);
408+
fprintf(stderr, " -a --min-af FLOAT min. allele frequency to call a variant [%.2f]\n", LONGCALLD_MIN_CAND_AF);
392409
// fprintf(stderr, " -p --max-ploidy INT max. ploidy [%d]\n", LONGCALLD_DEF_PLOID);
393-
fprintf(stderr, " -x --max-xgap INT max. number of allowed substitutions/gap-bases in a searching window(-w/--win-size) [%d]\n", LONGCALLD_NOISY_REG_MAX_XGAPS);
410+
fprintf(stderr, " Noisy regions:\n");
411+
fprintf(stderr, " -x --max-xgap INT max. number of allowed substitutions/gap-bases in a sliding window(-w/--win-size) [%d]\n", LONGCALLD_NOISY_REG_MAX_XGAPS);
394412
fprintf(stderr, " window with more than -x subs/gap-bases will be considered as noisy region\n");
395413
fprintf(stderr, " -w --win-size INT window size for searching noisy region [%d]\n", LONGCALLD_NOISY_REG_SLIDE_WIN);
414+
fprintf(stderr, " -j --noisy-rat FLOAT min. ratio of noisy reads in a window to call a noisy region [%.2f]\n", LONGCALLD_NOISY_REG_RATIO);
396415
// fprintf(stderr, " -f --noisy-flank INT flanking mask window size for noisy region [%d]\n", LONGCALLD_DENSE_FLANK_WIN);
397-
fprintf(stderr, " -c --end-clip INT max. number of clipping bases on both ends [%d]\n", LONGCALLD_NOISY_END_CLIP);
398-
fprintf(stderr, " end-clipping region with more than -c bases will be considered as noisy clipping region\n");
399-
fprintf(stderr, " -F --clip-flank INT flanking mask window size for noisy clipping region [%d]\n", LONGCALLD_NOISY_END_CLIP_WIN);
416+
// fprintf(stderr, " -c --end-clip INT max. number of clipping bases on both ends [%d]\n", LONGCALLD_NOISY_END_CLIP);
417+
// fprintf(stderr, " end-clipping region with more than -c bases will be considered as noisy clipping region\n");
418+
// fprintf(stderr, " -F --clip-flank INT flanking mask window size for noisy clipping region [%d]\n", LONGCALLD_NOISY_END_CLIP_WIN);
400419
// fprintf(stderr, "\n");
401-
fprintf(stderr, " Alignment\n");
402-
fprintf(stderr, " -g --gap-aln STR put gap on the \'left\' or \'right\' side in alignment [left/l]\n");
403-
fprintf(stderr, " \'left\': ATTTG\n");
404-
fprintf(stderr, " | |||\n");
405-
fprintf(stderr, " A-TTG\n");
406-
fprintf(stderr, " \'right\': ATTTG\n");
407-
fprintf(stderr, " ||| |\n");
408-
fprintf(stderr, " ATT-G\n");
409-
fprintf(stderr, " -N --no-re-aln disable read re-alignment\n");
420+
fprintf(stderr, " -p --hap-reads INT when haplotype is available, min. number of full-spanning reads for each haplotype in noisy region to call a variant [%d]\n", LONGCALLD_MIN_HAP_FULL_READS);
421+
fprintf(stderr, " -f --full-reads INT when haplotype is not available, min. number of full-spanning reads in noisy region to call a variant [%d]\n", LONGCALLD_MIN_NO_HAP_FULL_READS);
422+
423+
// fprintf(stderr, " -N --no-re-aln disable read re-alignment\n");
410424
// fprintf(stderr, "\n");
411425
fprintf(stderr, " General:\n");
412426
fprintf(stderr, " -t --threads INT number of threads to use [%d]\n", MIN_OF_TWO(CALL_VAR_THREAD_N, get_num_processors()));
@@ -423,7 +437,7 @@ int call_var_main(int argc, char *argv[]) {
423437
// _err_cmd("%s\n", CMD);
424438
int c, op_idx; call_var_opt_t *opt = call_var_init_para();
425439
double realtime0 = realtime();
426-
while ((c = getopt_long(argc, argv, "r:o:Hb:d:D:n:x:w:f:F:c:a:Nt:hvV:", call_var_opt, &op_idx)) >= 0) {
440+
while ((c = getopt_long(argc, argv, "r:o:Hb:d:D:a:n:x:w:j:f:p:g:Nt:hvV:", call_var_opt, &op_idx)) >= 0) {
427441
switch(c) {
428442
case 'r': opt->ref_fa_fn = strdup(optarg); break;
429443
// case 'b': cgp->var_block_size = atoi(optarg); break;
@@ -433,13 +447,17 @@ int call_var_main(int argc, char *argv[]) {
433447
case 'b': opt->out_bam = hts_open(optarg, "wb"); break;
434448
case 'd': opt->min_dp = atoi(optarg); break;
435449
case 'D': opt->min_alt_dp = atoi(optarg); break;
450+
case 'a': opt->min_af = atof(optarg); break;
436451
case 'n': opt->sample_name = strdup(optarg); break;
437-
case 'x': opt->dens_reg_max_xgaps = atoi(optarg); break;
438-
case 'w': opt->dens_reg_slide_win = atoi(optarg); break;
439-
case 'f': opt->dens_reg_flank_win = atoi(optarg); break;
440-
case 'c': opt->end_clip_reg = atoi(optarg); break;
441-
case 'F': opt->end_clip_reg_flank_win = atoi(optarg); break;
442-
case 'a': if (strcmp(optarg, "right") == 0 || strcmp(optarg, "r") == 0) opt->gap_aln = LONGCALLD_GAP_RIGHT_ALN;
452+
case 'x': opt->noisy_reg_max_xgaps = atoi(optarg); break;
453+
case 'w': opt->noisy_reg_slide_win = atoi(optarg); break;
454+
case 'p': opt->min_hap_full_reads = atoi(optarg); break;
455+
case 'f': opt->min_no_hap_full_reads = atoi(optarg); break;
456+
case 'j': opt->min_noisy_reg_ratio = atof(optarg); break;
457+
// case 'f': opt->dens_reg_flank_win = atoi(optarg); break;
458+
// case 'c': opt->end_clip_reg = atoi(optarg); break;
459+
// case 'F': opt->end_clip_reg_flank_win = atoi(optarg); break;
460+
case 'g': if (strcmp(optarg, "right") == 0 || strcmp(optarg, "r") == 0) opt->gap_aln = LONGCALLD_GAP_RIGHT_ALN;
443461
else if (strcmp(optarg, "left") == 0 || strcmp(optarg, "l") == 0) opt->gap_aln = LONGCALLD_GAP_LEFT_ALN;
444462
else _err_error_exit("\'-a/--gap-aln\' can only be \'left\'/\'l\' or \'right\'/\'r\'\n"); // call_var_usage();
445463
case 'N': opt->disable_read_realign = 1; break;

src/call_var_main.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,30 +20,33 @@
2020
#define LONGCALLD_MIN_SOMATIC_AF 0.05 // AF < 0.05: filtered out, 0.05~0.25: candidate somatic
2121
#define LONGCALLD_MIN_CAND_AF 0.25 // AF < 0.25: not germline het.
2222
#define LONGCALLD_MAX_CAND_AF 0.75 // AF > 0.75: not germline het.
23-
#define LONGCALLD_MAX_LOW_QUAL_FRAC 0.20 // LOW_FRAC > 0.25: low-qual
2423
#define LONGCALLD_DEF_PLOID 2 // diploid
2524
#define LONGCALLD_REF_ALLELE 0
2625
#define LONGCALLD_ALT_ALLELE1 1
2726
#define LONGCALLD_ALT_ALLELE2 2
2827

29-
#define LONGCALLD_MAX_NOISY_REG_LEN 50000 // >50kb noisy region will be skipped
3028

3129
// #define LONGCALLD_NOISY_REG_MERGE_WIN 100 // dynamically set merge_win based on insertion size
3230
#define LONGCALLD_NOISY_REG_FLANK_LEN 10 // 10-bp flanking region for both ends of noisy region
3331

3432
#define LONGCALLD_NOISY_REG_MAX_XGAPS 5 // or 10; dense X/gap region: more than n X/gap bases in a 100-bp window
3533
#define LONGCALLD_NOISY_REG_SLIDE_WIN 100 //
34+
#define LONGCALLD_MAX_NOISY_FRAC_PER_READ 0.8 // skip reads with more than 80% bases in noisy region
3635
#define LONGCALLD_NOISY_FLANK_WIN 0 // 25: 100/(3+1)
3736
#define LONGCALLD_NOISY_END_CLIP 100 // >= n bp clipping on both ends
3837
#define LONGCALLD_NOISY_END_CLIP_WIN 100 // n bp flanking end-clipping region will be considered as low-quality region
3938
#define LONGCALLD_INDEL_FLANK_WIN 0 // n bp around indel will be considered as low-quality region
4039

40+
#define LONGCALLD_MAX_NOISY_REG_LEN 50000 // >50kb noisy region will be skipped
41+
#define LONGCALLD_NOISY_REG_READS 5 // >= 5 reads supporting noisy region
42+
#define LONGCALLD_NOISY_REG_RATIO 0.25 // >= 25% reads supporting noisy region
43+
44+
#define LONGCALLD_MIN_HAP_FULL_READS 5 // full read supporting each haplotype
45+
#define LONGCALLD_MIN_NO_HAP_FULL_READS 10 // total full reads in noisy region
4146
// for sdust
4247
#define LONGCALLD_SDUST_T 5
4348
#define LONGCALLD_SDUST_W 20
4449

45-
#define LONGCALLD_NOISY_REG_READS 5 // >= 5 reads supporting noisy region
46-
#define LONGCALLD_NOISY_REG_RATIO 0.20 // >= 25% reads supporting noisy region
4750

4851

4952
#define LONGCALLD_MAX_READ_DEPTH 500 // vars with >500 reads will be skipped
@@ -89,16 +92,18 @@ typedef struct call_var_opt_t {
8992
char *region_list; uint8_t region_is_file; // for -R/--region/--region-file option
9093
// filters for variant calling
9194
int max_ploid, min_mq, min_bq, min_dp, min_alt_dp;
92-
double min_af, max_af, min_somatic_af, max_low_qual_frac;
93-
int dens_reg_max_xgaps, dens_reg_slide_win, dens_reg_flank_win;
95+
double min_af, max_af, min_somatic_af;
96+
int noisy_reg_max_xgaps, noisy_reg_slide_win, noisy_reg_flank_win;
9497
// skip read if contans a noisy region with 1) >= min_non_low_comp_noisy_reg_ratio non-low-comp XIDs, and
9598
// 2) >= min_non_low_comp_noisy_reg_size bps
9699
// int min_non_low_comp_noisy_reg_size; float min_non_low_comp_noisy_reg_ratio; int min_non_low_comp_noisy_reg_XID;
97100
int indel_flank_win;
98101
int end_clip_reg, end_clip_reg_flank_win;
99102
int noisy_reg_flank_len; // noisy_reg_merge_win; // for re-alignment
100103
// filters for noisy region, i.e., coverage/ratio
101-
int max_noisy_reg_reads, max_noisy_reg_len, min_noisy_reg_reads; float min_noisy_reg_ratio;
104+
int max_noisy_reg_reads, max_noisy_reg_len, min_noisy_reg_reads;
105+
float max_noisy_frac_per_read, min_noisy_reg_ratio;
106+
int min_hap_full_reads, min_no_hap_full_reads;
102107
// alignment
103108
int match, mismatch, gap_open1, gap_ext1, gap_open2, gap_ext2;
104109
int gap_aln; // default: 1: left (minimap2, abpoa), 2: right (wfa2)

0 commit comments

Comments
 (0)