Skip to content

Commit 909ae05

Browse files
committed
add max_var_ratio_per_read
1 parent 9d5e281 commit 909ae05

File tree

6 files changed

+212
-78
lines changed

6 files changed

+212
-78
lines changed

src/align.c

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ int collect_partial_aln_beg_end(int gap_aln, int a, int b, int q, int e, int q2,
519519
return ret;
520520
}
521521

522-
int abpoa_partial_aln_msa_cons(const call_var_opt_t *opt, abpoa_t *ab, int wb, int n_reads, int *read_ids, uint8_t **read_seqs, int *read_lens, int *read_full_cover, char **names,
522+
int abpoa_partial_aln_msa_cons(const call_var_opt_t *opt, abpoa_t *ab, int wb, int n_reads, int *read_ids, uint8_t **read_seqs, uint8_t **read_quals, int *read_lens, int *read_full_cover, char **names,
523523
int max_n_cons, int *cons_lens, uint8_t **cons_seqs, int *clu_n_seqs, int **clu_read_ids, int *msa_seq_lens, uint8_t **msa_seqs) {
524524
// abpoa_t *ab = abpoa_init();
525525
int needs_free_ab = 0;
@@ -546,7 +546,16 @@ int abpoa_partial_aln_msa_cons(const call_var_opt_t *opt, abpoa_t *ab, int wb, i
546546
ab->abs->n_seq = n_reads;
547547
for (int i = 0; i < n_reads; ++i) {
548548
if (LONGCALLD_VERBOSE >= 2) {
549-
fprintf(stderr, ">%s %d %d\n", names[i], read_lens[i], read_full_cover[i]);
549+
int min_qual = 100, min_i=-1, ave_qual = 0;
550+
for (int j = 0; j < read_lens[i]; ++j) {
551+
if (read_quals[i][j] < min_qual) {
552+
min_qual = read_quals[i][j];
553+
min_i = j;
554+
}
555+
ave_qual += read_quals[i][j];
556+
}
557+
ave_qual /= (read_lens[i]);
558+
fprintf(stderr, ">%s %d %d qual: %d(%d) %d\n", names[i], read_lens[i], read_full_cover[i], min_qual,min_i, ave_qual);
550559
for (int j = 0; j < read_lens[i]; ++j) {
551560
fprintf(stderr, "%c", "ACGTN"[read_seqs[i][j]]);
552561
} fprintf(stderr, "\n");
@@ -988,7 +997,7 @@ hts_pos_t collect_phase_set_with_both_haps(int n_reads, int *read_haps, int *rea
988997
// 1. ref vs cons: 1
989998
// 2. cons vs n_reads: n_reads
990999
// 3. ref vs n_reads: n_reads
991-
int wfa_collect_noisy_aln_str_with_ps_hap(const call_var_opt_t *opt, int n_reads, int *noisy_read_ids, int *lens, uint8_t **seqs, char **names,
1000+
int wfa_collect_noisy_aln_str_with_ps_hap(const call_var_opt_t *opt, int n_reads, int *noisy_read_ids, int *lens, uint8_t **seqs, uint8_t **quals, char **names,
9921001
int *haps, hts_pos_t *phase_sets, int *fully_covers, hts_pos_t ps, uint8_t *ref_seq, int ref_seq_len,
9931002
int *clu_n_seqs, int **clu_read_ids, aln_str_t **aln_strs) {
9941003
// given specific PS, collect consensus sequences for each haplotype
@@ -997,6 +1006,7 @@ int wfa_collect_noisy_aln_str_with_ps_hap(const call_var_opt_t *opt, int n_reads
9971006
int *ps_hap_read_ids = (int*)malloc(total_n_reads * sizeof(int));
9981007
int *ps_hap_read_lens = (int*)malloc(total_n_reads * sizeof(int));
9991008
uint8_t **ps_hap_read_seqs = (uint8_t**)malloc(total_n_reads * sizeof(uint8_t*));
1009+
uint8_t **ps_hap_read_quals = (uint8_t**)malloc(total_n_reads * sizeof(uint8_t*));
10001010
char **ps_hap_read_names = (char**)malloc(total_n_reads * sizeof(char*));
10011011
int *ps_hap_full_covers = (int*)calloc(total_n_reads, sizeof(int));
10021012
int *cons_lens = (int*)malloc(2 * sizeof(int)); uint8_t **cons_seqs = (uint8_t**)malloc(2 * sizeof(uint8_t*));
@@ -1008,6 +1018,7 @@ int wfa_collect_noisy_aln_str_with_ps_hap(const call_var_opt_t *opt, int n_reads
10081018
ps_hap_read_ids[n_ps_hap_reads] = noisy_read_ids[i];
10091019
ps_hap_read_lens[n_ps_hap_reads] = lens[i];
10101020
ps_hap_read_seqs[n_ps_hap_reads] = seqs[i];
1021+
ps_hap_read_quals[n_ps_hap_reads] = quals[i];
10111022
ps_hap_full_covers[n_ps_hap_reads] = fully_covers[i];
10121023
ps_hap_read_names[n_ps_hap_reads] = names[i];
10131024
n_ps_hap_reads++;
@@ -1016,7 +1027,7 @@ int wfa_collect_noisy_aln_str_with_ps_hap(const call_var_opt_t *opt, int n_reads
10161027
if (LONGCALLD_VERBOSE >=2 ) fprintf(stderr, "PS: %ld HAP: %d n_reads: %d\n", ps, hap, n_ps_hap_reads);
10171028
if (n_ps_hap_reads == 0) continue;
10181029
// collect consensus sequences
1019-
n_cons += abpoa_partial_aln_msa_cons(opt, NULL, 10, n_ps_hap_reads, ps_hap_read_ids, ps_hap_read_seqs, ps_hap_read_lens, ps_hap_full_covers, ps_hap_read_names,
1030+
n_cons += abpoa_partial_aln_msa_cons(opt, NULL, 10, n_ps_hap_reads, ps_hap_read_ids, ps_hap_read_seqs, ps_hap_read_quals, ps_hap_read_lens, ps_hap_full_covers, ps_hap_read_names,
10201031
1, cons_lens+hap-1, cons_seqs+hap-1, clu_n_seqs+hap-1, clu_read_ids+hap-1, NULL, NULL);
10211032
}
10221033

@@ -1061,16 +1072,18 @@ int wfa_collect_noisy_aln_str_with_ps_hap(const call_var_opt_t *opt, int n_reads
10611072
if (n_ps_hap_reads == 0) continue;
10621073
}
10631074
}
1064-
free(ps_hap_read_ids); free(ps_hap_read_lens); free(ps_hap_read_seqs); free(ps_hap_full_covers); free(ps_hap_read_names);
1075+
free(ps_hap_read_ids); free(ps_hap_read_lens); free(ps_hap_read_seqs); free(ps_hap_read_quals); free(ps_hap_full_covers); free(ps_hap_read_names);
10651076
for (int i = 0; i < 2; ++i) {
10661077
if (cons_seqs[i] != NULL) free(cons_seqs[i]);
10671078
} free(cons_lens); free(cons_seqs);
10681079
return n_cons;
10691080
}
10701081

1071-
int collect_noisy_read_info(bam_chunk_t *chunk, hts_pos_t reg_beg, hts_pos_t reg_end, int noisy_reg_i, int n_noisy_reg_reads, int *noisy_reg_reads, int **read_lens, uint8_t ***read_seqs, char ***read_names, int **fully_covers, int **read_haps, hts_pos_t **phase_sets) {
1082+
int collect_noisy_read_info(bam_chunk_t *chunk, hts_pos_t reg_beg, hts_pos_t reg_end, int noisy_reg_i, int n_noisy_reg_reads, int *noisy_reg_reads, int **read_lens,
1083+
uint8_t ***read_seqs, uint8_t ***read_quals, char ***read_names, int **fully_covers, int **read_haps, hts_pos_t **phase_sets) {
10721084
*read_lens = (int*)calloc(n_noisy_reg_reads, sizeof(int));
10731085
*read_seqs = (uint8_t**)malloc(n_noisy_reg_reads * sizeof(uint8_t*));
1086+
*read_quals = (uint8_t**)malloc(n_noisy_reg_reads * sizeof(uint8_t*));
10741087
*read_names = (char**)malloc(n_noisy_reg_reads * sizeof(char*));
10751088
*fully_covers = (int*)calloc(n_noisy_reg_reads, sizeof(int));
10761089
*read_haps = (int*)calloc(n_noisy_reg_reads, sizeof(int));
@@ -1115,8 +1128,10 @@ int collect_noisy_read_info(bam_chunk_t *chunk, hts_pos_t reg_beg, hts_pos_t reg
11151128
else (*fully_covers)[i] = 0;
11161129
// if (2*(reg_read_end-reg_read_beg+1) < (reg_end-reg_beg+1)) return 0;
11171130
(*read_seqs)[i] = (uint8_t*)malloc((reg_read_end - reg_read_beg + 1) * sizeof(uint8_t));
1131+
(*read_quals)[i] = (uint8_t*)malloc((reg_read_end - reg_read_beg + 1) * sizeof(uint8_t));
11181132
for (int j = reg_read_beg; j <= reg_read_end; ++j) {
11191133
(*read_seqs)[i][j-reg_read_beg] = seq_nt16_int[bam_seqi(read_digars->bseq, j)];
1134+
(*read_quals)[i][j-reg_read_beg] = read_digars->qual[j];
11201135
}
11211136
(*read_lens)[i] = reg_read_end - reg_read_beg + 1;
11221137
(*read_haps)[i] = chunk->haps[read_i];
@@ -1131,8 +1146,8 @@ int collect_noisy_reg_aln_strs(const call_var_opt_t *opt, bam_chunk_t *chunk, ht
11311146
int *clu_n_seqs, int **clu_read_ids, aln_str_t **aln_strs) {
11321147
if (n_noisy_reg_reads <= 0) return 0;
11331148
// fully_cover: 0 -> none, 1 -> left, 2 -> right, 3 -> both
1134-
char **names = NULL; uint8_t **seqs = NULL; int *fully_covers = NULL, *lens = NULL, *haps = NULL; hts_pos_t *phase_sets = NULL;
1135-
collect_noisy_read_info(chunk, noisy_reg_beg, noisy_reg_end, noisy_reg_i, n_noisy_reg_reads, noisy_reads, &lens, &seqs, &names, &fully_covers, &haps, &phase_sets);
1149+
char **names = NULL; uint8_t **seqs = NULL; int *fully_covers = NULL, *lens = NULL, *haps = NULL; hts_pos_t *phase_sets = NULL; uint8_t **base_quals = NULL;
1150+
collect_noisy_read_info(chunk, noisy_reg_beg, noisy_reg_end, noisy_reg_i, n_noisy_reg_reads, noisy_reads, &lens, &seqs, &base_quals, &names, &fully_covers, &haps, &phase_sets);
11361151

11371152
sort_by_full_cover_and_length(n_noisy_reg_reads, noisy_reads, lens, seqs, fully_covers, names, haps, phase_sets);
11381153
// >= min_hap_full_read_count reads for each hap && >= min_hap_read_count reads (including not full-cover, but >= full-cover length) for each hap
@@ -1145,7 +1160,7 @@ int collect_noisy_reg_aln_strs(const call_var_opt_t *opt, bam_chunk_t *chunk, ht
11451160
// XXX only use fully-covered reads, including cliping reads (after re-align to backbone read)
11461161
// two cases to call consensus sequences
11471162
if (ps_with_both_haps > 0) { // call consensus sequences for each haplotype
1148-
n_cons = wfa_collect_noisy_aln_str_with_ps_hap(opt, n_noisy_reg_reads, noisy_reads, lens, seqs, names, haps, phase_sets, fully_covers, ps_with_both_haps,
1163+
n_cons = wfa_collect_noisy_aln_str_with_ps_hap(opt, n_noisy_reg_reads, noisy_reads, lens, seqs, base_quals, names, haps, phase_sets, fully_covers, ps_with_both_haps,
11491164
ref_seq, ref_seq_len, clu_n_seqs, clu_read_ids, aln_strs);
11501165
// >= min_no_hap_full_read_count full reads in total
11511166
} else if (ps_with_both_haps <= 0 && n_full_reads >= min_no_hap_full_read_count) {

src/assign_hap.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,9 @@ int assign_read_hap_phase_set_based_on_cons_alle(int read_i, hts_pos_t *phase_se
149149
int update_var_hap_to_cons_alle(cand_var_t *var, int var_cate, int hap) {
150150
if (hap == 0) return 0;
151151
int max_cov = 0, max_cov_alle_i = -1;
152-
int n_max_alt = 0;
153-
if (var_cate & (LONGCALLD_CLEAN_HET_SNP | LONGCALLD_CLEAN_HET_INDEL | LONGCALLD_NOISY_CAND_HET_VAR)) n_max_alt = 1;
154-
else if (var_cate & (LONGCALLD_CAND_HOM_VAR | LONGCALLD_NOISY_CAND_HOM_VAR)) n_max_alt = 2;
152+
// int n_max_alt = 0;
153+
// if (var_cate & (LONGCALLD_CLEAN_HET_SNP | LONGCALLD_CLEAN_HET_INDEL | LONGCALLD_NOISY_CAND_HET_VAR)) n_max_alt = 1;
154+
// else if (var_cate & (LONGCALLD_CAND_HOM_VAR | LONGCALLD_NOISY_CAND_HOM_VAR)) n_max_alt = 2;
155155

156156
for (int i = 0; i < var->n_uniq_alles; ++i) {
157157
if (var->hap_to_alle_profile[hap][i] > max_cov) { // prefer ref allele
@@ -221,13 +221,15 @@ int check_agree_haps(int read_i, int hap, cand_var_t *vars, int var1, int var2,
221221
}
222222

223223
// reads->haps/haps_to_cons_alle are UpToDate
224-
int update_var_hap_cons_phase_set(bam_chunk_t *chunk, int *var_idx, read_var_profile_t *p, cand_var_t *cand_vars, int n_cand_vars) {
224+
int update_var_hap_cons_phase_set(bam_chunk_t *chunk, int *var_idx, read_var_profile_t *p, cand_var_t *cand_vars, int n_cand_vars, int *var_i_to_cate) {
225225
int *het_var_idx = (int*)malloc(n_cand_vars * sizeof(int));
226226
int n_het_vars = 0;
227227
int *is_het = (int*)calloc(n_cand_vars, sizeof(int));
228228
for (int _var_i = 0; _var_i < n_cand_vars; ++_var_i) {
229229
int var_i = var_idx[_var_i];
230230
cand_var_t *var = cand_vars+var_i;
231+
// XXX only use clean het vars
232+
// if (var_i_to_cate[var_i] == LONGCALLD_NOISY_CAND_HOM_VAR || var_i_to_cate[var_i] == LONGCALLD_NOISY_CAND_HET_VAR) continue;
231233
if (var->hap_to_cons_alle[1] != -1 && var->hap_to_cons_alle[2] != -1
232234
&& var->hap_to_cons_alle[1] != var->hap_to_cons_alle[2]) {
233235
// fprintf(stderr, "is_het: %d %ld %d-%c-%d\n", var_i, var->pos, var->ref_len, BAM_CIGAR_STR[var->var_type], var->alt_len);
@@ -371,7 +373,7 @@ int assign_hap_based_on_het_vars_kmeans(bam_chunk_t *chunk, int target_var_cate,
371373
// for each var: assign reads covering the var to HAP1 or HAP2
372374
for (int _var_i = 0; _var_i < n_valid_vars; ++_var_i) {
373375
int var_i = valid_var_idx[var_ii[_var_i]];
374-
if (var_i_to_cate[var_i] == LONGCALLD_NOISY_CAND_HOM_VAR || var_i_to_cate[var_i] == LONGCALLD_CAND_HOM_VAR)
376+
if (var_i_to_cate[var_i] == LONGCALLD_NOISY_CAND_HOM_VAR || var_i_to_cate[var_i] == LONGCALLD_CAND_HOM_VAR) // || var_i_to_cate[var_i] == LONGCALLD_NOISY_CAND_HET_VAR)
375377
continue;
376378
cand_var_t *var = cand_vars+var_i;
377379
if (LONGCALLD_VERBOSE >= 2)
@@ -408,7 +410,7 @@ int assign_hap_based_on_het_vars_kmeans(bam_chunk_t *chunk, int target_var_cate,
408410
if (LONGCALLD_VERBOSE >= 2) fprintf(stderr, "iter: %d\n", i_iter);
409411
// XXX var-wise loop update PS
410412
// read-wise loop: update read_hap
411-
int changed_hap1 = update_var_hap_cons_phase_set(chunk, valid_var_idx, p, cand_vars, n_valid_vars);
413+
int changed_hap1 = update_var_hap_cons_phase_set(chunk, valid_var_idx, p, cand_vars, n_valid_vars, var_i_to_cate);
412414
int changed_hap2 = update_hap_to_cons_alle(chunk, valid_var_idx, n_valid_vars, p, cand_vars, var_i_to_cate, target_var_cate);
413415
// int changed_hap2 = update_read_hap_phase_set(chunk, valid_var_idx, n_valid_vars, p, cand_vars, var_i_to_cate, target_var_cate);
414416
if (changed_hap1 == 0 && changed_hap2 == 0) break;

0 commit comments

Comments
 (0)