Skip to content

Commit 0c1159f

Browse files
authored
Add UMI support to FASTQ input/output. (#1960)
1 parent 4838906 commit 0c1159f

File tree

6 files changed

+196
-0
lines changed

6 files changed

+196
-0
lines changed

hts.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1208,6 +1208,14 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) {
12081208
strcmp(o->arg, "FASTQ_NAME2") == 0)
12091209
o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
12101210

1211+
else if (strcmp(o->arg, "fastq_umi") == 0 ||
1212+
strcmp(o->arg, "FASTQ_UMI") == 0)
1213+
o->opt = FASTQ_OPT_UMI, o->val.s = val;
1214+
1215+
else if (strcmp(o->arg, "fastq_umi_regex") == 0 ||
1216+
strcmp(o->arg, "FASTQ_UMI_REGEX") == 0)
1217+
o->opt = FASTQ_OPT_UMI_REGEX, o->val.s = val;
1218+
12111219
else {
12121220
hts_log_error("Unknown option '%s'", o->arg);
12131221
free(o->arg);
@@ -1250,6 +1258,8 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) {
12501258
case HTS_OPT_FILTER:
12511259
case FASTQ_OPT_AUX:
12521260
case FASTQ_OPT_BARCODE:
1261+
case FASTQ_OPT_UMI:
1262+
case FASTQ_OPT_UMI_REGEX:
12531263
if (hts_set_opt(fp, opts->opt, opts->val.s) != 0)
12541264
return -1;
12551265
break;
@@ -1841,6 +1851,8 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
18411851
return 0;
18421852

18431853
case FASTQ_OPT_BARCODE:
1854+
case FASTQ_OPT_UMI:
1855+
case FASTQ_OPT_UMI_REGEX:
18441856
if (fp->format.format == fastq_format ||
18451857
fp->format.format == fasta_format) {
18461858
va_start(args, opt);

htslib/hts.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,16 @@ enum hts_fmt_option {
366366
// name to the second field and insert a constructed <run>.<number>
367367
// name in its place.
368368
FASTQ_OPT_NAME2,
369+
370+
// Process the UMI tag. Tag or Tag,tag,tag...
371+
// On read, this converts the last read-name element (Illumina) to the tag.
372+
// On write, it queries the tags in turn and copies the first found
373+
// to the read name suffix, converting any non-alpha to "+".
374+
FASTQ_OPT_UMI,
375+
376+
// Regex to use for matching read name.
377+
// Def: "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)"
378+
FASTQ_OPT_UMI_REGEX,
369379
};
370380

371381
// Profile options for encoding; primarily used at present in CRAM

sam.c

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ DEALINGS IN THE SOFTWARE. */
3636
#include <signal.h>
3737
#include <inttypes.h>
3838
#include <unistd.h>
39+
#include <regex.h>
3940

4041
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4142
#include "fuzz_settings.h"
@@ -3701,6 +3702,7 @@ int sam_set_threads(htsFile *fp, int nthreads) {
37013702
return 0;
37023703
}
37033704

3705+
#define UMI_TAGS 5
37043706
typedef struct {
37053707
kstring_t name;
37063708
kstring_t comment; // NB: pointer into name, do not free
@@ -3710,9 +3712,11 @@ typedef struct {
37103712
int aux;
37113713
int rnum;
37123714
char BC[3]; // aux tag ID for barcode
3715+
char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
37133716
khash_t(tag) *tags; // which aux tags to use (if empty, use all).
37143717
char nprefix;
37153718
int sra_names;
3719+
regex_t regex;
37163720
} fastq_state;
37173721

37183722
// Initialise fastq state.
@@ -3723,6 +3727,12 @@ static fastq_state *fastq_state_init(int name_char) {
37233727
return NULL;
37243728
strcpy(x->BC, "BC");
37253729
x->nprefix = name_char;
3730+
// Default Illumina naming convention
3731+
char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)";
3732+
if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3733+
free(x);
3734+
return NULL;
3735+
}
37263736

37273737
return x;
37283738
}
@@ -3735,6 +3745,7 @@ void fastq_state_destroy(htsFile *fp) {
37353745
ks_free(&x->name);
37363746
ks_free(&x->seq);
37373747
ks_free(&x->qual);
3748+
regfree(&x->regex);
37383749
free(fp->state);
37393750
}
37403751
}
@@ -3795,6 +3806,52 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
37953806
break;
37963807
}
37973808

3809+
case FASTQ_OPT_UMI: {
3810+
// UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3811+
va_start(args, opt);
3812+
char *bc = va_arg(args, char *), *bc_orig = bc;
3813+
va_end(args);
3814+
if (!bc || strcmp(bc, "1") == 0)
3815+
bc = "RX";
3816+
int ntags = 0, err = 0;
3817+
for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
3818+
if (!isalpha(bc[0]) || !isalnum_c(bc[1])) {
3819+
err = 1;
3820+
break;
3821+
}
3822+
3823+
strncpy(x->UMI[ntags], bc, 3);
3824+
bc += 2;
3825+
if (*bc && *bc != ',') {
3826+
err = 1;
3827+
break;
3828+
}
3829+
bc+=(*bc==',');
3830+
x->UMI[ntags][2] = 0;
3831+
}
3832+
for (; ntags < UMI_TAGS; ntags++)
3833+
x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
3834+
3835+
3836+
if (err)
3837+
hts_log_warning("Bad UMI tag list '%s'", bc_orig);
3838+
3839+
break;
3840+
}
3841+
3842+
case FASTQ_OPT_UMI_REGEX: {
3843+
va_start(args, opt);
3844+
char *re = va_arg(args, char *);
3845+
va_end(args);
3846+
3847+
regfree(&x->regex);
3848+
if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3849+
hts_log_error("Regular expression '%s' is not supported", re);
3850+
return -1;
3851+
}
3852+
break;
3853+
}
3854+
37983855
case FASTQ_OPT_RNUM:
37993856
x->rnum = 1;
38003857
break;
@@ -3907,6 +3964,43 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39073964
x->name.s[x->name.l-=2] = 0;
39083965
}
39093966

3967+
// Strip Illumina formatted UMI off read-name
3968+
char UMI_seq[256]; // maximum length in spec
3969+
size_t UMI_len = 0;
3970+
if (x->UMI[0][0]) {
3971+
regmatch_t match[3];
3972+
if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
3973+
&& match[0].rm_so >= 0 // whole regex
3974+
&& match[1].rm_so >= 0) { // bracketted UMI component
3975+
UMI_len = match[1].rm_eo - match[1].rm_so;
3976+
if (UMI_len > 255) {
3977+
hts_log_error("SAM read name is too long");
3978+
return -2;
3979+
}
3980+
3981+
// The SAMTags spec recommends (but not requires) separating
3982+
// barcodes with hyphen ('-').
3983+
size_t i;
3984+
for (i = 0; i < UMI_len; i++)
3985+
UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
3986+
? x->name.s[i+match[1].rm_so]
3987+
: '-';
3988+
3989+
// Move any trailing #num earlier in the name
3990+
if (UMI_len) {
3991+
UMI_seq[UMI_len++] = 0;
3992+
3993+
x->name.l = match[1].rm_so;
3994+
if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
3995+
x->name.l--; // remove colon too
3996+
char *cp = x->name.s + match[1].rm_eo;
3997+
while (*cp)
3998+
x->name.s[x->name.l++] = *cp++;
3999+
x->name.s[x->name.l] = 0;
4000+
}
4001+
}
4002+
}
4003+
39104004
// Convert to BAM
39114005
ret = bam_set1(b,
39124006
x->name.s + x->name.l - name, name,
@@ -3918,6 +4012,12 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) {
39184012
0);
39194013
if (ret < 0) return -2;
39204014

4015+
// Add UMI tag if removed from read-name above
4016+
if (UMI_len) {
4017+
if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
4018+
ret = -2;
4019+
}
4020+
39214021
// Identify Illumina CASAVA strings.
39224022
// <read>:<is_filtered>:<control_bits>:<barcode_sequence>
39234023
char *barcode = NULL;
@@ -4260,6 +4360,39 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
42604360
if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
42614361
return -1;
42624362

4363+
// UMI tag
4364+
if (x && *x->UMI[0]) {
4365+
// Temporary copy of '#num' if present
4366+
char plex[256];
4367+
size_t len = str->l;
4368+
while (len && str->s[len] != ':' && str->s[len] != '#')
4369+
len--;
4370+
4371+
if (str->s[len] == '#' && str->l - len < 255) {
4372+
memcpy(plex, &str->s[len], str->l - len);
4373+
plex[str->l - len] = 0;
4374+
str->l = len;
4375+
} else {
4376+
*plex = 0;
4377+
}
4378+
4379+
uint8_t *bc = NULL;
4380+
int n;
4381+
for (n = 0; !bc && n < UMI_TAGS; n++)
4382+
bc = bam_aux_get(b, x->UMI[n]);
4383+
if (bc && *bc == 'Z') {
4384+
int err = kputc(':', str) < 0;
4385+
// Replace any non-alpha with '+'
4386+
while (*++bc)
4387+
err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
4388+
if (err)
4389+
return -1;
4390+
}
4391+
4392+
if (*plex && kputs(plex, str) < 0)
4393+
return -1;
4394+
}
4395+
42634396
// /1 or /2 suffix
42644397
if (x && x->rnum && (flag & BAM_FPAIRED)) {
42654398
int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);

test/fastq/UMI.fq

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
@HS25:09827:FID:2:1201:1505:59794
2+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
3+
+
4+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
5+
@HS25:09827:FID:2:1201:1505:59795:A
6+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
7+
+
8+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
9+
@HS25:09827:FID:2:1201:1505:59796:ACG+TTTTTA
10+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
11+
+
12+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
13+
@HS25:09827:FID:2:1201:1505:59797:TACG+TTTTTA/1
14+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
15+
+
16+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
17+
@HS25:09827:FID:2:1201:1505:59797:TACG+TTTTTA/2
18+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
19+
+
20+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
21+
@HS25:09827:FID:2:1201:1505:59798:TTA#49/1
22+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
23+
+
24+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
25+
@HS25:09827:FID:2:1201:1505:59798:TTA#49/2
26+
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT
27+
+
28+
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI

test/fastq/UMI.sam

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
HS25:09827:FID:2:1201:1505:59794 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI
2+
HS25:09827:FID:2:1201:1505:59795 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:A
3+
HS25:09827:FID:2:1201:1505:59796 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:ACG-TTTTTA
4+
HS25:09827:FID:2:1201:1505:59797 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TACG-TTTTTA
5+
HS25:09827:FID:2:1201:1505:59797 141 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TACG-TTTTTA
6+
HS25:09827:FID:2:1201:1505:59798#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TTA
7+
HS25:09827:FID:2:1201:1505:59798#49 141 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCT CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GI RX:Z:TTA

test/fastq/fastq.tst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ P r2-q.sam $tview -i fastq_aux r2.fa
8484
P name2.sam $tview -i fastq_name2 name2.fq
8585
P name2-q.sam $tview -i fastq_name2 name2.fa
8686

87+
# UMI barcodes
88+
P UMI.sam $tview -i fastq_umi=RX UMI.fq
89+
8790
# --------------------
8891
# Writing
8992

@@ -114,3 +117,6 @@ P r1.fq $tview -f -o fastq_aux -o fastq_rnum r1.sam
114117
P r2.fq $tview -f -o fastq_aux -o fastq_rnum r2.sam
115118
P r1.fa $tview -F -o fastq_aux -o fastq_rnum r1.sam
116119
P r2.fa $tview -F -o fastq_aux -o fastq_rnum r2.sam
120+
121+
# UMI barcodes
122+
P UMI.fq $tview -f -o fastq_rnum -o fastq_umi UMI.sam

0 commit comments

Comments
 (0)