Skip to content

Commit 2fc03e5

Browse files
authored
Merge pull request #412 from maxulysse/dsl2_csv_magic
add CI for --skip_markduplicates
2 parents 474706c + 4d55e71 commit 2fc03e5

File tree

12 files changed

+184
-99
lines changed

12 files changed

+184
-99
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ jobs:
2525
# Nextflow versions: check pipeline minimum and current latest
2626
nxf_ver: ['21.04.0', '']
2727
engine: ['docker']
28-
test: ['default', 'aligner', 'gatk4_spark', 'targeted', 'tumor_normal_pair', 'variant_calling', 'annotation']
28+
test: ['default', 'aligner', 'gatk4_spark', 'targeted', 'skip_markduplicates', 'tumor_normal_pair', 'variant_calling', 'annotation']
2929
steps:
3030
- name: Check out pipeline code
3131
uses: actions/checkout@v2

conf/modules.config

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ params {
9797
args2 = 'sort'
9898
publish_files = false
9999
}
100+
'samtools_index_mapping' {
101+
publish_by_meta = true
102+
publish_files = ['bai':'mapped']
103+
publish_dir = 'preprocessing'
104+
}
100105
// MARKDUPLICATES
101106
'markduplicates' {
102107
args = 'REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT'

conf/test.config

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,16 @@ profiles {
4444
pair {
4545
params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/tiny-manta-https.csv'
4646
}
47+
prepare_recalibration {
48+
params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/tiny-mapped-normal-https.csv'
49+
params.step = 'prepare_recalibration'
50+
}
4751
save_bam_mapped {
4852
params.save_bam_mapped = true
4953
}
54+
skip_markduplicates {
55+
params.skip_markduplicates = true
56+
}
5057
split_fastq {
5158
params.split_fastq = 2
5259
}

modules/local/gatk4/baserecalibrator/main.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ process GATK4_BASERECALIBRATOR {
2323
path fasta
2424
path fai
2525
path dict
26-
path knownSitesTBI
2726
path knownSites
27+
path knownSites_tbi
2828

2929
output:
3030
tuple val(meta), path("*.table"), emit: table

subworkflows/local/build_indices.nf

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ workflow BUILD_INDICES {
6565

6666
result_dbsnp_tbi = Channel.empty()
6767
version_dbsnp_tbi = Channel.empty()
68-
if (!(params.dbsnp_tbi) && params.dbsnp && ('mapping' in step || 'prepare_recalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools|| 'mutect2' in tools || 'tnscope' in tools)) {
68+
if (!(params.dbsnp_tbi) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools|| 'mutect2' in tools || 'tnscope' in tools)) {
6969
dbsnp_id = dbsnp.map {it -> [[id:"$it.baseName"], it]}
7070
(result_dbsnp_tbi, version_dbsnp_tbi) = TABIX_DBSNP(dbsnp_id)
7171
result_dbsnp_tbi = result_dbsnp_tbi.map {meta, tbi -> [tbi]}
@@ -81,14 +81,14 @@ workflow BUILD_INDICES {
8181

8282
result_germline_resource_tbi = Channel.empty()
8383
version_germline_resource_tbi = Channel.empty()
84-
if (!(params.germline_resource_tbi) && params.germline_resource && 'mutect2' in tools){
84+
if (!(params.germline_resource_tbi) && params.germline_resource && 'mutect2' in tools) {
8585
germline_resource_id = germline_resource.map {it -> [[id:"$it.baseName"], it]}
8686
(result_germline_resource_tbi, version_germline_resource_tbi) = TABIX_GERMLINE_RESOURCE(germline_resource_id)
8787
}
8888

8989
result_known_indels_tbi = Channel.empty()
9090
version_known_indels_tbi = Channel.empty()
91-
if (!(params.known_indels_tbi) && params.known_indels && ('mapping' in step || 'prepare_recalibration' in step)){
91+
if (!(params.known_indels_tbi) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) {
9292
known_indels_id = known_indels.map {it -> [[id:"$it.baseName"], it]}
9393
(result_known_indels_tbi, version_known_indels_tbi) = TABIX_KNOWN_INDELS(known_indels_id)
9494
result_known_indels_tbi = result_known_indels_tbi.map {meta, tbi -> [tbi]}
@@ -101,7 +101,7 @@ workflow BUILD_INDICES {
101101

102102
result_pon_tbi = Channel.empty()
103103
version_pon_tbi = Channel.empty()
104-
if (!(params.pon_tbi) && params.pon && ('tnscope' in tools || 'mutect2' in tools)){
104+
if (!(params.pon_tbi) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) {
105105
pon_id = pon.map {it -> [[id:"$it.baseName"], it]}
106106
(result_pon_tbi, version_pon_tbi) = TABIX_PON(pon_id)
107107
}

subworkflows/local/mapping_csv.nf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@
66

77
workflow MAPPING_CSV {
88
take:
9-
bam_mapped // channel: [mandatory] meta, bam, bai
9+
bam_indexed // channel: [mandatory] meta, bam, bai
1010
save_bam_mapped // boolean: [mandatory] save_bam_mapped
1111
skip_markduplicates // boolean: [mandatory] skip_markduplicates
1212

1313
main:
1414
if (save_bam_mapped) {
15-
csv_bam_mapped = bam_mapped.map { meta, bam, bai -> [meta] }
15+
csv_bam_mapped = bam_indexed.map { meta, bam, bai -> [meta] }
1616
// Creating csv files to restart from this step
1717
csv_bam_mapped.collectFile(storeDir: "${params.outdir}/preprocessing/csv") { meta ->
1818
patient = meta.patient[0]
@@ -26,7 +26,7 @@ workflow MAPPING_CSV {
2626
}
2727

2828
if (skip_markduplicates) {
29-
csv_bam_mapped = bam_mapped.map { meta, bam, bai -> [meta] }
29+
csv_bam_mapped = bam_indexed.map { meta, bam, bai -> [meta] }
3030
// Creating csv files to restart from this step
3131
csv_bam_mapped.collectFile(storeDir: "${params.outdir}/preprocessing/csv") { meta ->
3232
patient = meta.patient[0]

subworkflows/nf-core/mapping.nf

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,44 +4,46 @@
44
========================================================================================
55
*/
66

7-
params.seqkit_split2_options = [:]
87
params.bwamem1_mem_options = [:]
98
params.bwamem1_mem_tumor_options = [:]
109
params.bwamem2_mem_options = [:]
1110
params.bwamem2_mem_tumor_options = [:]
11+
params.merge_bam_options = [:]
12+
params.samtools_index_options = [:]
13+
params.seqkit_split2_options = [:]
1214

13-
include { SEQKIT_SPLIT2 } from '../../modules/nf-core/modules/seqkit/split2/main.nf' addParams(options: params.seqkit_split2_options)
15+
include { BWAMEM2_MEM as BWAMEM2_MEM_T } from '../../modules/local/bwamem2/mem/main' addParams(options: params.bwamem2_mem_tumor_options)
16+
include { BWAMEM2_MEM } from '../../modules/local/bwamem2/mem/main' addParams(options: params.bwamem2_mem_options)
1417
include { BWA_MEM as BWAMEM1_MEM } from '../../modules/local/bwa/mem/main' addParams(options: params.bwamem1_mem_options)
1518
include { BWA_MEM as BWAMEM1_MEM_T } from '../../modules/local/bwa/mem/main' addParams(options: params.bwamem1_mem_tumor_options)
16-
include { BWAMEM2_MEM } from '../../modules/local/bwamem2/mem/main' addParams(options: params.bwamem2_mem_options)
17-
include { BWAMEM2_MEM as BWAMEM2_MEM_T } from '../../modules/local/bwamem2/mem/main' addParams(options: params.bwamem2_mem_tumor_options)
19+
include { SAMTOOLS_INDEX } from '../../modules/local/samtools/index/main' addParams(options: params.samtools_index_options)
20+
include { SAMTOOLS_MERGE } from '../../modules/nf-core/modules/samtools/merge/main' addParams(options: params.merge_bam_options)
21+
include { SEQKIT_SPLIT2 } from '../../modules/nf-core/modules/seqkit/split2/main.nf' addParams(options: params.seqkit_split2_options)
1822

1923
workflow MAPPING {
2024
take:
21-
aligner // string: [mandatory] "bwa-mem" or "bwa-mem2"
22-
bwa // channel: [mandatory] bwa
23-
fai // channel: [mandatory] fai
24-
fasta // channel: [mandatory] fasta
25-
reads_input // channel: [mandatory] meta, reads_input
25+
aligner // string: [mandatory] "bwa-mem" or "bwa-mem2"
26+
bwa // channel: [mandatory] bwa
27+
fai // channel: [mandatory] fai
28+
fasta // channel: [mandatory] fasta
29+
reads_input // channel: [mandatory] meta, reads_input
30+
skip_markduplicates // boolean: true/false
2631

2732
main:
2833

29-
bam_mapped_index = Channel.empty()
30-
bam_reports = Channel.empty()
31-
34+
bam_indexed = Channel.empty()
3235

33-
if(params.split_fastq > 1){
36+
if (params.split_fastq > 1) {
3437
reads_input_split = SEQKIT_SPLIT2(reads_input).reads.map{
3538
key, reads ->
3639
//TODO maybe this can be replaced by a regex to include part_001 etc.
3740

3841
//sorts list of split fq files by :
3942
//[R1.part_001, R2.part_001, R1.part_002, R2.part_002,R1.part_003, R2.part_003,...]
4043
//TODO: determine whether it is possible to have an uneven number of parts, in which case remainder: true would need to be used. This could be possible for unfiltered reads, reads that don't have pairs, etc.
41-
return [key, reads.sort{ a,b -> a.getName().tokenize('.')[ a.getName().tokenize('.').size() - 3] <=> b.getName().tokenize('.')[ b.getName().tokenize('.').size() - 3]}
42-
.collate(2)]
44+
return [key, reads.sort{ a,b -> a.getName().tokenize('.')[ a.getName().tokenize('.').size() - 3] <=> b.getName().tokenize('.')[ b.getName().tokenize('.').size() - 3]}.collate(2)]
4345
}.transpose()
44-
}else{
46+
} else {
4547
reads_input_split = reads_input
4648
}
4749

@@ -77,14 +79,31 @@ workflow MAPPING {
7779
bam_bwa.map{ meta, bam ->
7880
meta.remove('read_group')
7981
meta.id = meta.sample
82+
// groupKey makes sure that the correct group can advance as soon as it is complete,
83+
// rather than stalling the workflow until all pieces are mapped
8084
def groupKey = groupKey(meta, meta.numLanes * params.split_fastq)
8185
tuple(groupKey, bam)
8286
[meta, bam]
83-
}.groupTuple() //groupKey above is somehow makes sure, the workflow doesn't stall until all pieces are mapped, but that the correct group can advance as soon as it is complete
87+
}.groupTuple()
8488
.set{bam_mapped}
8589

86-
// STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES // MarkDuplicates can take care of this
90+
// MarkDuplicates can handle multiple BAMs as input, so no merging/indexing at this step
91+
// Except if and only if skipping MarkDuplicates
92+
93+
if (skip_markduplicates) {
94+
bam_mapped.branch{
95+
single: it[1].size() == 1
96+
multiple: it[1].size() > 1
97+
}.set{ bam_to_merge }
98+
99+
SAMTOOLS_MERGE(bam_to_merge.multiple)
100+
bam_merged = bam_to_merge.single.mix(SAMTOOLS_MERGE.out.bam)
101+
102+
SAMTOOLS_INDEX(bam_merged)
103+
bam_indexed = bam_merged.join(SAMTOOLS_INDEX.out.bai)
104+
}
87105

88106
emit:
89-
bam = bam_mapped
107+
bam = bam_mapped
108+
bam_indexed = bam_indexed
90109
}

subworkflows/nf-core/prepare_recalibration.nf

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,30 +24,23 @@ workflow PREPARE_RECALIBRATION {
2424
known_sites // channel: [optional] known_sites
2525
known_sites_tbi // channel: [optional] known_sites_tbi
2626
no_intervals // value: [mandatory] no_intervals
27-
known_indels
28-
dbsnp
2927

3028
main:
3129
cram_markduplicates.combine(intervals)
32-
.map{ meta, cram, crai, intervals ->
33-
new_meta = meta.clone()
34-
new_meta.id = meta.sample + "_" + intervals.baseName
35-
[new_meta, cram, crai, intervals]
36-
}
37-
.set{cram_markduplicates_intervals}
30+
.map{ meta, cram, crai, intervals ->
31+
new_meta = meta.clone()
32+
new_meta.id = meta.sample + "_" + intervals.baseName
33+
[new_meta, cram, crai, intervals]
34+
}.set{cram_markduplicates_intervals}
3835

39-
if(use_gatk_spark){
36+
if (use_gatk_spark) {
4037
BASERECALIBRATOR_SPARK(cram_markduplicates_intervals, fasta, fai, dict, known_sites, known_sites_tbi)
4138
table_baserecalibrator = BASERECALIBRATOR_SPARK.out.table
42-
}else{
43-
BASERECALIBRATOR(cram_markduplicates_intervals, fasta, fai, dict, known_sites_tbi, known_sites)
39+
} else {
40+
BASERECALIBRATOR(cram_markduplicates_intervals, fasta, fai, dict, known_sites, known_sites_tbi)
4441
table_baserecalibrator = BASERECALIBRATOR.out.table
4542
}
4643

47-
//num_intervals = intervals.toList().size.view() //Integer.valueOf()
48-
//.view()
49-
//println(intervals.toList().getClass()) //.value.getClass())
50-
5144
//STEP 3.5: MERGING RECALIBRATION TABLES
5245
if (no_intervals) {
5346
table_baserecalibrator.map { meta, table ->
@@ -62,7 +55,6 @@ workflow PREPARE_RECALIBRATION {
6255

6356
GATHERBQSRREPORTS(recaltable)
6457
table_bqsr = GATHERBQSRREPORTS.out.table
65-
6658
}
6759

6860
emit:

subworkflows/nf-core/qc_markduplicates.nf

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ params.samtools_index_options = [:]
1616
include { GATK4_MARKDUPLICATES } from '../../modules/local/gatk4/markduplicates/main' addParams(options: params.markduplicates_options)
1717
include { GATK4_MARKDUPLICATES_SPARK } from '../../modules/local/gatk4/markduplicatesspark/main' addParams(options: params.markduplicatesspark_options)
1818
include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../modules/local/gatk4/estimatelibrarycomplexity/main' addParams(options: params.estimatelibrarycomplexity_options)
19-
include { SAMTOOLS_MERGE } from '../../modules/nf-core/modules/samtools/merge/main' addParams(options: params.merge_bam_options)
2019
include { QUALIMAP_BAMQC } from '../../modules/local/qualimap/bamqc/main' addParams(options: params.qualimap_bamqc_options)
2120
include { SAMTOOLS_STATS } from '../../modules/local/samtools/stats/main' addParams(options: params.samtools_stats_options)
2221
include { SAMTOOLS_VIEW as SAMTOOLS_BAM_TO_CRAM } from '../../modules/local/samtools/view/main.nf' addParams(options: params.samtools_view_options)
@@ -25,51 +24,43 @@ include { SAMTOOLS_INDEX } from '../../modules/loca
2524

2625
workflow QC_MARKDUPLICATES {
2726
take:
28-
bam_mapped // channel: [mandatory] meta, bam, bai
27+
bam_mapped // channel: [mandatory] meta, bam
28+
bam_indexed // channel: [mandatory] meta, bam, bai
2929
use_gatk_spark // value: [mandatory] use gatk spark
3030
save_metrics // value: [mandatory] save metrics
3131
fasta // channel: [mandatory] fasta
3232
fai // channel: [mandatory] fai
3333
dict // channel: [mandatory] dict
34+
skip_markduplicates // boolean: true/false
3435
skip_bamqc // boolean: true/false
3536
skip_samtools // boolean: true/false
3637
target_bed // channel: [optional] target_bed
3738

3839
main:
3940

4041
report_markduplicates = Channel.empty()
41-
if(params.skip_markduplicates){
42-
43-
bam_mapped.branch{
44-
single: it[1].size() == 1
45-
multiple: it[1].size() > 1
46-
}.set{ bam_to_merge }
47-
48-
SAMTOOLS_MERGE(bam_to_merge.multiple)
49-
bam_merged = bam_to_merge.single.mix(SAMTOOLS_MERGE.out.bam)
50-
51-
SAMTOOLS_INDEX(bam_merged)
52-
bam_markduplicates = bam_merged.join(SAMTOOLS_INDEX.out.bai)
53-
cram_markduplicates = SAMTOOLS_BAM_TO_CRAM(bam_markduplicates, fasta, fai)
54-
} else{
5542

43+
if (skip_markduplicates) {
44+
bam_markduplicates = bam_indexed
45+
SAMTOOLS_BAM_TO_CRAM(bam_markduplicates, fasta, fai)
46+
cram_markduplicates = SAMTOOLS_BAM_TO_CRAM.out.cram
47+
} else {
5648
if (use_gatk_spark) {
57-
5849
//If BAMQC should be run on MD output, then don't use MDSpark to convert to cram, but use bam output instead
59-
if(!skip_bamqc){
50+
if (!skip_bamqc) {
6051
GATK4_MARKDUPLICATES_SPARK(bam_mapped, fasta, fai, dict, "bam")
6152
SAMTOOLS_INDEX(GATK4_MARKDUPLICATES_SPARK.out.output)
6253
bam_markduplicates = GATK4_MARKDUPLICATES_SPARK.out.output.join(SAMTOOLS_INDEX.out.bai)
6354

6455
SAMTOOLS_BAM_TO_CRAM_SPARK(bam_markduplicates, fasta, fai)
6556
cram_markduplicates = SAMTOOLS_BAM_TO_CRAM_SPARK.out.cram
66-
}else{
57+
} else {
6758
GATK4_MARKDUPLICATES_SPARK(bam_mapped, fasta, fai, dict, "cram")
6859
SAMTOOLS_INDEX(GATK4_MARKDUPLICATES_SPARK.out.output)
6960
cram_markduplicates = GATK4_MARKDUPLICATES_SPARK.out.output.join(SAMTOOLS_INDEX.out.crai)
7061
}
7162

72-
if(save_metrics){
63+
if (save_metrics) {
7364
GATK4_ESTIMATELIBRARYCOMPLEXITY(bam_mapped, fasta, fai, dict)
7465
report_markduplicates = GATK4_ESTIMATELIBRARYCOMPLEXITY.out.metrics
7566
}
@@ -88,12 +79,12 @@ workflow QC_MARKDUPLICATES {
8879
//if !skip_markduplicates, then QC tools are run on duplicate marked bams
8980
//After bamqc finishes, convert to cram for further analysis
9081
qualimap_bamqc = Channel.empty()
91-
if(!skip_bamqc){
82+
if (!skip_bamqc && !skip_markduplicates) {
83+
//TODO: after adding CI tests, allow bamqc on mapped bams if no duplicate marking is done
9284
QUALIMAP_BAMQC(bam_markduplicates, target_bed, params.target_bed)
9385
qualimap_bamqc = QUALIMAP_BAMQC.out
9486
}
9587

96-
9788
samtools_stats = Channel.empty()
9889
if (!skip_samtools) {
9990
SAMTOOLS_STATS(cram_markduplicates, fasta)

tests/test_annotation.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
files:
77
- path: results/annotation/1234N/1234N_snpEff.ann.gz
88
- path: results/annotation/1234N/1234N_snpEff.ann.gz.tbi
9+
- path: results/multiqc
910
- name: Run VEP
1011
command: nextflow run main.nf -profile test,annotation,docker --tools vep
1112
tags:
@@ -14,6 +15,7 @@
1415
files:
1516
- path: results/annotation/1234N/1234N_VEP.ann.gz
1617
- path: results/annotation/1234N/1234N_VEP.ann.gz.tbi
18+
- path: results/multiqc
1719
- name: Run snpEff followed by VEP
1820
command: nextflow run main.nf -profile test,annotation,docker --tools merge
1921
tags:
@@ -26,3 +28,4 @@
2628
- path: results/annotation/1234N/1234N_snpEff.ann.gz.tbi
2729
- path: results/annotation/1234N/1234N_snpEff_VEP.ann.gz
2830
- path: results/annotation/1234N/1234N_snpEff_VEP.ann.gz.tbi
31+
- path: results/multiqc

0 commit comments

Comments
 (0)