Merge pull request #613 from maxulysse/dev_vep_plugin

maxulysse · web-flow · commit c81e07a80380 · 2022-07-07T16:35:18.000+02:00
add params for dbnsfp vep plugin + fix filenames for vep plugins
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -34,6 +34,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#597](https://github.com/nf-core/sarek/pull/597) - Added tiddit for tumor variant calling
 - [#600](https://github.com/nf-core/sarek/pull/600) - Added description for UMI related params in schema
 - [#604](https://github.com/nf-core/sarek/pull/604), [#617](https://github.com/nf-core/sarek/pull/617) - Added full size tests WGS 30x NA12878
+- [#613](https://github.com/nf-core/sarek/pull/613) - Added params `--dbnsfp_fields` to allow configuration of fields for the `dbnsfp` `VEP` plugin
+- [#613](https://github.com/nf-core/sarek/pull/613) - Added params `--dbnsfp_consequence` to allow configuration of consequence for the `dbnsfp` `VEP` plugin
+- [#613](https://github.com/nf-core/sarek/pull/613) - Added params `--vep_version` to allow more configuration on the vep container definition
 - [#620](https://github.com/nf-core/sarek/pull/620) - Added checks for sex information when running a CNV tools
 
 ### Changed
@@ -111,6 +114,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#600](https://github.com/nf-core/sarek/pull/600) - Remove `nf-core lint` warnings
 - [#602](https://github.com/nf-core/sarek/pull/602) - Fixed bug in `alignment_to_fastq` and added tests
 - [#609](https://github.com/nf-core/sarek/pull/609) - Remove unused intervals code, reorganize combined intervals file
+- [#613](https://github.com/nf-core/sarek/pull/613) - Fixed filenames for `dbnsfp` and `SpliceAI` `VEP` plugins
 - [#615](https://github.com/nf-core/sarek/pull/615) - Fix ASCAT igenomes file paths
 - [#619](https://github.com/nf-core/sarek/pull/619) - Fix issue with checking samplesheet content with AWS
 
diff --git a/conf/igenomes.config b/conf/igenomes.config
@@ -35,6 +35,7 @@ params {
             vep_cache_version     = 105
             vep_genome            = 'GRCh37'
             vep_species           = 'homo_sapiens'
+            vep_version           = '104.3'
         }
         'GATK.GRCh38' {
             ascat_alleles         = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/G1000_alleles_hg38.zip"
@@ -64,6 +65,7 @@ params {
             vep_cache_version     = 105
             vep_genome            = 'GRCh38'
             vep_species           = 'homo_sapiens'
+            vep_version           = '104.3'
         }
         'Ensembl.GRCh37' {
             bwa                   = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/"
@@ -92,6 +94,7 @@ params {
             vep_cache_version     = 102
             vep_genome            = 'GRCm38'
             vep_species           = 'mus_musculus'
+            vep_version           = '104.3'
         }
         'TAIR10' {
             bwa                   = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/"
@@ -116,6 +119,7 @@ params {
             vep_cache_version     = 105
             vep_genome            = 'WBcel235'
             vep_species           = 'caenorhabditis_elegans'
+            vep_version           = '104.3'
         }
         'CanFam3.1' {
             bwa                   = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/"
diff --git a/conf/modules.config b/conf/modules.config
@@ -1146,17 +1146,18 @@ process{
     // VEP
     if (params.tools && (params.tools.contains('vep') || params.tools.contains('merge'))) {
         withName: 'ENSEMBLVEP' {
-            // If just VEP: <vcf prefix>_VEP.ann.vcf
-            ext.prefix       = { "${vcf.baseName.minus(".vcf")}_VEP" }
             ext.args          = [
                 '--everything --filter_common --per_gene --total_length --offline --format vcf',
-                (params.vep_dbnsfp && params.dbnsfp)                                  ? '--plugin dbNSFP,dbNSFP.gz,rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF' : '',
-                (params.vep_loftee)                                                   ? '--plugin LoF,loftee_path:/opt/conda/envs/nf-core-vep-104.3/share/ensembl-vep-104.3-0'                                      : '',
-                (params.vep_spliceai && params.spliceai_snv && params.spliceai_indel) ? '--plugin SpliceAI,snv=spliceai_scores.raw.snv.hg38.vcf.gz,indel=spliceai_scores.raw.indel.hg38.vcf.gz'                     : '',
-                (params.vep_spliceregion)                                             ? '--plugin SpliceRegion'                                                                                                     : '',
-                (params.vep_out_format)                                               ? "--${params.vep_out_format}"                                                                                                : '--vcf'
+                (params.vep_dbnsfp && params.dbnsfp && !params.dbnsfp_consequence)    ? "--plugin dbNSFP,${params.dbnsfp.split("/")[-1]},${params.dbnsfp_fields}"                                              : '',
+                (params.vep_dbnsfp && params.dbnsfp && params.dbnsfp_consequence)     ? "--plugin dbNSFP,'${params.dbnsfp_consequence}',${params.dbnsfp.split("/")[-1]},${params.dbnsfp_fields}"               : '',
+                (params.vep_loftee)                                                   ? "--plugin LoF,loftee_path:/opt/conda/envs/nf-core-vep-${params.vep_version}/share/ensembl-vep-${params.vep_version}-0" : '',
+                (params.vep_spliceai && params.spliceai_snv && params.spliceai_indel) ? "--plugin SpliceAI,snv=${params.spliceai_snv.split("/")[-1]},indel=${params.spliceai_indel.split("/")[-1]}"            : '',
+                (params.vep_spliceregion)                                             ? '--plugin SpliceRegion'                                                                                                : '',
+                (params.vep_out_format)                                               ? "--${params.vep_out_format}"                                                                                           : '--vcf'
             ].join(' ').trim()
-            if (!params.vep_cache)    container = { params.vep_genome ? "nfcore/vep:104.3.${params.vep_genome}" : "nfcore/vep:104.3.${params.genome}" }
+            // If just VEP: <vcf prefix>_VEP.ann.vcf
+            ext.prefix       = { "${vcf.baseName.minus(".vcf")}_VEP" }
+            if (!params.vep_cache)    container = { params.vep_genome ? "nfcore/vep:${params.vep_version}.${params.vep_genome}" : "nfcore/vep:${params.vep_version}.${params.genome}" }
             publishDir       = [
                 [
                     mode: params.publish_dir_mode,
@@ -1175,7 +1176,7 @@ process{
     // SNPEFF THEN VEP
     if (params.tools && params.tools.contains('merge')) {
         withName: ".*:ANNOTATION_MERGE:ENSEMBLVEP" {
-            // If megre: Output file will have format *_snpEff_VEP.ann.vcf, *_snpEff_VEP.ann.json or *_snpEff_VEP.ann.tab
+            // If merge: Output file will have format *_snpEff_VEP.ann.vcf, *_snpEff_VEP.ann.json or *_snpEff_VEP.ann.tab
             ext.prefix       = { "${vcf.baseName.minus(".ann.vcf")}_VEP" }
         }
     }
diff --git a/conf/test.config b/conf/test.config
@@ -41,6 +41,7 @@ params {
     vep_cache_version = 104
     vep_genome        = 'WBcel235'
     vep_species       = 'caenorhabditis_elegans'
+    vep_version       = '104.3'
 
     // Ignore params that will throw warning through params validation
     schema_ignore_params = "genomes,test_data"
diff --git a/main.nf b/main.nf
@@ -52,6 +52,7 @@ params.pon_tbi               = WorkflowMain.getGenomeAttribute(params, 'pon_tbi'
 params.snpeff_db             = WorkflowMain.getGenomeAttribute(params, 'snpeff_db')
 params.snpeff_genome         = WorkflowMain.getGenomeAttribute(params, 'snpeff_genome')
 params.vep_cache_version     = WorkflowMain.getGenomeAttribute(params, 'vep_cache_version')
+params.vep_version           = WorkflowMain.getGenomeAttribute(params, 'vep_version')
 params.vep_genome            = WorkflowMain.getGenomeAttribute(params, 'vep_genome')
 params.vep_species           = WorkflowMain.getGenomeAttribute(params, 'vep_species')
 
diff --git a/nextflow.config b/nextflow.config
@@ -76,6 +76,8 @@ params {
     vep_dbnsfp = null // dbnsfp plugin disabled within VEP
     dbnsfp = null // No dbnsfp processed file
     dbnsfp_tbi = null // No dbnsfp processed file index
+    dbnsfp_consequence = null // No default consequence for dbnsfp plugin
+    dbnsfp_fields = "rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF" // Default fields for dbnsfp plugin
     vep_loftee = null // loftee plugin disabled within VEP
     vep_spliceai = null // spliceai plugin disabled within VEP
     spliceai_snv = null // No spliceai_snv file
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -325,7 +325,7 @@
                     "type": "string",
                     "fa_icon": "fas fa-database",
                     "description": "Path to dbNSFP processed file.",
-                    "help_text": "To be used with `--vep_dbnsfp`.",
+                    "help_text": "To be used with `--vep_dbnsfp`.\ndbNSFP files and more information are available at https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp and https://sites.google.com/site/jpopgen/dbNSFP/",
                     "hidden": true
                 },
                 "dbnsfp_tbi": {
@@ -335,6 +335,21 @@
                     "help_text": "To be used with `--vep_dbnsfp`.",
                     "hidden": true
                 },
+                "dbnsfp_consequence": {
+                    "type": "string",
+                    "fa_icon": "fas fa-database",
+                    "description": "Consequence to annotate with",
+                    "help_text": "To be used with `--vep_dbnsfp`.\nThis param is used to filter/limit outputs to a specific effect of the variant.\nThe set of consequence terms is defined by the Sequence Ontology and an overview of those used in VEP can be found here: https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html\nIf one wants to filter using several consequences, then separate those by using '&' (i.e. 'consequence=3_prime_UTR_variant&intron_variant').",
+                    "hidden": true
+                },
+                "dbnsfp_fields": {
+                    "type": "string",
+                    "fa_icon": "fas fa-database",
+                    "description": "Fields to annotate with",
+                    "default": "rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF",
+                    "help_text": "To be used with `--vep_dbnsfp`.\nThis param can be used to retrieve individual values from the dbNSFP file. The values correspond to the name of the columns in the dbNSFP file and are separated by comma.\nThe column names might differ between the different dbNSFP versions. Please check the Readme.txt file, which is provided with the dbNSFP file, to obtain the correct column names. The Readme file also contains a short description of the provided values and the version of the tools used to generate them.\nDefault values are explained below:\nrs_dbSNP - rs number from dbSNP\nHGVSc_VEP - HGVS coding variant presentation from VEP. Multiple entries separated by ';', corresponds to Ensembl_transcriptid\nHGVSp_VEP - HGVS protein variant presentation from VEP. Multiple entries separated by ';', corresponds to Ensembl_proteinid\n1000Gp3_EAS_AF - Alternative allele frequency in the 1000Gp3 East Asian descendent samples\n1000Gp3_AMR_AF - Alternative allele frequency in the 1000Gp3 American descendent samples\nLRT_score - Original LRT two-sided p-value (LRTori), ranges from 0 to 1\nGERP++_RS - Conservation score. The larger the score, the more conserved the site, ranges from -12.3 to 6.17\ngnomAD_exomes_AF - Alternative allele frequency in the whole gnomAD exome samples.",
+                    "hidden": true
+                },
                 "vep_loftee": {
                     "type": "boolean",
                     "fa_icon": "fas fa-database",
@@ -401,6 +416,14 @@
                     "description": "Path to VEP cache.",
                     "help_text": "To be used with `--annotation_cache`.",
                     "hidden": true
+                },
+                "vep_out_format": {
+                    "type": "string",
+                    "default": "vcf",
+                    "description": "VEP output-file format.",
+                    "enum": ["json", "tab", "vcf"],
+                    "help_text": "Sets the format of the output-file from VEP. Available formats: json, tab and vcf.",
+                    "fa_icon": "fas fa-table"
                 }
             }
         },
@@ -425,13 +448,12 @@
                 "ascat_chromosomes": {
                     "type": "string",
                     "fa_icon": "fa-solid fa-text",
-                    "default": "'c(1:22, 'X')'",
+                    "default": "c(1:22, 'X')",
                     "help_text": "Specify specific chromosomes to run ASCAT on, i.e 'c('21', '22')'."
                 },
                 "ascat_genome": {
                     "type": "string",
                     "fa_icon": "fa-solid fa-text",
-                    "default": "hg38",
                     "description": "ASCAT genome.",
                     "help_text": "Must be set to run ASCAT, either hg19 or hg38. If you use AWS iGenomes, this has already been set for you appropriately."
                 },
@@ -589,13 +611,11 @@
                     "description": "VEP cache version.",
                     "help_text": "If you use AWS iGenomes, this has already been set for you appropriately."
                 },
-                "vep_out_format": {
+                "vep_version": {
                     "type": "string",
-                    "default": "vcf",
-                    "description": "VEP output-file format.",
-                    "enum": ["json", "tab", "vcf"],
-                    "help_text": "Sets the format of the output-file from VEP. Available formats: json, tab and vcf.",
-                    "fa_icon": "fas fa-table"
+                    "fa_icon": "fas fa-tag",
+                    "description": "VEP version.",
+                    "help_text": "If you use AWS iGenomes, this has already been set for you appropriately."
                 },
                 "save_reference": {
                     "type": "boolean",