nf-core · Darcy220606 · Dec 5, 2024 · Dec 5, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - [#421](https://github.com/nf-core/funcscan/pull/421) Updated to nf-core template 3.0.2. (by @jfy133)
 - [#427](https://github.com/nf-core/funcscan/pull/427) AMPcombi now can use multiple other databases for classifications. (by @darcy220606)
+- [#428](https://github.com/nf-core/funcscan/pull/428) Added InterProScan annotation workflow to the pipeline. The results are coupled to AMPcombi final table. (by @darcy220606)
 - [#429](https://github.com/nf-core/funcscan/pull/429) Updated to nf-core template 3.1.0. (by @jfy133 and @jasmezz)
 - [#433](https://github.com/nf-core/funcscan/pull/433) Updated to nf-core template 3.1.1. (by @jfy133)
 - [#431](https://github.com/nf-core/funcscan/pull/431) Updated AMPcombi, Macrel, all MMseqs2 modules, MultiQC, Pyrodigal, and seqkit, added `--taxa_classification_mmseqs_compressed` parameter. (by @jasmezz)
@@ -26,15 +27,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Dependencies`
 
-| Tool      | Previous version | New version |
-| --------- | ---------------- | ----------- |
-| AMPcombi  | 0.2.2            | 2.0.1       |
-| Bakta     | 1.9.3            | 1.10.4      |
-| Macrel    | 1.2.0            | 1.4.0       |
-| MMseqs2   | 15.6f452         | 17.b804f    |
-| MultiQC   | 1.24.0           | 1.27        |
-| Pyrodigal | 3.3.0            | 3.6.3       |
-| seqkit    | 2.8.1            | 2.9.0       |
+| Tool         | Previous version | New version |
+| ------------ | ---------------- | ----------- |
+| AMPcombi     | 0.2.2            | 2.0.1       |
+| Bakta        | 1.9.3            | 1.10.4      |
+| InterProScan | -                | 5.59_91.0   |
+| Macrel       | 1.2.0            | 1.4.0       |
+| MMseqs2      | 15.6f452         | 17.b804f    |
+| MultiQC      | 1.24.0           | 1.27        |
+| Pyrodigal    | 3.3.0            | 3.6.3       |
+| seqkit       | 2.8.1            | 2.9.0       |
 
 ### `Deprecated`
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -70,6 +70,14 @@
 
   > Eddy S. R. (2011). Accelerated Profile HMM Searches. PLoS computational biology, 7(10), e1002195. [DOI: 10.1371/journal.pcbi.1002195](https://doi.org/10.1371/journal.pcbi.1002195)
 
+- [InterPro](https://doi.org/10.1093/nar/gkaa977)
+
+  > Blum, M., Chang, H-Y., Chuguransky, S., Grego, T., Kandasaamy, S., Mitchell, A., Nuka, G., Paysan-Lafosse, T., Qureshi, M., Raj, S., Richardson, L., Salazar, G. A., Williams, L., Bork, P., Bridge, A., Gough, J., Haft, D. H., Letunic, I., Marchler-Bauer, A., Mi, H., Natale, D. A., Necci, M., Orengo, C. A., Pandurangan, A. P., Rivoire, C., Sigrist, C. A., Sillitoe, I., Thanki, N., Thomas, P. D., Tosatto, S. C. E., Wu, C. H., Bateman, A., Finn, R. D. (2021) The InterPro protein families and domains database: 20 years on. Nucleic Acids Research, 49(D1), D344–D354. [DOI: 10.1093/nar/gkaa977](https://doi.org/10.1093/nar/gkaa977)
+
+- [InterProScan](https://doi.org/10.1093/bioinformatics/btu031)
+
+  > Jones, P., Binns, D., Chang, H-Y., Fraser, M., Li, W., McAnulla, C., McWilliam, H., Maslen, J., Mitchell, A., Nuka, G., Pesseat, S., Quinn, A. F., Sangrador-Vegas, A., Scheremetjew, M., Yong, S-Y., Lopez, R., Hunter, S. (2014) InterProScan 5: genome-scale protein function classification. Bioinformatics, 30(9), 1236–1240. [DOI: 10.1093/bioinformatics/btu031](https://doi.org/10.1093/bioinformatics/btu031)
+
 - [Macrel](https://doi.org/10.7717/peerj.10555)
 
   > Santos-Júnior, C. D., Pan, S., Zhao, X. M., & Coelho, L. P. (2020). Macrel: antimicrobial peptide screening in genomes and metagenomes. PeerJ, 8, e10555. [DOI: 10.7717/peerj.10555](https://doi.org/10.7717/peerj.10555)

diff --git a/conf/base.config b/conf/base.config
@@ -231,4 +231,11 @@ process {
         memory = { 6.GB * task.attempt }
         time   = { 2.h * task.attempt }
     }
+
+    withName: INTERPROSCAN_DATABASE {
+        memory = { 6.GB * task.attempt }  // every request below is multiplied by task.attempt
+        time   = { 4.h * task.attempt }
+        cpus   = { 6 * task.attempt }
+    }
+
 }
diff --git a/conf/modules.config b/conf/modules.config
@@ -91,7 +91,7 @@ process {
         ].join(' ').trim()
     }
 
-    withName: SEQKIT_SEQ {
+    withName: SEQKIT_SEQ_LENGTH {
         ext.prefix = { "${meta.id}_long" }
         publishDir = [
             path: { "${params.outdir}/bgc/seqkit/" },
@@ -104,6 +104,45 @@ process {
         ].join(' ').trim()
     }
 
+    withName: SEQKIT_SEQ_FILTER {
+        ext.prefix = { "${meta.id}_cleaned.faa" }  // per-sample prefix built from meta.id
+        publishDir = [
+            path: { "${params.outdir}/protein_annotation/interproscan/" },
+            mode: params.publish_dir_mode,
+            enabled: { params.run_protein_annotation },  // NOTE(review): closure here, while INTERPROSCAN passes the plain param - confirm intended
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }  // versions.yml is never published
+        ]
+        ext.args   = [
+            "--gap-letters '* \t.' --remove-gaps"  // \t sits in a double-quoted string, so a tab character is passed
+        ].join(' ').trim()
+    }
+
+    withName: INTERPROSCAN_DATABASE {
+        publishDir = [
+            path: { "${params.outdir}/databases/interproscan/" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_db,  // published only when params.save_db is truthy
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }  // versions.yml is never published
+        ]
+    }
+
+    withName: INTERPROSCAN {
+        ext.prefix = { "${meta.id}_interproscan.faa" }  // per-sample prefix built from meta.id
+        publishDir = [
+            path: { "${params.outdir}/protein_annotation/interproscan/" },
+            mode: params.publish_dir_mode,
+            enabled: params.run_protein_annotation,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }  // versions.yml is never published
+        ]
+        ext.args   = [
+            "--applications ${params.protein_annotation_interproscan_applications}",
+            params.protein_annotation_interproscan_enableprecalc ? '' : '--disable-precalc',  // flag added only when precalc is NOT enabled
+            params.protein_annotation_interproscan_enableresidueannot ? '' : '--disable-residue-annot',  // flag added only when residue annotation is NOT enabled
+            params.protein_annotation_interproscan_disableresidueannottsv ? '' : '--enable-tsv-residue-annot',  // a "disable" param: the enable flag is added only when it is false
+            "--formats tsv"
+        ].join(' ').trim()
+    }
+
     withName: PROKKA {
         ext.prefix = { "${meta.id}_prokka" }
         publishDir = [
@@ -686,7 +725,7 @@ process {
 
     withName: AMP_DATABASE_DOWNLOAD {
         publishDir = [
-            path: { "${params.outdir}/databases/${params.amp_ampcombi_db}" },
+            path: { "${params.outdir}/databases/ampcombi/" },
             mode: params.publish_dir_mode,
             enabled: params.save_db,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },

diff --git a/docs/output.md b/docs/output.md
@@ -25,6 +25,8 @@ results/
 |   ├── prodigal/
 |   ├── prokka/
 |   └── pyrodigal/
+├── protein_annotation/
+|   └── interproscan/
 ├── amp/
 |   ├── ampir/
 |   ├── amplify/
@@ -74,6 +76,10 @@ ORF prediction and annotation with any of:
 - [Prokka](#prokka) – open reading frame prediction and functional protein annotation.
 - [Bakta](#bakta) – open reading frame prediction and functional protein annotation.
 
+CDS domain annotation:
+
+- [InterProScan](#interproscan) (default) – for open reading frame protein and domain predictions.
+
 Antimicrobial Resistance Genes (ARGs):
 
 - [ABRicate](#abricate) – antimicrobial resistance gene detection, based on alignment to one of several databases.
@@ -216,6 +222,23 @@ Output Summaries:
 
 [Bakta](https://github.com/oschwengers/bakta) is a tool for the rapid & standardised annotation of bacterial genomes and plasmids from both isolates and MAGs. It provides dbxref-rich, sORF-including and taxon-independent annotations in machine-readable JSON & bioinformatics standard file formats for automated downstream analysis. The output is used by some of the functional screening tools.
 
+### Protein annotation
+
+[InterProScan](#interproscan)
+
+#### InterProScan
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `interproscan/`
+  - `<samplename>_cleaned.faa`: clean version of the fasta files (in amino acid format) generated by one of the annotation tools (i.e. Pyrodigal, Prokka, Bakta). These contain sequences with no special characters (e.g. `*` or `-`).
+  - `<samplename>_interproscan_faa.tsv`: predicted proteins and domains using the InterPro database in TSV format
+
+</details>
+
+[InterProScan](https://github.com/ebi-pf-team/interproscan) is designed to predict protein functions and provide possible domain and motif information of the coding regions. It utilizes the InterPro database that consists of multiple sister databases such as PANTHER, ProSite, Pfam, etc. More details can be found in the [documentation](https://interproscan-docs.readthedocs.io/en/latest/index.html).
+
 ### AMP detection tools
 
 [ampir](#ampir), [AMPlify](#amplify), [hmmsearch](#hmmsearch), [Macrel](#macrel)
@@ -465,6 +488,11 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation
   - `<sample>/*_ampcombi.tsv`: summarised output in tsv format for each sample
   - `<sample>/*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample
   - `<sample>/*_mmseqs_matches.txt*`: alignment file generated by MMseqs2 for each sample
+
+:::info
+In some cases when the AMP and the taxonomic classification subworkflows are turned on, it can happen that only summary files per sample are created in the output folder with **no** `Ampcombi_summary.tsv` and `Ampcombi_summary_cluster.tsv` files with no taxonomic classifications merged. This can occur if some AMP prediction parameters are 'too strict' or only one AMP tool is run, which can lead to no AMP hits found in any of the samples or in only one sample. Look out for the warning `[nf-core/funcscan] AMPCOMBI2: 0/1 file passed. Skipping AMPCOMBI2_COMPLETE, AMPCOMBI2_CLUSTER, and TAXONOMY MERGING steps.` in the stdout or `.nextflow.log` file. In that case we recommend to lower the AMP prediction thresholds and run more than one AMP prediction tool.
+:::
+
   <summary>AMP summary table header descriptions using DRAMP as reference database</summary>
 
 | Table column              | Description                                                                                                                                                                                                                                                                                                                                                                                                                        |

diff --git a/docs/usage.md b/docs/usage.md
@@ -109,7 +109,7 @@ We highly recommend performing quality control on input contigs before running t
 For example, ideally BGC screening requires contigs of at least 3,000 bp else downstream tools may crash.
 :::
 
-## Notes on screening tools and taxonomic classification
+## Notes on screening tools, taxonomic and functional classifications
 
 The implementation of some tools in the pipeline may have some particular behaviours that you should be aware of before you run the pipeline.
 
@@ -131,6 +131,22 @@ MMseqs2 is currently the only taxonomic classification tool used in the pipeline
   --taxa_classification_mmseqs_db_id 'Kalamari'
   ```
 
+### InterProScan
+
+[InterProScan](https://github.com/ebi-pf-team/interproscan) is currently the only protein annotation tool that gives a snapshot of the protein families and domains for each coding region.
+
+The protein annotation workflow is activated with the flag `--run_protein_annotation`. InterProScan is used as the only protein annotation tool at the moment and the [InterPro database](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0) version 5.72-103.0 is downloaded and prepared to screen the input sequences against it.
+
+Since the database download is huge (5.5GB) and might take quite some time, you can skip the automatic database download on each run by manually downloading and extracting the files of any [InterPro version](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/) beforehand and providing the resulting directory path to `--protein_annotation_interproscan_db <path/to/interprodatabase>`.
+
+:::info
+By default, the databases used by InterProScan are set to `PANTHER,ProSiteProfiles,ProSitePatterns,Pfam`. Adding other applications to the list does not guarantee that the results will be integrated correctly within `AMPcombi`.
+:::
+
 ### antiSMASH
 
 antiSMASH has a minimum contig parameter, in which only contigs of a certain length (or longer) will be screened. In cases where no hits are found in these, the tool ends successfully without hits. However if no contigs in an input file reach that minimum threshold, the tool will end with a 'failure' code, and cause the pipeline to crash.
@@ -256,6 +272,22 @@ The pipeline will automatically run Pyrodigal instead of Prodigal if the paramet
 This is due to an incompatibility issue of Prodigal's output `.gbk` file with multiple downstream tools.
 :::
 
+:::tip
+
+- If `--run_protein_annotation` is activated, protein and domain classifications of the coding regions are generated and then used by the `ampcombi2/parsetables` module to create a table for every sample and the complete summary files e.g., `Ampcombi_summary.tsv`.
+
+In some cases when the AMP and the taxonomic classification subworkflows are turned on, it can happen that only summary files per sample are created in the output folder with **no** `Ampcombi_summary.tsv` and `Ampcombi_summary_cluster.tsv` files with no taxonomic classifications merged. This can occur if some AMP prediction parameters are 'too strict' or only one AMP tool is run, which can lead to no AMP hits found in any of the samples or in only one sample. Look out for the warning `[nf-core/funcscan] AMPCOMBI2: 0/1 file passed. Skipping AMPCOMBI2_COMPLETE, AMPCOMBI2_CLUSTER, and TAXONOMY MERGING steps.` in the stdout or `.nextflow.log` file. In that case we recommend to lower the AMP prediction thresholds and run more than one AMP prediction tool.
+  :::
+
 ### Abricate
 
 The default ABRicate installation comes with a series of 'default' databases:
@@ -499,14 +531,26 @@ The contents of the database directory should include directories such as `commo
 ```console
 deepbgc_db/
 ├── common
-  └── Pfam-hmm-models*.hmm.*
+A different version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db path/to/interproscan_db/`. The directory can be created with:
-A diifferent version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db path/to/interproscan_db/`. The directory can be created following with:
-A diifferent version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db path/to/interproscan_db/`. The directory can be created following with:
 └── <version-num>[0.1.0]
   ├── classifier
   | └── myClassifiers*.pkl
   └── detector
     └── myDetectors*.pkl
 ```
 
+### InterProScan
+
+[InterProScan](https://github.com/ebi-pf-team/interproscan) is used to provide more information about the proteins annotated on the contigs. By default, turning on this subworkflow with `--run_protein_annotation` and `--protein_annotation_tool InterProScan` will download and unzip the (as of now) latest [InterPro database](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.67-99.0/) v5.67-99.0. The downloaded database can be saved in the output directory `<output_directory>/databases/interproscan/*` if `--save_db` is turned on. Note: the download can take up to 4 hours depending on the bandwidth.
-[InterProScan](https://github.com/ebi-pf-team/interproscan) is used to provide more information about the proteins annotated on the contigs. By default, turning on this subworkflow with `--run_protein_annotation` and `--protein_annotation_tool InterProScan` will download and unzip the (as of now) latest [InterPro database](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.67-99.0/) v5.67-99.0. The database downloaded can be saved in the output directory `<output_directors>/databases/interproscan/*` if the `--save_db` is turned on. Note: the download can take upto 4 hours depending on teh bandwidth.
+[InterProScan](https://github.com/ebi-pf-team/interproscan) is used to provide more information about the proteins annotated on the contigs. By default, turning on this subworkflow with `--run_protein_annotation` will download and unzip the [InterPro database](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/) version 5.72-103.0. The database can be saved in the output directory `<output_directory>/databases/interproscan/` if `--save_db` is turned on. Note: the huge database download (5.5 GB) can take up to 4 hours depending on the bandwidth.
-[InterProScan](https://github.com/ebi-pf-team/interproscan) is used to provide more information about the proteins annotated on the contigs. By default, turning on this subworkflow with `--run_protein_annotation` and `--protein_annotation_tool InterProScan` will download and unzip the (as of now) latest [InterPro database](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.67-99.0/) v5.67-99.0. The database downloaded can be saved in the output directory `<output_directors>/databases/interproscan/*` if the `--save_db` is turned on. Note: the download can take upto 4 hours depending on teh bandwidth.
+[InterProScan](https://github.com/ebi-pf-team/interproscan) is used to provide more information about the proteins annotated on the contigs. By default, turning on this subworkflow with `--run_protein_annotation` will download and unzip the [InterPro database](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/) version 5.72-103.0. The database can be saved in the output directory `<output_directory>/databases/interproscan/` if `--save_db` is turned on. Note: the huge database download (5.5 GB) can take up to 4 hours depending on the bandwidth.
+
+A different version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db path/to/downloaded-untarred-interproscan_db-dir/`. The directory can be created following:
-A different version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db path/to/downloaded-untarred-interproscan_db-dir/`. The directory can be created following:
+A local version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db <path/to/downloaded-untarred-interproscan_db-dir/>`. The directory can be created by running (e.g. for database version 5.72-103.0):
-A different version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db path/to/downloaded-untarred-interproscan_db-dir/`. The directory can be created following:
+A local version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--protein_annotation_interproscan_db <path/to/downloaded-untarred-interproscan_db-dir/>`. The directory can be created by running (e.g. for database version 5.72-103.0):
+
+```bash
+mkdir -p interproscan_db/
+curl -L https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.67-99.0/interproscan-5.67-99.0-64-bit.tar.gz -o interproscan_db/interproscan-5.67-99.0-64-bit.tar.gz
+tar -xzf interproscan_db/interproscan-5.67-99.0-64-bit.tar.gz -C interproscan_db/
-curl -L https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.67-99.0/interproscan-5.67-99.0-64-bit.tar.gz -o interproscan_db/interproscan-5.67-99.0-64-bit.tar.gz
-tar -xzf interproscan_db/interproscan-5.67-99.0-64-bit.tar.gz -C interproscan_db/
+curl -L https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/interproscan-5.72-103.0-64-bit.tar.gz -o interproscan_db/interproscan-5.72-103.0-64-bit.tar.gz
+tar -xzf interproscan_db/interproscan-5.72-103.0-64-bit.tar.gz -C interproscan_db/
-curl -L https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.67-99.0/interproscan-5.67-99.0-64-bit.tar.gz -o interproscan_db/interproscan-5.67-99.0-64-bit.tar.gz
-tar -xzf interproscan_db/interproscan-5.67-99.0-64-bit.tar.gz -C interproscan_db/
+curl -L https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/interproscan-5.72-103.0-64-bit.tar.gz -o interproscan_db/interproscan-5.72-103.0-64-bit.tar.gz
+tar -xzf interproscan_db/interproscan-5.72-103.0-64-bit.tar.gz -C interproscan_db/
+
-
-
+```
+
-
+
+The contents of the database directory should include the directory `data` at the top level, containing several subdirectories:
+
+```console
+interproscan_db/
+└── data/
+  ├── antifam
+  ├── cdd
+  ├── funfam
+  ├── gene3d
+  ├── hamap
+  ├── ncbifam
+  ├── panther
+  | └── <version-num>[18.0]
+  ├── pfam
+  | └── <version-num>[36.0]
+  ├── phobius
+  ├── pirsf
+  ├── pirsr
+  ├── prints
+  ├── prosite
+  | └── <version-num>[2023_05]
+  ├── sfld
+  ├── smart
+  ├── superfamily
+  └── tmhmm
+```
-
+
+The contents of the database directory should include the directory `data` at the top level, containing several subdirectories:
+
+```console
+interproscan_db/
+└── data/
+  ├── antifam
+  ├── cdd
+  ├── funfam
+  ├── gene3d
+  ├── hamap
+  ├── ncbifam
+  ├── panther
+  | └── <version-num>[18.0]
+  ├── pfam
+  | └── <version-num>[36.0]
+  ├── phobius
+  ├── pirsf
+  ├── pirsr
+  ├── prints
+  ├── prosite
+  | └── <version-num>[2023_05]
+  ├── sfld
+  ├── smart
+  ├── superfamily
+  └── tmhmm
+```
+
 ## Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

diff --git a/modules.json b/modules.json
@@ -140,6 +140,11 @@
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
                         "installed_by": ["modules"]
                     },
+                    "interproscan": {
+                        "branch": "master",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+                        "installed_by": ["modules"]
+                    },
                     "macrel/contigs": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",

diff --git a/modules/local/interproscan_download.nf b/modules/local/interproscan_download.nf
@@ -0,0 +1,35 @@
+process INTERPROSCAN_DATABASE { // Downloads the archive at database_url with curl and unpacks it into interproscan_db/
+    tag "interproscan_database_download"
+    label 'process_medium'
+
+    conda "conda-forge::sed=4.7" // NOTE(review): pins sed only, while the script also calls curl and tar — confirm both are available when run with conda
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/curl:7.80.0' :
+        'biocontainers/curl:7.80.0' }"
+
+    input:
+    val database_url // URL of the gzipped tar archive; its basename is reused as the local file name
+
+    output:
+    path("interproscan_db/*")   , emit: db
+    path "versions.yml"         , emit: versions
-    path("interproscan_db/*")   , emit: db
-    path "versions.yml"         , emit: versions
+    path("interproscan_db/*"), emit: db
+    path "versions.yml", emit: versions
-    path("interproscan_db/*")   , emit: db
-    path "versions.yml"         , emit: versions
+    path("interproscan_db/*"), emit: db // everything under interproscan_db/; NOTE(review): the downloaded archive is saved there and never removed, so this glob presumably matches it too — confirm intended
+    path "versions.yml", emit: versions // tar and curl versions written by the heredoc at the end of the script
+
+    when:
+    task.ext.when == null || task.ext.when // runs unless task.ext.when is set to a falsy value
+
+    script: // fetch the archive, extract it in place, then record tool versions; NOTE(review): the tar version sed prints nothing unless the first line of `tar --version` contains 'tar (busybox) ' — confirm for non-busybox tar
+    """
+    mkdir -p interproscan_db/
+
+    filename=\$(basename ${database_url})
+
+    curl -L ${database_url} -o interproscan_db/\$filename
+    tar -xzf interproscan_db/\$filename -C interproscan_db/
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        tar: \$(tar --version 2>&1 | sed -n '1s/tar (busybox) //p')
+        curl: "\$(curl --version 2>&1 | sed -n '1s/^curl \\([0-9.]*\\).*/\\1/p')"
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/interproscan/environment.yml b/modules/nf-core/interproscan/environment.yml
diff --git a/modules/nf-core/interproscan/main.nf b/modules/nf-core/interproscan/main.nf