diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..973fc50 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,16 @@ +# Changelog + +## 1.0.0 (2022-04-20) + + +### Features + +* add polishing step ([d16c791](https://www.github.com/matinnuhamunada/genome_assembly_tryouts/commit/d16c791b54e0cd447ba533fa65d0b35d93eda9a0)) +* initial attempt of trycycler ([2ae19d4](https://www.github.com/matinnuhamunada/genome_assembly_tryouts/commit/2ae19d4003c09801c486748a6a09f32c2ecb3256)) +* split steps ([0eed51f](https://www.github.com/matinnuhamunada/genome_assembly_tryouts/commit/0eed51ffe6c938bd630d82791051b1daf9dbfd8b)) + + +### Bug Fixes + +* clean files ([796bc78](https://www.github.com/matinnuhamunada/genome_assembly_tryouts/commit/796bc78a7fc392b85cbbaee8ad85d17397be7e3b)) +* correct input folder name ([cbad160](https://www.github.com/matinnuhamunada/genome_assembly_tryouts/commit/cbad1602c3f119331428f3300cf5bb2dcf29cae1)) diff --git a/README.md b/README.md index 93090e0..bba22b1 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ [![Snakemake](https://img.shields.io/badge/snakemake-≥6.15.1-brightgreen.svg)](https://snakemake.github.io) This is an experimental snakemake workflow for trying out [trycylcer](https://github.com/rrwick/Trycycler). -It follow the author's instruction: https://github.com/rrwick/Trycycler/wiki +It follow the author's instruction: https://github.com/rrwick/Trycycler/wiki, where the assembly are split into different steps. +See [step 4](#step-4-executing-the-workflow) ## Usage ### Step 1: Clone the workflow @@ -56,14 +57,22 @@ Activate the conda environment: conda activate snakemake #### Part 1 - Generating Assemblies +This step generates multiple assemblies as described in: https://github.com/rrwick/Trycycler/wiki/Generating-assemblies snakemake --snakefile workflow/Snakefile-assembly --use-conda --cores +TO DO: Generate a bandage gfa graph for each assemblies for manual curation #### Part 2 - Clustering Contigs +This step clusters the assemblies into per-replicon groups as described in: https://github.com/rrwick/Trycycler/wiki/Clustering-contigs snakemake --snakefile workflow/Snakefile-cluster --use-conda --cores +This step also generate `data/interim/02_trycycler_cluster/cluster.yaml` which should be copied to the config folder in order to proceed to the next step. NOTE: You can select or drops the bad contigs or clusters that will ber run in the next step + +TO DO: Generate a tree image with necessary information (size, depth) for manual curation of the clusters #### Part 3 - Generating Consensus +This step summarizes step 3, 4, 5, and 6 in the Trycycler wiki and generate the consensus contig sequence as described in: https://github.com/rrwick/Trycycler/wiki/Generating-a-consensus + snakemake --snakefile workflow/Snakefile-consensus --use-conda --cores diff --git a/workflow/Snakefile-assembly b/workflow/Snakefile-assembly index 6b2cc95..c909533 100644 --- a/workflow/Snakefile-assembly +++ b/workflow/Snakefile-assembly @@ -3,7 +3,8 @@ include: "rules/common.smk" ##### Target rules ##### rule all: input: - expand('data/interim/01_trycycler_assembly/{strains}/nanopore/assemblies/assembly_{subsample}.fasta', subsample=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], strains=STRAINS) + expand('data/interim/01_trycycler_assembly/{strains}/nanopore/assemblies/assembly_{subsample}.fasta', subsample=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], strains=STRAINS), + expand('data/interim/01_trycycler_assembly/{strains}/nanopore/assemblies/assembly_{subsample}.png', subsample=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], strains=STRAINS) ##### Modules ##### include: "rules/01_trycycler_assembly.smk" \ No newline at end of file diff --git a/workflow/Snakefile-cluster b/workflow/Snakefile-cluster index fa80681..a7c5267 100644 --- a/workflow/Snakefile-cluster +++ b/workflow/Snakefile-cluster @@ -3,7 +3,8 @@ include: "rules/common.smk" ##### Target rules ##### rule all: input: - expand('data/interim/02_trycycler_cluster/cluster.yaml') + 'data/interim/02_trycycler_cluster/cluster.yaml', + expand('data/processed/{strains}/02_trycycler_cluster/{strains}_cluster.png', strains=STRAINS), ##### Modules ##### include: "rules/02_trycycler_cluster.smk" \ No newline at end of file diff --git a/workflow/envs/R.yaml b/workflow/envs/R.yaml new file mode 100644 index 0000000..fa31866 --- /dev/null +++ b/workflow/envs/R.yaml @@ -0,0 +1,16 @@ +name: R_env +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - r-base + - bioconductor-ggtree + - bioconductor-treeio + - bioconductor-ggtreeextra + - r-ggplot2 + - r-ape + - r-dplyr + - r-argparser + - r-aplot + - r-tidytree diff --git a/workflow/envs/utilities.yaml b/workflow/envs/utilities.yaml index ce92a9b..1cadf0c 100644 --- a/workflow/envs/utilities.yaml +++ b/workflow/envs/utilities.yaml @@ -12,6 +12,7 @@ dependencies: - minimap2 - miniasm - racon + - bandage - pip - pip: - git+https://github.com/rrwick/Minipolish.git \ No newline at end of file diff --git a/workflow/rules/01_trycycler_assembly.smk b/workflow/rules/01_trycycler_assembly.smk index 8fe2862..d288bc4 100644 --- a/workflow/rules/01_trycycler_assembly.smk +++ b/workflow/rules/01_trycycler_assembly.smk @@ -120,4 +120,18 @@ rule assemble_raven: shell: """ raven --threads {threads} --graphical-fragment-assembly {output.graph} --disable-checkpoints {input} > {output.assembly} 2>> {log} + """ + +rule draw_graph: + input: + graph = 'data/interim/01_trycycler_assembly/{strains}/nanopore/assemblies/assembly_{subsample}.gfa' + output: + graph = 'data/interim/01_trycycler_assembly/{strains}/nanopore/assemblies/assembly_{subsample}.png' + log: + "workflow/report/logs/01_trycycler_assembly/{strains}/bandage/{strains}_{subsample}.log" + conda: + "../envs/utilities.yaml" + shell: + """ + Bandage image {input.graph} {output.graph} &>> {log} """ \ No newline at end of file diff --git a/workflow/rules/02_trycycler_cluster.smk b/workflow/rules/02_trycycler_cluster.smk index 3059bd7..f0e6608 100644 --- a/workflow/rules/02_trycycler_cluster.smk +++ b/workflow/rules/02_trycycler_cluster.smk @@ -16,7 +16,7 @@ rule trycylcer_cluster: rule cluster_dump: input: - expand('data/interim/02_trycycler_cluster/{strains}', strains=STRAINS) + expand('data/interim/02_trycycler_cluster/{strains}', strains=STRAINS), output: yaml = 'data/interim/02_trycycler_cluster/cluster.yaml' log: @@ -41,4 +41,18 @@ rule cluster_dump: # write as yaml with open(output.yaml, 'w') as f: - yaml.dump(clusters, f, Dumper=yaml_indent_dump, default_flow_style=False) \ No newline at end of file + yaml.dump(clusters, f, Dumper=yaml_indent_dump, default_flow_style=False) + +rule cluster_draw: + input: + cluster='data/interim/02_trycycler_cluster/{strains}' + output: + png = 'data/processed/{strains}/02_trycycler_cluster/{strains}_cluster.png' + log: + "workflow/report/logs/02_trycycler_cluster/draw_cluster_{strains}.log" + conda: + "../envs/R.yaml" + shell: + """ + Rscript workflow/scripts/ggtree.R -i {input.cluster}/contigs.newick -o {output.png} 2>> {log} + """ \ No newline at end of file diff --git a/workflow/rules/03_trycycler_consensus.smk b/workflow/rules/03_trycycler_consensus.smk index 9d4bcd1..aca503a 100644 --- a/workflow/rules/03_trycycler_consensus.smk +++ b/workflow/rules/03_trycycler_consensus.smk @@ -5,7 +5,7 @@ rule trycylcer_intermediate: reconcile = 'data/interim/03_trycycler_consensus/{strains}/{cluster}_copy.log' threads: 12 log: - "workflow/report/logs/trycycler/trycylcer_intermediate-{cluster}-{strains}.log" + "workflow/report/logs/03_trycycler_consensus/trycycler_intermediate/trycycler_intermediate-{cluster}-{strains}.log" params: cluster_file = config["clusters"] shell: @@ -21,7 +21,7 @@ rule trycylcer_reconcile: reconcile = 'data/interim/03_trycycler_consensus/{strains}/{cluster}/2_all_seqs.fasta' threads: 12 log: - "workflow/report/logs/trycycler/trycylcer_reconcile-{cluster}-{strains}.log" + "workflow/report/logs/03_trycycler_consensus/trycylcer_reconcile-{cluster}-{strains}.log" conda: "../envs/trycycler.yaml" params: @@ -38,7 +38,7 @@ rule trycylcer_MSA: msa = 'data/interim/03_trycycler_consensus/{strains}/{cluster}/3_msa.fasta' threads: 12 log: - "workflow/report/logs/trycycler/trycylcer_MSA-{cluster}-{strains}.log" + "workflow/report/logs/03_trycycler_consensus/trycylcer_MSA-{cluster}-{strains}.log" conda: "../envs/trycycler.yaml" params: @@ -57,7 +57,7 @@ rule trycylcer_partition: partition = "data/interim/03_trycycler_consensus/{strains}/partition.log" threads: 12 log: - "workflow/report/logs/trycycler/trycylcer_partition_{strains}.log" + "workflow/report/logs/03_trycycler_consensus/trycylcer_partition_{strains}.log" conda: "../envs/trycycler.yaml" params: @@ -76,7 +76,7 @@ rule trycylcer_consensus: consensus = 'data/interim/03_trycycler_consensus/{strains}/{cluster}/7_final_consensus.fasta' threads: 12 log: - "workflow/report/logs/trycycler/trycylcer_consensus-{cluster}-{strains}.log" + "workflow/report/logs/03_trycycler_consensus/trycylcer_consensus-{cluster}-{strains}.log" conda: "../envs/trycycler.yaml" shell: @@ -86,7 +86,6 @@ rule trycylcer_consensus: rule medaka_polish: input: - cluster = "data/interim/03_trycycler_consensus/{strains}/{cluster}", consensus = 'data/interim/03_trycycler_consensus/{strains}/{cluster}/7_final_consensus.fasta' output: medaka = "data/interim/03_trycycler_consensus/{strains}/{cluster}/8_medaka.fasta" @@ -94,14 +93,15 @@ rule medaka_polish: "../envs/trycycler.yaml" threads: 12 log: - "workflow/report/logs/trycycler/medaka_polish-{cluster}-{strains}.log" + "workflow/report/logs/03_trycycler_consensus/medaka_polish-{cluster}-{strains}.log" params: - model = 'r941_min_sup_g507' + model = 'r941_min_sup_g507', + cluster = "data/interim/03_trycycler_consensus/{strains}/{cluster}", shell: """ - medaka_consensus -i {input.cluster}/4_reads.fastq -d {input.consensus} -o {input.cluster}/medaka -m {params.model} -t 12 2>> {log} - mv {input.cluster}/medaka/consensus.fasta {input.cluster}/8_medaka.fasta - #rm -r {input.cluster}/medaka {input.cluster}/*.fai {input.cluster}/*.mmi + medaka_consensus -i {params.cluster}/4_reads.fastq -d {input.consensus} -o {params.cluster}/medaka -m {params.model} -t 12 &>> {log} + mv {params.cluster}/medaka/consensus.fasta {params.cluster}/8_medaka.fasta + #rm -r {params.cluster}/medaka {params.cluster}/*.fai {params.cluster}/*.mmi """ rule trycylcer_concat: @@ -111,7 +111,7 @@ rule trycylcer_concat: assembly = 'data/interim/03_trycycler_consensus/{strains}/assembly.fasta' threads: 12 log: - "workflow/report/logs/trycycler/trycylcer_concat-{strains}.log" + "workflow/report/logs/03_trycycler_consensus/trycylcer_concat-{strains}.log" conda: "../envs/trycycler.yaml" shell: diff --git a/workflow/scripts/ggtree.R b/workflow/scripts/ggtree.R new file mode 100644 index 0000000..b140053 --- /dev/null +++ b/workflow/scripts/ggtree.R @@ -0,0 +1,90 @@ +#library("treedataverse") +library("argparser") +library("aplot") +#library("svglite") +library("ape") +library("dplyr") +library("ggplot2") +library("tidytree") +library("treeio") +library("ggtree") +library("ggtreeExtra") + +# Parse command line arguments +p <- arg_parser("Draw tree from trycycler cluster, with cluster names, sequence lengths, and read depths") + +p <- add_argument(p, "--input", + short = "-i", + help = "Newick file generated by trycycler", + default = "contigs.newick") + +p <- add_argument(p, "--output", + short = "-o", + default = "tree_cluster.png", + help = "Output image file (PNG)") + +argv <- parse_args(p) + + +# READFILE +newick <- read.newick(argv$input) + +# GENERATE EMPTY DATAFRAME +metadata <- data.frame(old_label = vector(mode = "character"), + label = vector(mode = "character"), + cluster_name = vector(mode = "character"), + length = vector(mode = "integer"), + depth = vector(mode = "numeric")) + +# EXTRACT DATA FROM TREE +label <- newick$tip.label +for (i in seq_along(label)) { + item <- sapply(strsplit(label[i],"_"), function(x){ + old_label <- label[i] + new_label <- paste(x[[2]], x[[3]], sep = "") + cluster_name <- paste(x[[1]], x[[2]], sep = "_") + length <- as.integer(x[length(x) - 2]) + tip_depth <- tail(x, n=1) + depth <- as.numeric(substring(tip_depth, 1, nchar(tip_depth) - 1)) + output <- c(old_label, new_label, cluster_name, length, depth) + }) + metadata[nrow(metadata) + 1, ] <- item +} +metadata <- transform(metadata, length = as.integer(length), + depth = as.integer(depth)) + +# RENAME TIP LABEL AND MERGE DATA INTO TREE +tree <- rename_taxa(newick, metadata, old_label, label) +drops <- c("old_label") +tree_data <- metadata[ , !(names(metadata) %in% drops)] +p <- ggtree(tree) %<+% tree_data + + +# DRAW TREE +g <- p + geom_tiplab(offset = .005, + hjust = 0.005, + align=TRUE, + size = 3.2) + + geom_tippoint(aes(shape = cluster_name, color = cluster_name)) + + theme(legend.position = "right") + + theme_tree2() + +# DRAW BUBBLES +p2 <- ggplot(tree_data, aes(x=0, y = label, label=length)) + + geom_point(aes(size = length, fill = depth), alpha = 0.9, shape = 21) + + geom_text(hjust=-0.5, vjust=0.5, size=2) + + scale_size(trans = "log10") + + scale_fill_gradient(low = "white", high = "blue") + + theme(axis.title = element_blank(), + axis.ticks.x = element_blank(), + axis.text = element_blank(), + panel.background=element_blank(), + panel.border=element_blank(), + panel.grid.major=element_blank(), + plot.background=element_blank(), + ) + +# CREATE COMPOSITE GRAPH +p2 %>% insert_left(g, width = 2) + +ggsave(argv$output, dpi = 300) \ No newline at end of file diff --git a/workflow/scripts/symlink_cluster.py b/workflow/scripts/symlink_cluster.py new file mode 100644 index 0000000..025926e --- /dev/null +++ b/workflow/scripts/symlink_cluster.py @@ -0,0 +1,37 @@ +from pathlib import Path +import yaml +import sys + +def get_clusters(filepath): + """ + Read the clusters output and return a dictionary + """ + try: + with open(filepath) as file: + selected_cluster = yaml.load(file, Loader=yaml.FullLoader) + return selected_cluster + except FileNotFoundError as e: + sys.stderr.write(f"No cluster selected. The file: <{filepath}> is not a valid cluster format. Check your config.yaml.\n") + raise e + +def symlink_cluster(strain, cluster, cluster_file, source_dir = 'data/interim/02_trycycler_cluster', target_dir = 'data/interim/03_trycycler_consensus'): + clusters = get_clusters(cluster_file) + source_path = Path(source_dir) + target_path = Path(target_dir) + + for contigs in clusters[strain][cluster]: + source = source_path / strain / cluster / "1_contigs" / f"{contigs}.fasta" + target = target_path / strain / cluster / "1_contigs" / f"{contigs}.fasta" + target.parent.mkdir(parents=True, exist_ok=True) + try: + target.symlink_to(source.resolve(), target_is_directory=False) + except: + if target.is_symlink(): + target.unlink(missing_ok=True) + target.symlink_to(source.resolve(), target_is_directory=False) + with open(str(target_path / strain / f"{cluster}_copy.log"), 'w') as f: + f.write("") + return + +if __name__ == "__main__": + symlink_cluster(sys.argv[1], sys.argv[2], sys.argv[3]) \ No newline at end of file