diff --git a/MonoPhylo.py b/MonoPhylo.py index a6b1765..bbc1ebc 100644 --- a/MonoPhylo.py +++ b/MonoPhylo.py @@ -8,27 +8,83 @@ def get_args(): """Get arguments from CLI""" parser = argparse.ArgumentParser( description="""------------------------------------------------------------------------------ - MonoPhylo: A tool for examining phylogenetic relationships in newick tree files. + MonoPhylo: A tool for examining phylogenetic relationships in newick tree files. For + usage instructions, please see documentation available at: https://github.com/dportik/MonoPhylo. ------------------------------------------------------------------------------""") - parser.add_argument("-t", "--tree", required=True, help="REQUIRED: The full path to a newick tree file.") - parser.add_argument("-o", "--out_dir", required=True, help="REQUIRED: The full path to an existing directory to write output files.") - parser.add_argument("-m", "--map", required=False, default=None, help="OPTIONAL: The full path to a map file containing tip names and grouping schemes for examining monophyly.") - parser.add_argument("--write_tips", required=False, action='store_true', help="OPTIONAL: Obtain list of tips in tree and write to output file.") - parser.add_argument("--genus", required=False, action='store_true', help="OPTIONAL: Obtain genus names from tips in tree. Requires tips to be labeled in GENUS_SPECIES format.") - parser.add_argument("--support", required=False, action='store_true', help="OPTIONAL: Obtain support values for monophyletic groupings - requires a tree with support values present.") - parser.add_argument("--root", required=False, action='store_true', help="OPTIONAL: Root tree using MRCA of tree tip labels specified using tip flags (minimally -tip1 and -tip2).") - parser.add_argument("--tip1", required=False, default=None, help="Required for --root: The name of the first tree tip.") - parser.add_argument("--tip2", required=False, default=None, help="Required for --root: The name of the second tree tip.") - parser.add_argument("--tip3", required=False, default=None, help="Optional for --root: The name of a third tree tip.") - parser.add_argument("--tip4", required=False, default=None, help="Optional for --root: The name of a fourth tree tip.") - parser.add_argument("--tip5", required=False, default=None, help="Optional for --root: The name of a fifth tree tip.") - parser.add_argument("--write_root", required=False, action='store_true', help="Optional for --root: Write the rooted tree to an output file (newick format).") + parser.add_argument("-t", "--tree", + required=True, + help="REQUIRED: The full path to a newick tree file.") + + parser.add_argument("-o", "--out_dir", + required=True, + help="REQUIRED: The full path to an existing directory to write " + "output files.") + + parser.add_argument("-m", "--map", + required=False, + default=None, + help="OPTIONAL: The full path to a map file containing tip names " + "and grouping schemes for examining monophyly.") + + parser.add_argument("--write_tips", + required=False, + action='store_true', + help="OPTIONAL: Obtain list of tips in tree and write to output file.") + + parser.add_argument("--genus", + required=False, + action='store_true', + help="OPTIONAL: Obtain genus names from tips in tree. Requires " + "tips to be labeled in GENUS_SPECIES format.") + + parser.add_argument("--support", + required=False, + action='store_true', + help="OPTIONAL: Obtain support values for monophyletic groupings " + "- requires a tree with support values present.") + + parser.add_argument("--root", + required=False, + action='store_true', + help="OPTIONAL: Root tree using MRCA of tree tip labels specified " + "using tip flags (minimally -tip1 and -tip2).") + + parser.add_argument("--tip1", + required=False, + default=None, + help="Required for --root: The name of the first tree tip.") + + parser.add_argument("--tip2", + required=False, + default=None, + help="Required for --root: The name of the second tree tip.") + + parser.add_argument("--tip3", + required=False, + default=None, + help="Optional for --root: The name of a third tree tip.") + + parser.add_argument("--tip4", + required=False, + default=None, + help="Optional for --root: The name of a fourth tree tip.") + + parser.add_argument("--tip5", + required=False, + default=None, + help="Optional for --root: The name of a fifth tree tip.") + + parser.add_argument("--write_root", + required=False, + action='store_true', + help="Optional for --root: Write the rooted tree to an output file (newick format).") + return parser.parse_args() def read_tree(t_file): - ''' + """ Read tree file, raise errors if not formatted correctly. - ''' + """ try: tree = Tree(t_file) except: @@ -41,23 +97,25 @@ def read_tree(t_file): return tree def get_taxa(tree): - ''' + """ Get a list of tips/taxa in the tree, sorted alphabetically. - ''' + """ taxa = sorted([leaf.name for leaf in tree]) - print "\n\nFound {} tips in tree.\n".format(len(taxa)) + print("\n\nFound {} tips in tree.\n".format(len(taxa))) return taxa def root_tree(tree, tiplist, taxa): - ''' + """ Root tree using the taxa list provided. Check to ensure taxa are actually valid tips first. - ''' + """ for t in tiplist: if t not in taxa: - raise NameError("\n\n{} does not match any tip found in this tree.\n\nTry writing tips to file and obtaining names directly from file.\n\n".format(t)) - print "\nUsing tips {} to root tree.\n".format(tiplist) + raise NameError("\n\n{} does not match any tip found in this tree.\n\n" + "Try writing tips to file and obtaining names directly " + "from file.\n\n".format(t)) + print("\nUsing tips {} to root tree.\n".format(tiplist)) ancestor = tree.get_common_ancestor(tiplist) #print ancestor tree.set_outgroup(ancestor) @@ -65,24 +123,23 @@ def root_tree(tree, tiplist, taxa): def root_writer(infile, tree, out_dir): outname = "Rooted_{}".format(infile.split('/')[-1]) tree.write(format=0, outfile=outname) - print "\nWrote rooted tree file '{1}' to directory: {0}.\n".format(out_dir, outname) - + print("\nWrote rooted tree file '{1}' to directory: {0}.\n".format(out_dir, outname)) def get_genera(taxa): - ''' + """ If names follow Genus_species, obtain all unique Genus components, return as list sorted alphabetically. - ''' + """ genera = sorted(set([t.split('_')[0] for t in taxa])) - print "\nFound {} genera across tree.\n".format(len(genera)) + print("\nFound {} genera across tree.\n".format(len(genera))) return genera def genus_dict(taxa, genera): - ''' + """ Create a dictionary where each genus is a key and a list of contained species are the corresponding value. - ''' + """ gdicts = {} for g in genera: taxa_list = [t for t in taxa if t.split('_')[0] == g] @@ -90,11 +147,11 @@ def genus_dict(taxa, genera): return gdicts def write_map_file_genus(gdicts): - ''' + """ Write the lists of genera and species labels to corresponding output files. - ''' - print "\nWriting genus and species labels to Species_List.txt in the output directory.\n" + """ + print("\nWriting genus and species labels to Species_List.txt in the output directory.\n") genera = sorted(gdicts.keys()) with open('Species_List.txt', 'a') as fh_out: fh_out.write("{}\t{}\n".format("Species","Genus")) @@ -102,42 +159,42 @@ def write_map_file_genus(gdicts): for species in gdicts[g]: #print species, g fh_out.write("{}\t{}\n".format(species,g)) - print "\nWriting genus labels to Genus_List.txt in the output directory.\n" + print("\nWriting genus labels to Genus_List.txt in the output directory.\n") with open('Genus_List.txt', 'a') as fh_out: fh_out.write("{}\n".format("Genus")) for g in genera: fh_out.write("{}\n".format(g)) def write_map_file_tip(taxa): - ''' + """ Write list of tips to output file (used if --genus not supplied). - ''' - print "\nWriting {} tip labels to Tip_List.txt in the output directory.\n".format(len(taxa)) + """ + print("\nWriting {} tip labels to Tip_List.txt in the output directory.\n".format(len(taxa))) with open('Tip_List.txt', 'a') as fh_out: fh_out.write("{}\n".format("Tip")) for t in taxa: fh_out.write("{}\n".format(t)) def groups_to_dicts(contents, groups, i, taxa): - ''' + """ Return a dictionary where group labels are keys and taxa lists are values. - ''' + """ gdict = {} for g in groups: taxa_list = [c[0] for c in contents[1:] if c[i] == g and c[0] in taxa] if taxa_list: gdict[g] = taxa_list - print "\tSubgroup {} contains {} taxa.".format(g, len(taxa_list)) + print("\tSubgroup {} contains {} taxa.".format(g, len(taxa_list))) return gdict def parse_mapfile(map, taxa): - ''' + """ For each grouping, create a list containing: [Group Label, Group Dictionary] Place all in one list:[ [Group1 Label, Group1 Dictionary], [Group2 Label, Group2 Dictionary], etc.] - ''' - print "\nExamining tips and groupings in map file.\n" + """ + print("\nExamining tips and groupings in map file.\n") with open(map, 'r') as fh: contents = [l.strip().split('\t') for l in fh if l.strip()] @@ -152,10 +209,10 @@ def parse_mapfile(map, taxa): return task_list def test_monophyly(label, dicts, tree): - ''' + """ Where d is a dictionary with group labels are keys and corresponding lists of species/tips are values. - ''' + """ outname = "{0}_{1}_results.txt".format("Group", label) log = "Summary.log" @@ -163,10 +220,13 @@ def test_monophyly(label, dicts, tree): mono_count = int(0) skip_count = int(0) with open(log, 'a') as fh_log: - print "\n\nExamining {}:".format(label) + print("\n\nExamining {}:".format(label)) fh_log.write("Examining {}:\n".format(label)) with open(outname, 'a') as fh_out: - fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format("Grouping","Number_Contained_Taxa","Monophyletic","Category","Number_Interfering_Species","Interfering_Species")) + fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format("Grouping", "Number_Contained_Taxa", + "Monophyletic", "Category", + "Number_Interfering_Species", + "Interfering_Species")) for g in groupings: results = tree.check_monophyly(values = dicts[g], target_attr="name", unrooted = True) names = sorted([r.name for r in results[2]]) @@ -174,29 +234,37 @@ def test_monophyly(label, dicts, tree): if len(dicts[g]) > 1: truth = results[0] mono = results[1].capitalize() - print "\t{0} is {1}".format(g, mono) + print("\t{0} is {1}".format(g, mono)) fh_log.write("\t{0} is {1}\n".format(g, mono)) else: truth = "NA" mono = "NA" - print "\t{0} contains only 1 taxon".format(g) + print("\t{0} contains only 1 taxon".format(g)) fh_log.write("\t{0} contains only 1 taxon\n".format(g)) - fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(g, len(dicts[g]), truth, mono, len(names), ", ".join(names))) + fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(g, len(dicts[g]), truth, mono, + len(names), ", ".join(names))) if mono == "Monophyletic": mono_count += 1 elif mono == "NA": skip_count += 1 - print "Found {0} monophyletic groups out of {1} testable groupings.\nOf {2} total groupings, {3} contained a single taxon and were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), len(groupings), skip_count) - fh_log.write("Found {0} monophyletic groups out of {1} testable groupings.\nOf {2} total groupings, {3} contained a single taxon and were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), len(groupings), skip_count)) + print("Found {0} monophyletic groups out of {1} testable groupings." + "\nOf {2} total groupings, {3} contained a single taxon " + "and were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), + len(groupings), skip_count)) + + fh_log.write("Found {0} monophyletic groups out of {1} testable groupings." + "\nOf {2} total groupings, {3} contained a single taxon " + "and were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), + len(groupings), skip_count)) def test_monophyly_support(label, dicts, tree): - ''' + """ Where d is a dictionary with group labels are keys and corresponding lists of species/tips are values. - ''' + """ outname = "{0}_{1}_results.txt".format("Group", label) log = "Summary.log" @@ -205,10 +273,12 @@ def test_monophyly_support(label, dicts, tree): skip_count = int(0) with open(log, 'a') as fh_log: - print "\n\nExamining {}:".format(label) + print("\n\nExamining {}:".format(label)) fh_log.write("Examining {}:\n".format(label)) with open(outname, 'a') as fh_out: - fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format("Grouping","Number_Contained_Taxa","Monophyletic","Category","Support","Number_Interfering_Species","Interfering_Species")) + fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format("Grouping", "Number_Contained_Taxa", + "Monophyletic", "Category", "Support", + "Number_Interfering_Species", "Interfering_Species")) for g in groupings: results = tree.check_monophyly(values = dicts[g], target_attr="name", unrooted = True) names = sorted([r.name for r in results[2]]) @@ -216,12 +286,12 @@ def test_monophyly_support(label, dicts, tree): if len(dicts[g]) > 1: truth = results[0] mono = results[1].capitalize() - print "\t{0} is {1}".format(g, mono) + print("\t{0} is {1}".format(g, mono)) fh_log.write("\t{0} is {1}\n".format(g, mono)) else: truth = "NA" mono = "NA" - print "\t{0} contains only 1 taxon".format(g) + print("\t{0} contains only 1 taxon".format(g)) fh_log.write("\t{0} contains only 1 taxon\n".format(g)) if results[1] == "monophyletic" and len(dicts[g]) > 1: @@ -230,15 +300,23 @@ def test_monophyly_support(label, dicts, tree): else: support = "NA" - fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(g, len(dicts[g]), truth, mono, support, len(names), ", ".join(names))) + fh_out.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(g, len(dicts[g]), truth, mono, + support, len(names), ", ".join(names))) if mono == "Monophyletic": mono_count += 1 elif mono == "NA": skip_count += 1 - print "Found {0} monophyletic groups out of {1} testable groupings.\nOf {2} total groupings, {3} contained a single taxon and were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), len(groupings), skip_count) - fh_log.write("Found {0} monophyletic groups out of {1} testable groupings.\nOf {2} total groupings, {3} contained a single taxon and were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), len(groupings), skip_count)) + print("Found {0} monophyletic groups out of {1} testable groupings." + "\nOf {2} total groupings, {3} contained a single taxon and " + "were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), + len(groupings), skip_count)) + + fh_log.write("Found {0} monophyletic groups out of {1} testable groupings." + "\nOf {2} total groupings, {3} contained a single taxon and " + "were ignored.\n\n".format(mono_count, (len(groupings)-skip_count), + len(groupings), skip_count)) def main(): args = get_args()