| 
 | 1 | +#!/usr/bin/env python  | 
 | 2 | + | 
 | 3 | +# count_frequence_busco_family.py  | 
 | 4 | +# 2020 Jamie McGowan <jamie.mcgowan@mu.ie>  | 
 | 5 | +#   | 
 | 6 | +# Reports how many species a BUSCO family was found to be single copy in  | 
 | 7 | +# ONLY LOOKS AT SINGLE COPIES, IGNORES IF PRESENT AS MULTI COPY  | 
 | 8 | +# Usage: python count_frequency_busco_family.py busco_working_directory  | 
 | 9 | + | 
 | 10 | +import os, sys  | 
 | 11 | +	  | 
 | 12 | +if len(sys.argv) < 2:  | 
 | 13 | +	print("Usage: python count_frequency_busco_family.py busco_working_directory")  | 
 | 14 | +	sys.exit(0)  | 
 | 15 | + | 
 | 16 | +working_directory = os.path.abspath(sys.argv[1])  | 
 | 17 | + | 
 | 18 | +busco_dirs = []  | 
 | 19 | + | 
 | 20 | +for item in os.listdir("."):  | 
 | 21 | +	if item[0:4] == "run_":  | 
 | 22 | +		if os.path.isdir(item):  | 
 | 23 | +			busco_dirs.append(item)  | 
 | 24 | + | 
 | 25 | +n_busco_runs = len(busco_dirs)  | 
 | 26 | + | 
 | 27 | +print("Found " + str(n_busco_runs) + " BUSCO runs")  | 
 | 28 | +print("")  | 
 | 29 | +print("BUSCO Run\t#Single Copy BUSCOs")  | 
 | 30 | + | 
 | 31 | +all_species = []  | 
 | 32 | +all_buscos = set()  | 
 | 33 | +busco_per_species = {}  | 
 | 34 | + | 
 | 35 | +for directory in busco_dirs:  | 
 | 36 | +	os.chdir(working_directory)  | 
 | 37 | +	species = directory.split("run_")[1]  | 
 | 38 | +	all_species.append(species)  | 
 | 39 | +	buscos = []  | 
 | 40 | + | 
 | 41 | +	os.chdir(directory)  | 
 | 42 | +	os.chdir("single_copy_busco_sequences")  | 
 | 43 | + | 
 | 44 | + | 
 | 45 | +	for busco in os.listdir("."):  | 
 | 46 | +		if busco.endswith(".faa"):  | 
 | 47 | +			busco_name = busco[0:len(busco) - 4]  | 
 | 48 | +			buscos.append(busco_name)  | 
 | 49 | +			all_buscos.add(busco_name)  | 
 | 50 | + | 
 | 51 | +	print(species + "\t" + str(len(buscos)))  | 
 | 52 | +	busco_per_species[species] = buscos  | 
 | 53 | + | 
 | 54 | +print("\n\n")  | 
 | 55 | + | 
 | 56 | +all_buscos_count = {}  | 
 | 57 | +for busco in all_buscos:  | 
 | 58 | +	all_buscos_count[busco] = 0  | 
 | 59 | +	  | 
 | 60 | +	for species in all_species:  | 
 | 61 | +		if busco in busco_per_species[species]:  | 
 | 62 | +			all_buscos_count[busco] += 1  | 
 | 63 | + | 
 | 64 | +print("BUSCO\t#Species\t%Species")  | 
 | 65 | +for busco in all_buscos:  | 
 | 66 | +	percent = ((all_buscos_count[busco] / float(n_busco_runs)) * 100)  | 
 | 67 | +	percent = "{:.2f}".format(percent)  | 
 | 68 | +	print(busco + "\t" + str(all_buscos_count[busco]) + '\t' + percent)  | 
 | 69 | + | 
 | 70 | + | 
 | 71 | +print("\n\n")  | 
 | 72 | + | 
 | 73 | +# Print presence/absence matrix of all found buscos per species  | 
 | 74 | +line = ["BUSCO"] + all_species  | 
 | 75 | +print ("\t".join(line))  | 
 | 76 | + | 
 | 77 | +for busco in all_buscos:  | 
 | 78 | +	line = [busco]  | 
 | 79 | +	for species in all_species:  | 
 | 80 | +		if busco in busco_per_species[species]:  | 
 | 81 | +			line.append("Y")  | 
 | 82 | +		else:  | 
 | 83 | +			line.append("N")  | 
 | 84 | + | 
 | 85 | +	print ("\t".join(line))  | 
 | 86 | + | 
0 commit comments