-
Notifications
You must be signed in to change notification settings - Fork 149
Description
This has been bothering me for a long time, so this morning I decided to take a stab at this.
The utils.py has grown to over 150 functions over the years and close to 5,000 loc :/ Having everything in one file obviously is a bad idea not only for performance but also maintainability of the codebase.
After 3 hours of work, this is a dictionary that I plan to use to carry function names to new modules, where a given function name (represented by the keys of the dict below) in anvio/utils.py, will become a function in a new module (represented by he values in the dict).
For instance, according to this, get_total_memory_usage function in anvio/utils.py would be a function in anvio/utils/system.py and so on.
My question is, does the function names to categories below make sense? Is it done well, or do you think we can refine it more? For instance, I hate anviohelp and misc categories. The first one is a bad name, the second is a useless category. Does anyone have any suggestions regarding those functions or should we go with this?
Once this is finalized I will use this dictionary to generate these new modules, and hopefully to write a migration script that can go through all files in the codebase to update import statements (I have no idea how to do that at this point, but I will document things here of course).
FUNCTION_TO_MODULE = {
'get_total_memory_usage': 'anvio.utils.system',
'display_top_memory_usage': 'anvio.utils.system',
'get_available_program_names_in_active_environment': 'anvio.utils.system',
'is_program_exists': 'anvio.utils.system',
'check_h5py_module': 'anvio.utils.system',
'rev_comp': 'anvio.utils.sequences',
'rev_comp_gene_calls_dict': 'anvio.utils.sequences',
'get_GC_content_for_sequence': 'anvio.utils.sequences',
'get_synonymous_and_non_synonymous_potential': 'anvio.utils.sequences',
'get_list_of_AAs_for_gene_call': 'anvio.utils.sequences',
'get_list_of_codons_for_gene_call': 'anvio.utils.sequences',
'get_translated_sequence_for_gene_call': 'anvio.utils.sequences',
'translate': 'anvio.utils.sequences',
'get_most_likely_translation_frame': 'anvio.utils.sequences',
'get_codon_order_to_nt_positions_dict': 'anvio.utils.sequences',
'nt_seq_to_nt_num_array': 'anvio.utils.sequences',
'nt_seq_to_RC_nt_num_array': 'anvio.utils.sequences',
'nt_seq_to_codon_num_array': 'anvio.utils.sequences',
'nt_seq_to_RC_codon_num_array': 'anvio.utils.sequences',
'_nt_seq_to_codon_num_array': 'anvio.utils.sequences',
'is_amino_acid_functionally_conserved': 'anvio.utils.sequences',
'convert_sequence_indexing': 'anvio.utils.sequences',
'summarize_alignment': 'anvio.utils.sequences',
'restore_alignment': 'anvio.utils.sequences',
'serialize_args': 'anvio.utils.commandline',
'format_cmdline': 'anvio.utils.commandline',
'run_command': 'anvio.utils.commandline',
'start_command': 'anvio.utils.commandline',
'run_command_STDIN': 'anvio.utils.commandline',
'get_command_output_from_shell': 'anvio.utils.commandline',
'get_cmd_line': 'anvio.utils.commandline',
'get_gene_caller_ids_from_args': 'anvio.utils.commandline',
'gzip_compress_file': 'anvio.utils.files',
'gzip_decompress_file': 'anvio.utils.files',
'tar_extract_file': 'anvio.utils.files',
'store_array_as_TAB_delimited_file': 'anvio.utils.files',
'store_dataframe_as_TAB_delimited_file': 'anvio.utils.files',
'store_dict_as_TAB_delimited_file': 'anvio.utils.files',
'get_column_data_from_TAB_delim_file': 'anvio.utils.files',
'get_columns_of_TAB_delim_file': 'anvio.utils.files',
'is_all_columns_present_in_TAB_delim_file': 'anvio.utils.files',
'transpose_tab_delimited_file': 'anvio.utils.files',
'get_vectors_from_TAB_delim_matrix': 'anvio.utils.files',
'get_TAB_delimited_file_as_dictionary': 'anvio.utils.files',
'get_BLAST_tabular_output_as_dict': 'anvio.utils.files',
'get_samples_txt_file_as_dict': 'anvio.utils.files',
'get_primers_txt_file_as_dict': 'anvio.utils.files',
'get_groups_txt_file_as_dict': 'anvio.utils.files',
'get_yaml_as_dict': 'anvio.utils.files',
'get_chunk': 'anvio.utils.files',
'concatenate_files': 'anvio.utils.files',
'get_file_md5': 'anvio.utils.files',
'split_fasta': 'anvio.utils.fasta',
'get_num_sequences_in_fasta': 'anvio.utils.fasta',
'get_all_ids_from_fasta': 'anvio.utils.fasta',
'check_fasta_id_formatting': 'anvio.utils.fasta',
'check_fasta_id_uniqueness': 'anvio.utils.fasta',
'get_read_lengths_from_fasta': 'anvio.utils.fasta',
'remove_sequences_with_only_gaps_from_fasta': 'anvio.utils.fasta',
'get_FASTA_file_as_dictionary': 'anvio.utils.fasta',
'get_GC_content_for_FASTA_entries': 'anvio.utils.fasta',
'unique_FASTA_file': 'anvio.utils.fasta',
'store_dict_as_FASTA_file': 'anvio.utils.fasta',
'export_sequences_from_contigs_db': 'anvio.utils.fasta',
'create_fasta_dir_from_sequence_sources': 'anvio.utils.fasta',
'get_db_type_and_variant': 'anvio.utils.database',
'get_db_type': 'anvio.utils.database',
'get_db_variant': 'anvio.utils.database',
'get_required_version_for_db': 'anvio.utils.database',
'get_all_sample_names_from_the_database': 'anvio.utils.database',
'get_all_item_names_from_the_database': 'anvio.utils.database',
'get_bams_and_profiles_txt_as_data': 'anvio.utils.database',
'get_genes_database_path_for_bin': 'anvio.utils.database',
'is_contigs_db': 'anvio.dbinfo',
'is_trnaseq_db': 'anvio.dbinfo',
'is_pan_or_profile_db': 'anvio.dbinfo',
'is_profile_db': 'anvio.dbinfo',
'is_structure_db': 'anvio.dbinfo',
'is_blank_profile': 'anvio.dbinfo',
'is_pan_db': 'anvio.dbinfo',
'is_genome_storage': 'anvio.dbinfo',
'is_genes_db': 'anvio.dbinfo',
'is_kegg_modules_db': 'anvio.dbinfo',
'is_profile_db_merged': 'anvio.dbinfo',
'is_profile_db_and_contigs_db_compatible': 'anvio.dbinfo',
'is_structure_db_and_contigs_db_compatible': 'anvio.dbinfo',
'is_pan_db_and_genomes_storage_db_compatible': 'anvio.dbinfo',
'check_sample_id': 'anvio.utils.validation',
'check_collection_name': 'anvio.utils.validation',
'is_this_name_OK_for_database': 'anvio.utils.validation',
'check_contig_names': 'anvio.utils.validation',
'check_misc_data_keys_for_format': 'anvio.utils.validation',
'is_gene_caller_id': 'anvio.utils.validation',
'is_gene_sequence_clean': 'anvio.utils.validation',
'is_ascii_only': 'anvio.utils.validation',
'get_port_num': 'anvio.utils.network',
'get_next_available_port_num': 'anvio.utils.network',
'is_port_in_use': 'anvio.utils.network',
'download_file': 'anvio.utils.network',
'get_remote_file_content': 'anvio.utils.network',
'get_anvio_news': 'anvio.utils.network',
'download_protein_structure': 'anvio.utils.network',
'open_url_in_browser': 'anvio.utils.network',
'get_predicted_type_of_items_in_a_dict': 'anvio.utils.misc',
'HTMLColorToRGB': 'anvio.utils.misc',
'get_random_colors_dict': 'anvio.utils.misc',
'get_ordinal_from_integer': 'anvio.utils.misc',
'get_f_string_evaluated_by_dict': 'anvio.utils.misc',
'get_time_to_date': 'anvio.utils.misc',
'get_hash_for_list': 'anvio.utils.misc',
'get_filtered_dict': 'anvio.utils.misc',
'convert_numpy_array_to_binary_blob': 'anvio.utils.algorithms',
'convert_binary_blob_to_numpy_array': 'anvio.utils.algorithms',
'add_to_2D_numeric_array': 'anvio.utils.algorithms',
'apply_and_concat': 'anvio.utils.algorithms',
'get_indices_for_outlier_values': 'anvio.utils.algorithms',
'get_list_of_outliers': 'anvio.utils.algorithms',
'get_stretches_for_numbers_list': 'anvio.utils.algorithms',
'merge_stretches': 'anvio.utils.algorithms',
'get_constant_value_blocks': 'anvio.utils.algorithms',
'find_value_index': 'anvio.utils.algorithms',
'get_N50': 'anvio.utils.algorithms',
'split_by_delim_not_within_parens': 'anvio.utils.algorithms',
'get_split_start_stops': 'anvio.utils.algorithms',
'get_split_start_stops_with_gene_calls': 'anvio.utils.algorithms',
'get_split_start_stops_without_gene_calls': 'anvio.utils.algorithms',
'human_readable_file_size': 'anvio.utils.algorithms',
'anvio_hmm_target_term_to_alphabet_and_context': 'anvio.utils.hmm',
'get_pruned_HMM_hits_dict': 'anvio.utils.hmm',
'get_HMM_sources_dictionary': 'anvio.utils.hmm',
'get_attribute_from_hmm_file': 'anvio.utils.hmm',
'sanity_check_hmm_model': 'anvio.utils.hmm',
'sanity_check_pfam_accessions': 'anvio.utils.hmm',
'run_functional_enrichment_stats': 'anvio.utils.statistics',
'get_enriched_groups': 'anvio.utils.statistics',
'get_required_packages_for_enrichment_test': 'anvio.utils.statistics',
'check_R_packages_are_installed': 'anvio.utils.statistics',
'get_values_of_gene_level_coverage_stats_as_dict': 'anvio.utils.statistics',
'get_names_order_from_newick_tree': 'anvio.utils.phylogenetics',
'gen_NEXUS_format_partition_file_for_phylogenomics': 'anvio.utils.phylogenetics',
'ununique_BLAST_tabular_output': 'anvio.utils.anviohelp',
'convert_SSM_to_single_accession': 'anvio.utils.anviohelp',
'get_default_gene_caller': 'anvio.utils.anviohelp',
'get_split_and_contig_names_of_interest': 'anvio.utils.anviohelp',
'get_contigs_splits_dict': 'anvio.utils.anviohelp',
'get_variabile_item_frequencies': 'anvio.utils.anviohelp',
'get_consensus_and_departure_data': 'anvio.utils.anviohelp',
'get_bin_name_from_item_name': 'anvio.utils.anviohelp',
'get_contig_name_to_splits_dict': 'anvio.utils.anviohelp',
'get_variability_table_engine_type': 'anvio.utils.anviohelp',
'CoverageStats': 'anvio.utils.anviohelp',
'gen_gexf_network_file': 'anvio.utils.visualization',
'run_selenium_and_export_svg': 'anvio.utils.visualization',
'compare_times': 'anvio.utils.debug',
'is_all_npm_packages_installed': 'anvio.utils.debug',
}(latest version in #2465 may be different from this)