From 378bbc4a608873514ccbae1544950fcd017287b8 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 23 May 2024 11:31:26 -0400 Subject: [PATCH] print lint rule --- pyproject.toml | 1 - ...write_cached_reference_dataset_query_ht.py | 93 ------------------- v03_pipeline/lib/misc/sample_ids.py | 12 ++- 3 files changed, 8 insertions(+), 98 deletions(-) delete mode 100755 v03_pipeline/bin/write_cached_reference_dataset_query_ht.py diff --git a/pyproject.toml b/pyproject.toml index 59bec645f..adc5d947a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,6 @@ ignore = [ "FBT", # flake-boolean-trap... disallows boolean args to functions... fixing this code will require refactors. "ANN", # flake8-annotations is for typed code "DJ", # django specific - "T20", # forbids print, we print quite a bit "PYI", # pyi is typing stub files "PT", # pytest specific "PTH", # pathlib is preferred, but we're not using it yet diff --git a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py b/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py deleted file mode 100755 index 5bf5c956f..000000000 --- a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 -import argparse - -import hail as hl - -from v03_pipeline.lib.misc.io import write -from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, -) -from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, - valid_reference_dataset_collection_path, -) -from v03_pipeline.lib.reference_data.config import CONFIG -from v03_pipeline.lib.reference_data.dataset_table_operations import ( - import_ht_from_config_path, -) - - -def get_ht( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - query: CachedReferenceDatasetQuery, -) -> hl.Table: - # If the query is defined over an uncombined reference dataset, use the combiner config. - if query.query_raw_dataset: - config = CONFIG[query.dataset(dataset_type)][reference_genome.v02_value] - return import_ht_from_config_path( - config, - query.dataset(dataset_type), - reference_genome, - ) - return hl.read_table( - valid_reference_dataset_collection_path( - reference_genome, - dataset_type, - ReferenceDatasetCollection.COMBINED, - ), - ) - - -def run( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - query: CachedReferenceDatasetQuery, -): - ht = get_ht(dataset_type, reference_genome, query) - ht = query.query(ht, dataset_type=dataset_type, reference_genome=reference_genome) - destination_path = valid_cached_reference_dataset_query_path( - reference_genome, - dataset_type, - query, - ) - print(f'Uploading ht to {destination_path}') - write(ht, destination_path) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--reference-genome', - type=ReferenceGenome, - choices=list(ReferenceGenome), - default=ReferenceGenome.GRCh38, - ) - parser.add_argument( - '--dataset-type', - type=DatasetType, - choices=list(DatasetType), - default=None, - help='When used, update the passed dataset, otherwise run all datasets.', - ) - parser.add_argument( - '--query', - type=CachedReferenceDatasetQuery, - choices=list(CachedReferenceDatasetQuery), - required=True, - ) - args, _ = parser.parse_known_args() - if ( - args.query - and args.query - not in CachedReferenceDatasetQuery.for_reference_genome_dataset_type( - args.reference_genome, - args.dataset_type, - ) - ): - msg = f'{args.query} is not a valid query for {DatasetType}' - raise ValueError(msg) - run(args.dataset_type, args.reference_genome, args.query) diff --git a/v03_pipeline/lib/misc/sample_ids.py b/v03_pipeline/lib/misc/sample_ids.py index d2174fe50..ca5407b5e 100644 --- a/v03_pipeline/lib/misc/sample_ids.py +++ b/v03_pipeline/lib/misc/sample_ids.py @@ -2,6 +2,10 @@ import hail as hl +from v03_pipeline.lib.logger import get_logger + +logger = get_logger(__name__) + class MatrixTableSampleSetError(Exception): def __init__(self, message, missing_samples): @@ -42,7 +46,7 @@ def remap_sample_ids( f'All callset sample IDs:{mt.s.collect()}' ) if ignore_missing_samples_when_remapping: - print(message) + logger.info(message) else: raise MatrixTableSampleSetError(message, missing_samples) @@ -50,7 +54,7 @@ def remap_sample_ids( remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id) mt = mt.annotate_cols(seqr_id=remap_expr, vcf_id=mt.s) mt = mt.key_cols_by(s=mt.seqr_id) - print(f'Remapped {remap_count} sample ids...') + logger.info(f'Remapped {remap_count} sample ids...') return mt @@ -77,9 +81,9 @@ def subset_samples( if ( subset_count > anti_join_ht_count ) and ignore_missing_samples_when_subsetting: - print(message) + logger.info(message) else: raise MatrixTableSampleSetError(message, missing_samples) - print(f'Subsetted to {subset_count} sample ids') + logger.info(f'Subsetted to {subset_count} sample ids') mt = mt.semi_join_cols(sample_subset_ht) return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT)))