From 378bbc4a608873514ccbae1544950fcd017287b8 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Thu, 23 May 2024 11:31:26 -0400
Subject: [PATCH] Enable print lint rule (T20); replace print with logger

---
 pyproject.toml                                |  1 -
 ...write_cached_reference_dataset_query_ht.py | 93 -------------------
 v03_pipeline/lib/misc/sample_ids.py           | 12 ++-
 3 files changed, 8 insertions(+), 98 deletions(-)
 delete mode 100755 v03_pipeline/bin/write_cached_reference_dataset_query_ht.py

diff --git a/pyproject.toml b/pyproject.toml
index 59bec645f..adc5d947a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,6 @@ ignore = [
     "FBT",   # flake-boolean-trap... disallows boolean args to functions... fixing this code will require refactors.
     "ANN",   # flake8-annotations is for typed code
     "DJ",    # django specific
-    "T20",   # forbids print, we print quite a bit
     "PYI",   # pyi is typing stub files
     "PT",    # pytest specific
     "PTH",   # pathlib is preferred, but we're not using it yet
diff --git a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py b/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py
deleted file mode 100755
index 5bf5c956f..000000000
--- a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-
-import hail as hl
-
-from v03_pipeline.lib.misc.io import write
-from v03_pipeline.lib.model import (
-    CachedReferenceDatasetQuery,
-    DatasetType,
-    ReferenceDatasetCollection,
-    ReferenceGenome,
-)
-from v03_pipeline.lib.paths import (
-    valid_cached_reference_dataset_query_path,
-    valid_reference_dataset_collection_path,
-)
-from v03_pipeline.lib.reference_data.config import CONFIG
-from v03_pipeline.lib.reference_data.dataset_table_operations import (
-    import_ht_from_config_path,
-)
-
-
-def get_ht(
-    dataset_type: DatasetType,
-    reference_genome: ReferenceGenome,
-    query: CachedReferenceDatasetQuery,
-) -> hl.Table:
-    # If the query is defined over an uncombined reference dataset, use the combiner config.
-    if query.query_raw_dataset:
-        config = CONFIG[query.dataset(dataset_type)][reference_genome.v02_value]
-        return import_ht_from_config_path(
-            config,
-            query.dataset(dataset_type),
-            reference_genome,
-        )
-    return hl.read_table(
-        valid_reference_dataset_collection_path(
-            reference_genome,
-            dataset_type,
-            ReferenceDatasetCollection.COMBINED,
-        ),
-    )
-
-
-def run(
-    dataset_type: DatasetType,
-    reference_genome: ReferenceGenome,
-    query: CachedReferenceDatasetQuery,
-):
-    ht = get_ht(dataset_type, reference_genome, query)
-    ht = query.query(ht, dataset_type=dataset_type, reference_genome=reference_genome)
-    destination_path = valid_cached_reference_dataset_query_path(
-        reference_genome,
-        dataset_type,
-        query,
-    )
-    print(f'Uploading ht to {destination_path}')
-    write(ht, destination_path)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--reference-genome',
-        type=ReferenceGenome,
-        choices=list(ReferenceGenome),
-        default=ReferenceGenome.GRCh38,
-    )
-    parser.add_argument(
-        '--dataset-type',
-        type=DatasetType,
-        choices=list(DatasetType),
-        default=None,
-        help='When used, update the passed dataset, otherwise run all datasets.',
-    )
-    parser.add_argument(
-        '--query',
-        type=CachedReferenceDatasetQuery,
-        choices=list(CachedReferenceDatasetQuery),
-        required=True,
-    )
-    args, _ = parser.parse_known_args()
-    if (
-        args.query
-        and args.query
-        not in CachedReferenceDatasetQuery.for_reference_genome_dataset_type(
-            args.reference_genome,
-            args.dataset_type,
-        )
-    ):
-        msg = f'{args.query} is not a valid query for {DatasetType}'
-        raise ValueError(msg)
-    run(args.dataset_type, args.reference_genome, args.query)
diff --git a/v03_pipeline/lib/misc/sample_ids.py b/v03_pipeline/lib/misc/sample_ids.py
index d2174fe50..ca5407b5e 100644
--- a/v03_pipeline/lib/misc/sample_ids.py
+++ b/v03_pipeline/lib/misc/sample_ids.py
@@ -2,6 +2,10 @@
 
 import hail as hl
 
+from v03_pipeline.lib.logger import get_logger
+
+logger = get_logger(__name__)
+
 
 class MatrixTableSampleSetError(Exception):
     def __init__(self, message, missing_samples):
@@ -42,7 +46,7 @@ def remap_sample_ids(
             f'All callset sample IDs:{mt.s.collect()}'
         )
         if ignore_missing_samples_when_remapping:
-            print(message)
+            logger.info(message)
         else:
             raise MatrixTableSampleSetError(message, missing_samples)
 
@@ -50,7 +54,7 @@ def remap_sample_ids(
     remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id)
     mt = mt.annotate_cols(seqr_id=remap_expr, vcf_id=mt.s)
     mt = mt.key_cols_by(s=mt.seqr_id)
-    print(f'Remapped {remap_count} sample ids...')
+    logger.info(f'Remapped {remap_count} sample ids...')
     return mt
 
 
@@ -77,9 +81,9 @@ def subset_samples(
         if (
             subset_count > anti_join_ht_count
         ) and ignore_missing_samples_when_subsetting:
-            print(message)
+            logger.info(message)
         else:
             raise MatrixTableSampleSetError(message, missing_samples)
-    print(f'Subsetted to {subset_count} sample ids')
+    logger.info(f'Subsetted to {subset_count} sample ids')
     mt = mt.semi_join_cols(sample_subset_ht)
     return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT)))