Skip to content

Commit 928924e

Browse files
authored
Merge pull request #695 from broadinstitute/benb/fix_logging
Update Logging.
2 parents 54ed7bb + c7f57f3 commit 928924e

File tree

8 files changed

+95
-50
lines changed

8 files changed

+95
-50
lines changed

v03_pipeline/lib/logger.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
import logging.config

# dictConfig schema (PEP 391): the only valid top-level keys are
# 'version', 'formatters', 'filters', 'handlers', 'loggers', 'root',
# 'incremental', and 'disable_existing_loggers'.  The original config
# carried a top-level 'propagate': True, which is not part of the schema
# and is silently ignored — propagation is a per-logger setting and is
# configured on the root logger entry below.
LOG_CONFIG = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'default': {
            'format': '%(asctime)s - %(module)s - %(levelname)s - %(message)s',
            'datefmt': '%Y-%m-%d %H:%M:%S',
        },
    },
    'handlers': {
        'default': {
            'formatter': 'default',
            'class': 'logging.StreamHandler',
        },
    },
    'loggers': {
        # '' is the root logger; all named loggers propagate up to it,
        # so every module logger returned by get_logger shares this
        # handler and level.
        '': {
            'level': 'INFO',
            'handlers': ['default'],
            'propagate': True,
        },
    },
}

# Process-wide guard so dictConfig is applied exactly once, no matter
# how many modules call get_logger at import time.
_CONFIGURED = False


def get_logger(name: str) -> logging.Logger:
    """Return the logger for *name*, configuring logging on first use.

    The first call applies LOG_CONFIG via logging.config.dictConfig;
    subsequent calls are cheap and just return the named logger.
    """
    global _CONFIGURED  # noqa: PLW0603
    if not _CONFIGURED:
        logging.config.dictConfig(LOG_CONFIG)
        _CONFIGURED = True
    return logging.getLogger(name)

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import hail as hl
77

88
from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES_LOOKUP
9+
from v03_pipeline.lib.logger import get_logger
910
from v03_pipeline.lib.model.definitions import ReferenceGenome
1011

1112
CLINVAR_ASSERTIONS = [
@@ -33,6 +34,7 @@
3334
},
3435
)
3536

37+
logger = get_logger(__name__)
3638

3739
def safely_move_to_gcs(tmp_file_name, gcs_tmp_file_name):
3840
try:
@@ -45,8 +47,8 @@ def safely_move_to_gcs(tmp_file_name, gcs_tmp_file_name):
4547
],
4648
check=True,
4749
)
48-
except subprocess.CalledProcessError as e:
49-
print(e)
50+
except subprocess.CalledProcessError:
51+
logger.exception(f'Failed to move local tmp file {tmp_file_name} to gcs')
5052

5153

5254
def parsed_clnsig(ht: hl.Table):

v03_pipeline/lib/reference_data/compare_globals.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import dataclasses
2-
import logging
32

43
import hail as hl
54

5+
from v03_pipeline.lib.logger import get_logger
66
from v03_pipeline.lib.model import (
77
DatasetType,
88
ReferenceDatasetCollection,
@@ -16,7 +16,7 @@
1616
parse_dataset_version,
1717
)
1818

19-
logger = logging.getLogger(__name__)
19+
logger = get_logger(__name__)
2020

2121

2222
@dataclasses.dataclass
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
import hail as hl
import luigi

from v03_pipeline.lib.logger import get_logger
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, SampleType
from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget

logger = get_logger(__name__)


class BaseHailTableTask(luigi.Task):
    """Shared base for luigi tasks that produce a Hail Table.

    Holds the (reference_genome, dataset_type, sample_type) parameters
    and the hail initialization common to the write/update task
    hierarchies; subclasses must implement output().
    """

    reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
    dataset_type = luigi.EnumParameter(enum=DatasetType)
    sample_type = luigi.EnumParameter(enum=SampleType)

    def output(self) -> luigi.Target:
        # Subclasses declare where their table is written.
        raise NotImplementedError

    def complete(self) -> bool:
        # Complete when the output folder exists (GCS or local).
        return GCSorLocalFolderTarget(self.output().path).exists()

    def init_hail(self):
        # Need to use the GCP bucket as temp storage for very large callset joins
        hl.init(tmp_dir=Env.HAIL_TMPDIR, idempotent=True)

        # Interval ref data join causes shuffle death, this prevents it
        hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1')  # noqa: SLF001


# NB: these are defined over luigi.Task instead of the BaseHailTableTask so that
# they work on file dependencies.
@luigi.Task.event_handler(luigi.Event.DEPENDENCY_MISSING)
def dependency_missing(task):
    logger.info(f'{task} dependency_missing')


@luigi.Task.event_handler(luigi.Event.DEPENDENCY_PRESENT)
def dependency_present(task):
    logger.info(f'{task} dependency_present')


@luigi.Task.event_handler(luigi.Event.START)
def start(task):
    logger.info(f'{task} start')


@luigi.Task.event_handler(luigi.Event.FAILURE)
def failure(task, exception):
    # logger.exception relies on sys.exc_info(), which is not guaranteed
    # to be populated by the time luigi dispatches this event; pass the
    # exception luigi hands us explicitly so the traceback is always
    # attached to the log record.
    logger.error(f'{task} failure', exc_info=exception)


@luigi.Task.event_handler(luigi.Event.SUCCESS)
def success(task):
    logger.info(f'{task} success')

v03_pipeline/lib/tasks/base/base_update_task.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,10 @@
11
import hail as hl
2-
import luigi
32

43
from v03_pipeline.lib.misc.io import write
5-
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, SampleType
6-
from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget
4+
from v03_pipeline.lib.tasks.base.base_hail_table_task import BaseHailTableTask
75

86

9-
class BaseUpdateTask(luigi.Task):
10-
reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
11-
dataset_type = luigi.EnumParameter(enum=DatasetType)
12-
sample_type = luigi.EnumParameter(enum=SampleType)
13-
14-
def output(self) -> luigi.Target:
15-
raise NotImplementedError
16-
17-
def complete(self) -> bool:
18-
return GCSorLocalFolderTarget(self.output().path).exists()
19-
20-
def init_hail(self):
21-
# Need to use the GCP bucket as temp storage for very large callset joins
22-
hl.init(tmp_dir=Env.HAIL_TMPDIR, idempotent=True)
23-
24-
# Interval ref data join causes shuffle death, this prevents it
25-
hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') # noqa: SLF001
26-
7+
class BaseUpdateTask(BaseHailTableTask):
278
def run(self) -> None:
289
self.init_hail()
2910
if not self.output().exists():

v03_pipeline/lib/tasks/base/base_write_task.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,10 @@
11
import hail as hl
2-
import luigi
32

43
from v03_pipeline.lib.misc.io import write
5-
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, SampleType
6-
from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget
4+
from v03_pipeline.lib.tasks.base.base_hail_table_task import BaseHailTableTask
75

86

9-
class BaseWriteTask(luigi.Task):
10-
reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
11-
dataset_type = luigi.EnumParameter(enum=DatasetType)
12-
sample_type = luigi.EnumParameter(enum=SampleType)
13-
14-
def output(self) -> luigi.Target:
15-
raise NotImplementedError
16-
17-
def complete(self) -> bool:
18-
return GCSorLocalFolderTarget(self.output().path).exists()
19-
20-
def init_hail(self):
21-
# Need to use the GCP bucket as temp storage for very large callset joins
22-
hl.init(tmp_dir=Env.HAIL_TMPDIR, idempotent=True)
23-
24-
# Interval ref data join causes shuffle death, this prevents it
25-
hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') # noqa: SLF001
26-
7+
class BaseWriteTask(BaseHailTableTask):
278
def run(self) -> None:
289
self.init_hail()
2910
ht = self.create_table()

v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
import logging
2-
31
import hail as hl
42
import luigi
53

4+
from v03_pipeline.lib.logger import get_logger
65
from v03_pipeline.lib.model import ReferenceDatasetCollection
76
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
87
from v03_pipeline.lib.reference_data.compare_globals import (
@@ -15,7 +14,7 @@
1514
from v03_pipeline.lib.tasks.base.base_update_task import BaseUpdateTask
1615
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
1716

18-
logger = logging.getLogger(__name__)
17+
logger = get_logger(__name__)
1918

2019

2120
class UpdatedReferenceDatasetCollectionTask(BaseUpdateTask):

v03_pipeline/lib/tasks/write_project_family_tables_test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def test_snv_write_project_family_tables_task(self) -> None:
3232
hl.read_table(write_family_table_task.output().path)
3333
for write_family_table_task in write_project_family_tables.dynamic_write_family_table_tasks
3434
]
35-
print('bensadf', len(hts))
3635
self.assertCountEqual(
3736
[ht.globals.sample_ids.collect() for ht in hts],
3837
[

0 commit comments

Comments
 (0)