Skip to content

Commit 23aae3c

Browse files
committed
json
1 parent 7d14822 commit 23aae3c

File tree

125 files changed

+92
-55
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

125 files changed

+92
-55
lines changed

v03_pipeline/lib/methods/sample_qc.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
import csv
2-
import json
3-
from collections import defaultdict
4-
51
import hail as hl
62
from gnomad.sample_qc.pipeline import filter_rows_for_qc
73

@@ -72,15 +68,3 @@ def annotate_filter_flags(
7268
'mean_coverage',
7369
'filtered_callrate',
7470
)
75-
76-
77-
def sample_qc_tsv_to_dict(tsv_file_path: str) -> dict:
78-
parse_field_types = {'sample_type': str, 'filter_flags': json.loads}
79-
sample_qc_dict = defaultdict(dict)
80-
with open(tsv_file_path) as f:
81-
reader = csv.DictReader(f, delimiter='\t')
82-
for row in reader:
83-
sample_id = row.pop('s')
84-
for field, value in row.items():
85-
sample_qc_dict[sample_id][field] = parse_field_types[field](value)
86-
return sample_qc_dict

v03_pipeline/lib/paths.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def relatedness_check_tsv_path(
219219
)
220220

221221

222-
def sample_qc_tsv_path(
222+
def sample_qc_json_path(
223223
reference_genome: ReferenceGenome,
224224
dataset_type: DatasetType,
225225
callset_path: str,
@@ -231,7 +231,7 @@ def sample_qc_tsv_path(
231231
dataset_type,
232232
),
233233
'sample_qc',
234-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv',
234+
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.json',
235235
)
236236

237237

v03_pipeline/lib/tasks/write_metadata_for_run.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,11 @@
44
import luigi
55
import luigi.util
66

7-
from v03_pipeline.lib.methods.sample_qc import sample_qc_tsv_to_dict
87
from v03_pipeline.lib.model import FeatureFlag
98
from v03_pipeline.lib.paths import (
109
metadata_for_run_path,
1110
relatedness_check_tsv_path,
12-
sample_qc_tsv_path,
11+
sample_qc_json_path,
1312
)
1413
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
1514
BaseLoadingRunParams,
@@ -78,12 +77,13 @@ def run(self) -> None:
7877
self.reference_genome,
7978
)
8079
):
81-
metadata_json['sample_qc'] = sample_qc_tsv_to_dict(
82-
sample_qc_tsv_path(
80+
with open(
81+
sample_qc_json_path(
8382
self.reference_genome,
8483
self.dataset_type,
8584
self.callset_path,
8685
),
87-
)
86+
) as f:
87+
metadata_json['sample_qc'] = json.load(f)
8888
with self.output().open('w') as f:
8989
json.dump(metadata_json, f)

v03_pipeline/lib/tasks/write_metadata_for_run_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@
1414
TEST_REMAP_2 = 'v03_pipeline/var/test/remaps/test_remap_2.tsv'
1515
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
1616
TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv'
17-
TEST_SAMPLE_QC_TSV = 'v03_pipeline/var/test/sample_qc_1.tsv'
17+
TEST_SAMPLE_QC_JSON = 'v03_pipeline/var/test/sample_qc_1.json'
1818

1919

2020
class WriteMetadataForRunTaskTest(MockedDatarootTestCase):
2121
@mock.patch(
22-
'v03_pipeline.lib.tasks.write_metadata_for_run.sample_qc_tsv_path',
23-
lambda *_: TEST_SAMPLE_QC_TSV,
22+
'v03_pipeline.lib.tasks.write_metadata_for_run.sample_qc_json_path',
23+
lambda *_: TEST_SAMPLE_QC_JSON,
2424
)
2525
@mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')
2626
@mock.patch(
@@ -98,7 +98,7 @@ def test_write_metadata_for_run_task(
9898
'sample_qc': {
9999
'HG00731': {
100100
'sample_type': 'WGS',
101-
'filter_flags': ['contamination', 'coverage'],
101+
'filter_flags': ['coverage', 'contamination'],
102102
},
103103
'HG00732': {
104104
'sample_type': 'WGS',

v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from v03_pipeline.lib.tasks.write_relatedness_check_tsv import (
2929
WriteRelatednessCheckTsvTask,
3030
)
31-
from v03_pipeline.lib.tasks.write_sample_qc_tsv import WriteSampleQCTsvTask
31+
from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask
3232
from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
3333
from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
3434
with_persisted_validation_errors,
@@ -92,7 +92,7 @@ def requires(self) -> list[luigi.Task]:
9292
):
9393
requirements = [
9494
*requirements,
95-
self.clone(WriteSampleQCTsvTask),
95+
self.clone(WriteSampleQCJsonTask),
9696
]
9797
return requirements
9898

v03_pipeline/lib/tasks/write_sample_qc_tsv.py renamed to v03_pipeline/lib/tasks/write_sample_qc_json.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
1+
import json
2+
from collections import defaultdict
3+
14
import hail as hl
25
import hailtop.fs as hfs
36
import luigi
47
import luigi.util
58

69
from v03_pipeline.lib.methods.sample_qc import call_sample_qc
710
from v03_pipeline.lib.misc.io import import_tdr_qc_metrics
8-
from v03_pipeline.lib.paths import sample_qc_tsv_path, tdr_metrics_dir
11+
from v03_pipeline.lib.paths import sample_qc_json_path, tdr_metrics_dir
912
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
1013
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
1114
from v03_pipeline.lib.tasks.validate_callset import ValidateCallsetTask
1215
from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
1316

1417

1518
@luigi.util.inherits(BaseLoadingRunParams)
16-
class WriteSampleQCTsvTask(luigi.Task):
19+
class WriteSampleQCJsonTask(luigi.Task):
1720
def output(self) -> luigi.Target:
1821
return GCSorLocalTarget(
19-
sample_qc_tsv_path(
22+
sample_qc_json_path(
2023
self.reference_genome,
2124
self.dataset_type,
2225
self.callset_path,
@@ -46,4 +49,14 @@ def run(self):
4649
self.sample_type,
4750
)
4851
ht = callset_mt.cols()
49-
ht.flatten().export(self.output().path)
52+
sample_qc_dict = defaultdict(dict)
53+
for row in ht.flatten().collect():
54+
r = dict(row)
55+
sample_id = r.pop('s')
56+
for field, value in r.items():
57+
sample_qc_dict[sample_id][field] = (
58+
list(value) if isinstance(value, set) else value
59+
)
60+
61+
with self.output().open('w') as f:
62+
json.dump(sample_qc_dict, f)

v03_pipeline/lib/tasks/write_sample_qc_tsv_test.py renamed to v03_pipeline/lib/tasks/write_sample_qc_json_test.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from decimal import Decimal
23
from unittest.mock import Mock, patch
34

@@ -6,14 +7,14 @@
67
import luigi.worker
78

89
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
9-
from v03_pipeline.lib.tasks.write_sample_qc_tsv import WriteSampleQCTsvTask
10+
from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask
1011
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1112

1213
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
1314
TEST_RUN_ID = 'manual__2024-04-03'
1415

1516

16-
class WriteSampleQCTsvTaskTest(MockedDatarootTestCase):
17+
class WriteSampleQCJsonTaskTest(MockedDatarootTestCase):
1718
@patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names')
1819
@patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query')
1920
def test_call_sample_qc(
@@ -87,7 +88,7 @@ def test_call_sample_qc(
8788
),
8889
]
8990
worker = luigi.worker.Worker()
90-
task = WriteSampleQCTsvTask(
91+
task = WriteSampleQCJsonTask(
9192
reference_genome=ReferenceGenome.GRCh38,
9293
dataset_type=DatasetType.SNV_INDEL,
9394
run_id=TEST_RUN_ID,
@@ -102,17 +103,30 @@ def test_call_sample_qc(
102103
self.assertTrue(hfs.exists(task.output().path))
103104

104105
with task.output().open('r') as f:
105-
lines = f.readlines()
106-
expected_first_five_lines = [
107-
's\tsample_type\tfilter_flags\n',
108-
'HG00731\tWGS\t["contamination","coverage"]\n',
109-
'HG00732\tWGS\t["coverage"]\n',
110-
'HG00733\tWGS\t["contamination"]\n',
111-
'NA19675\tWGS\t[]\n',
112-
]
113-
for expected_line, actual_line in zip(
114-
expected_first_five_lines,
115-
lines[:5],
116-
strict=False,
117-
):
118-
self.assertEqual(expected_line, actual_line)
106+
self.assertDictEqual(
107+
json.load(f),
108+
{
109+
'HG00731': {
110+
'sample_type': 'WGS',
111+
'filter_flags': ['contamination', 'coverage'],
112+
},
113+
'HG00732': {'sample_type': 'WGS', 'filter_flags': ['coverage']},
114+
'HG00733': {
115+
'sample_type': 'WGS',
116+
'filter_flags': ['contamination'],
117+
},
118+
'NA19675': {'sample_type': 'WGS', 'filter_flags': []},
119+
'NA19678': {'sample_type': 'WGS', 'filter_flags': []},
120+
'NA19679': {'sample_type': 'WGS', 'filter_flags': []},
121+
'NA20870': {'sample_type': 'WGS', 'filter_flags': []},
122+
'NA20872': {'sample_type': 'WGS', 'filter_flags': []},
123+
'NA20874': {'sample_type': 'WGS', 'filter_flags': []},
124+
'NA20875': {'sample_type': 'WGS', 'filter_flags': []},
125+
'NA20876': {'sample_type': 'WGS', 'filter_flags': []},
126+
'NA20877': {'sample_type': 'WGS', 'filter_flags': []},
127+
'NA20878': {'sample_type': 'WGS', 'filter_flags': []},
128+
'NA20881': {'sample_type': 'WGS', 'filter_flags': []},
129+
'NA20885': {'sample_type': 'WGS', 'filter_flags': []},
130+
'NA20888': {'sample_type': 'WGS', 'filter_flags': []},
131+
},
132+
)
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)