Skip to content

main <- dev #860

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ inline-quotes = "single"
'SLF001', # allow private access
'PLR0913', # allow high arity functions
]
'*migration*' = [
'N999', # allow invalid module names
]

[tool.ruff.pylint]
max-args = 6
Expand Down
12 changes: 6 additions & 6 deletions v03_pipeline/bin/vep-110-GRCh38.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export PROJECT="$(gcloud config get-value project)"
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)"
export ASSEMBLY=GRCh38
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:GRCh38

mkdir -p /vep_data

Expand All @@ -36,26 +36,26 @@ sleep 60
sudo service docker restart

# Copied from the repo at v03_pipeline/var/vep_config
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH

# Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ &
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ &

# Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false)
# tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ &
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' /vep_data/ &

gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ &

# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &

# Generated with:
# curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
# gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
# bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa
# samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
docker pull ${VEP_DOCKER_IMAGE} &
wait

Expand Down
39 changes: 39 additions & 0 deletions v03_pipeline/lib/annotations/sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,41 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression:
return ht.alleles[1].replace('[<>]', '').split(':', 2)


def alleles(ht: hl.Table, **_: Any) -> hl.ArrayExpression:
    """Build the two-element ``[ref, alt]`` alleles array for export.

    The first element is always the literal ``'N'``. The second is a
    symbolic allele: ``<TYPE:DETAIL>`` when ``sv_type_detail_id`` is defined
    and the SV type is not ``CPX``, otherwise just ``<TYPE>``.
    """
    sv_type = hl.array(SV_TYPES)[ht.sv_type_id]
    sv_type_detail = hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id]
    # The detail suffix is only emitted for non-CPX types that carry a detail id.
    include_detail = hl.is_defined(ht.sv_type_detail_id) & (sv_type != 'CPX')
    alt_allele = hl.if_else(
        include_detail,
        hl.format('<%s:%s>', sv_type, sv_type_detail),
        hl.format('<%s>', sv_type),
    )
    return hl.array(['N', alt_allele])


def info(ht: hl.Table, **_: Any) -> hl.StructExpression:
    """Assemble the INFO struct for export.

    ``END`` is taken from the *start* locus position, while ``CHR2`` and
    ``END2`` come from the end locus. ``SVTYPE`` is the ``SV_TYPES`` entry
    selected by ``sv_type_id``.
    """
    start = ht.start_locus
    end = ht.end_locus
    fields = {
        'ALGORITHMS': ht.algorithms,
        'END': start.position,
        'CHR2': end.contig,
        'END2': end.position,
        'SVTYPE': hl.array(SV_TYPES)[ht.sv_type_id],
        'SVLEN': ht.sv_len,
    }
    return hl.Struct(**fields)


def locus(ht: hl.Table, **_: Any) -> hl.LocusExpression:
    """Return the table's ``start_locus`` field as the exported locus."""
    start_locus = ht.start_locus
    return start_locus


def algorithms(ht: hl.Table, **_: Any) -> hl.Expression:
    """Join the ``info.ALGORITHMS`` values into one comma-separated string."""
    separator = hl.str(',')
    return separator.join(ht['info.ALGORITHMS'])

Expand Down Expand Up @@ -205,6 +240,10 @@ def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE']))


def sv_len(ht: hl.Table, **_: Any) -> hl.Expression:
    """Return the ``info.SVLEN`` field unchanged."""
    length = ht['info.SVLEN']
    return length


def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression:
    """Look up, in ``SV_TYPES_LOOKUP``, the first element returned by ``_sv_types``."""
    primary_type = _sv_types(ht)[0]
    return SV_TYPES_LOOKUP[primary_type]

Expand Down
171 changes: 171 additions & 0 deletions v03_pipeline/lib/annotations/sv_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import unittest

import hail as hl

from v03_pipeline.lib.annotations.fields import get_fields
from v03_pipeline.lib.model import DatasetType


class SVTest(unittest.TestCase):
    """Tests for the SV annotation functions used for VCF export."""

    def test_sv_export_annotations(self) -> None:
        """Select the SV export-VCF annotations on four rows and compare the output."""

        def grch38(contig: str, position: int) -> hl.Locus:
            # Every locus in this test lives on GRCh38.
            return hl.Locus(
                contig=contig,
                position=position,
                reference_genome='GRCh38',
            )

        # (id, start_locus, end_locus, sv_len, sv_type_id, sv_type_detail_id)
        source_rows = [
            (0, grch38('chr1', 180928), grch38('chr5', 20404), 123, 2, None),
            (1, grch38('chr1', 789481), grch38('chr1', 789481), 245, 2, None),
            (2, grch38('chr1', 6558902), grch38('chr1', 6559723), 245, 3, 2),
            (3, grch38('chr1', 6558902), grch38('chr1', 6559723), 245, 7, 6),
        ]
        row_type = hl.tstruct(
            id=hl.tint32,
            algorithms=hl.tstr,
            end_locus=hl.tlocus('GRCh38'),
            start_locus=hl.tlocus('GRCh38'),
            sv_len=hl.tint32,
            sv_type_id=hl.tint32,
            sv_type_detail_id=hl.tint32,
        )
        ht = hl.Table.parallelize(
            [
                hl.Struct(
                    id=row_id,
                    algorithms='manta',
                    end_locus=end_locus,
                    start_locus=start_locus,
                    sv_len=length,
                    sv_type_id=type_id,
                    sv_type_detail_id=detail_id,
                )
                for (
                    row_id,
                    start_locus,
                    end_locus,
                    length,
                    type_id,
                    detail_id,
                ) in source_rows
            ],
            row_type,
            key='id',
        )
        annotation_fields = get_fields(
            ht,
            DatasetType.SV.export_vcf_annotation_fns,
        )
        ht = ht.select(**annotation_fields)

        # (id, locus, alt allele, END, CHR2, END2, SVTYPE, SVLEN)
        expected_rows = [
            (0, grch38('chr1', 180928), '<BND>', 180928, 'chr5', 20404, 'BND', 123),
            (1, grch38('chr1', 789481), '<BND>', 789481, 'chr1', 789481, 'BND', 245),
            (
                2,
                grch38('chr1', 6558902),
                '<CPX>',
                6558902,
                'chr1',
                6559723,
                'CPX',
                245,
            ),
            (
                3,
                grch38('chr1', 6558902),
                '<INS:ME:SVA>',
                6558902,
                'chr1',
                6559723,
                'INS',
                245,
            ),
        ]
        expected = [
            hl.Struct(
                id=row_id,
                locus=row_locus,
                alleles=['N', alt],
                info=hl.Struct(
                    ALGORITHMS='manta',
                    END=end,
                    CHR2=chr2,
                    END2=end2,
                    SVTYPE=sv_type,
                    SVLEN=length,
                ),
            )
            for (
                row_id,
                row_locus,
                alt,
                end,
                chr2,
                end2,
                sv_type,
                length,
            ) in expected_rows
        ]
        self.assertEqual(ht.collect(), expected)
Empty file.
16 changes: 16 additions & 0 deletions v03_pipeline/lib/migration/base_migration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from abc import ABC, abstractmethod

import hail as hl

from v03_pipeline.lib.model import DatasetType, ReferenceGenome


class BaseMigration(ABC):
    """Abstract base class that every migration subclasses.

    Concrete migrations must provide a static ``migrate`` implementation.
    """

    # Set of (reference genome, dataset type) pairs; ``None`` on the base class.
    # NOTE(review): presumably the combinations a migration applies to -- confirm
    # against the code that consumes this attribute.
    reference_genome_dataset_types: frozenset[tuple[ReferenceGenome, DatasetType]] = (
        None
    )

    @staticmethod
    @abstractmethod
    def migrate(ht: hl.Table) -> hl.Table:
        """Take a table and return a table; subclasses must override this."""
33 changes: 33 additions & 0 deletions v03_pipeline/lib/migration/misc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import inspect
import pkgutil
import re

from v03_pipeline.lib.migration.base_migration import BaseMigration

MIGRATION_NAME_PATTERN = r'(\d\d\d\d_.*)'


def list_migrations(
    path: str,
) -> list[tuple[str, type[BaseMigration]]]:
    """Discover the migration classes defined by modules directly under ``path``.

    A module is considered when its name matches ``MIGRATION_NAME_PATTERN``
    from the start of the name (four digits, an underscore, then anything).
    Each matching module is executed, and the first class found in its
    namespace that subclasses ``BaseMigration`` (excluding ``BaseMigration``
    itself) is collected. Modules without such a class are ignored.

    Args:
        path: Directory to scan for migration modules.

    Returns:
        ``(migration_name, migration_class)`` pairs sorted by migration name.
    """
    # Function-scope imports so this block does not depend on edits to the
    # module's import section.
    import importlib.util
    import sys

    migrations = []
    for finder, name, _ in pkgutil.iter_modules([path]):
        # Anchor at the start of the name: an unanchored search would accept
        # e.g. 'x0000_foo' or '12345_foo' and capture a truncated name.
        match = re.match(MIGRATION_NAME_PATTERN, name)
        if not match:
            continue
        # Finder.find_module() was removed in Python 3.12 and
        # Loader.load_module() is deprecated; use the spec-based API instead.
        spec = finder.find_spec(name)
        if spec is None or spec.loader is None:
            continue
        module = importlib.util.module_from_spec(spec)
        # Register before executing (as load_module() did) so code that looks
        # the module up by name while it executes can find it.
        sys.modules[name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            # Do not leave a half-initialised module behind.
            sys.modules.pop(name, None)
            raise
        implemented_migration = next(
            (
                m
                for m in vars(module).values()
                # Keep only classes that subclass BaseMigration,
                # excluding BaseMigration itself.
                if inspect.isclass(m)
                and issubclass(m, BaseMigration)
                and m is not BaseMigration
            ),
            None,
        )
        if implemented_migration:
            migrations.append((match.group(1), implemented_migration))
    return sorted(migrations, key=lambda x: x[0])
47 changes: 47 additions & 0 deletions v03_pipeline/lib/migration/misc_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
import shutil
import tempfile
import unittest

from v03_pipeline.lib.migration.base_migration import BaseMigration
from v03_pipeline.lib.migration.misc import list_migrations


class TestListMigrations(unittest.TestCase):
    """Runs ``list_migrations`` against a temporary directory of candidate files."""

    def setUp(self):
        """Write the same module source under valid and invalid migration file names."""
        self.tmpdir = tempfile.TemporaryDirectory()
        file_names = (
            '__init__.py',
            '1111_a_migration.py',
            '0000_migration.py',
            '001_test.py',
            'abcd_test.py',
            '0000_migration.txt',
        )
        for file_name in file_names:
            target = os.path.join(self.tmpdir.name, file_name)
            with open(target, 'w') as f:
                f.write(
                    """
from v03_pipeline.lib.migration.base_migration import BaseMigration
class ImplementedMigration(BaseMigration):
    pass
""",
                )

    def tearDown(self):
        """Remove the temporary directory if it still exists."""
        if not os.path.isdir(self.tmpdir.name):
            return
        shutil.rmtree(self.tmpdir.name)

    def test_list_migrations(self):
        """Only the ``NNNN_*.py`` modules are listed, sorted by name."""
        found = [
            (name, issubclass(migration, BaseMigration))
            for (name, migration) in list_migrations(self.tmpdir.name)
        ]
        expected = [
            ('0000_migration', True),
            ('1111_a_migration', True),
        ]
        self.assertEqual(found, expected)
        # Each discovered class exposes the ``migrate`` attribute.
        every_has_migrate = all(
            hasattr(entry[1], 'migrate') for entry in list_migrations(self.tmpdir.name)
        )
        self.assertTrue(every_has_migrate)
Loading
Loading