Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 100 additions & 1 deletion src/main/resources/db-scripts/clickhouse/clickhouse.sql
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ CREATE TABLE IF NOT EXISTS genomic_event_derived
patient_unique_id String,
off_panel Boolean DEFAULT FALSE
) ENGINE = MergeTree
ORDER BY (variant_type, entrez_gene_id, hugo_gene_symbol, genetic_profile_stable_id, sample_unique_id);
ORDER BY (cancer_study_identifier, variant_type, entrez_gene_id, hugo_gene_symbol, genetic_profile_stable_id, sample_unique_id);

INSERT INTO genomic_event_derived
-- Insert Mutations
Expand Down Expand Up @@ -485,6 +485,105 @@ FROM
JOIN cancer_study cs ON cs.cancer_study_id = subquery.cancer_study_id
JOIN sample_derived sd ON sd.internal_id = subquery.sample_id;


-- START: PRIMARY KEY ADDITIONS
-- THE FOLLOWING SCRIPTS EXIST TO ADD PRIMARY KEYS TO LEGACY TABLES THAT ARE MISSING THEM. YOU
-- CANNOT CHANGE THE PRIMARY KEY ON A TABLE IN CLICKHOUSE, SO WE NEED TO CREATE A NEW TABLE WITH THE
-- PRIMARY KEY AND THEN COPY THE DATA OVER.


--Adds primary key to the sample_cna_event table for Clickhouse-only
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sheridancbio What do you think about fixing the primary keys of the "slung" tables as part of derivation? We create a new table with the appropriate keys, copy the data into it, and then swap the tables.
For the genetic_alteration table of the public portal, this takes 5 minutes.

The alternative is that, as you suggested, we put the table definitions in the Sling process, which I don't love.

-- Rebuild sample_cna_event with an explicit primary key: create a keyed table,
-- copy every row into it, then swap the two tables by name.
--
-- ENGINE is plain MergeTree, matching the derived tables defined earlier in this
-- script. ClickHouse Cloud converts MergeTree to SharedMergeTree on its own, while
-- an explicit SharedMergeTree(...) clause is rejected by self-managed ClickHouse.
DROP TABLE IF EXISTS sample_cna_event_BACKUP;
CREATE TABLE sample_cna_event_BACKUP
(
    `cna_event_id` Int64 COMMENT 'References cna_event.cna_event_id.',
    `sample_id` Int64 COMMENT 'References sample.internal_id.',
    `genetic_profile_id` Int64 COMMENT 'References genetic_profile.genetic_profile_id.',
    `annotation_json` Nullable(String) COMMENT 'JSON-formatted annotation details.'
)
ENGINE = MergeTree
PRIMARY KEY (genetic_profile_id, cna_event_id, sample_id)
ORDER BY (genetic_profile_id, cna_event_id, sample_id)
SETTINGS index_granularity = 8192
COMMENT 'Observed CNA events per sample and profile. References cna_event, sample, and genetic_profile.';

-- Copy the data. Columns are named on both sides so the copy does not depend on
-- the physical column order of the legacy table (three of the four columns are
-- Int64, so a positional mismatch would go unnoticed).
-- NOTE(review): assumes the legacy table uses these same column names -- confirm.
INSERT INTO sample_cna_event_BACKUP
(
    cna_event_id,
    sample_id,
    genetic_profile_id,
    annotation_json
)
SELECT
    cna_event_id,
    sample_id,
    genetic_profile_id,
    annotation_json
FROM sample_cna_event;

-- Switch the tables. Afterwards sample_cna_event is the keyed table and
-- sample_cna_event_BACKUP holds the original rows until the next run drops it.
EXCHANGE TABLES sample_cna_event_BACKUP AND sample_cna_event;

-- Rebuild mutation with an explicit primary key: create a keyed table, copy
-- every row into it, then swap the two tables by name.
--
-- ENGINE is plain MergeTree, matching the derived tables defined earlier in this
-- script. ClickHouse Cloud converts MergeTree to SharedMergeTree on its own, while
-- an explicit SharedMergeTree(...) clause is rejected by self-managed ClickHouse.
DROP TABLE IF EXISTS mutation_BACKUP;
CREATE TABLE mutation_BACKUP
(
    `mutation_event_id` Int64 COMMENT 'References mutation_event.mutation_event_id.',
    `genetic_profile_id` Int64 COMMENT 'References genetic_profile.genetic_profile_id.',
    `sample_id` Int64 COMMENT 'References sample.internal_id.',
    `entrez_gene_id` Int64 COMMENT 'References gene.entrez_gene_id.',
    `center` Nullable(String) COMMENT 'Center where sequencing was performed.',
    `sequencer` Nullable(String) COMMENT 'Sequencing platform used.',
    `mutation_status` Nullable(String) COMMENT 'Mutation status: Germline, Somatic, or LOH.',
    `validation_status` Nullable(String) COMMENT 'Validation status.',
    `tumor_seq_allele1` Nullable(String) COMMENT 'Tumor allele 1 sequence.',
    `tumor_seq_allele2` Nullable(String) COMMENT 'Tumor allele 2 sequence.',
    `matched_norm_sample_barcode` Nullable(String) COMMENT 'Matched normal sample barcode.',
    `match_norm_seq_allele1` Nullable(String) COMMENT 'Matched normal allele 1 sequence.',
    `match_norm_seq_allele2` Nullable(String) COMMENT 'Matched normal allele 2 sequence.',
    `tumor_validation_allele1` Nullable(String) COMMENT 'Tumor validation allele 1 sequence.',
    `tumor_validation_allele2` Nullable(String) COMMENT 'Tumor validation allele 2 sequence.',
    `match_norm_validation_allele1` Nullable(String) COMMENT 'Matched normal validation allele 1.',
    `match_norm_validation_allele2` Nullable(String) COMMENT 'Matched normal validation allele 2.',
    `verification_status` Nullable(String) COMMENT 'Verification status.',
    `sequencing_phase` Nullable(String) COMMENT 'Sequencing phase.',
    `sequence_source` Nullable(String) COMMENT 'Source of sequencing data.',
    `validation_method` Nullable(String) COMMENT 'Validation method used.',
    `score` Nullable(String) COMMENT 'Score or quality metric.',
    `bam_file` Nullable(String) COMMENT 'Associated BAM file.',
    `tumor_alt_count` Nullable(Int64) COMMENT 'Tumor alternate allele count.',
    `tumor_ref_count` Nullable(Int64) COMMENT 'Tumor reference allele count.',
    `normal_alt_count` Nullable(Int64) COMMENT 'Normal alternate allele count.',
    `normal_ref_count` Nullable(Int64) COMMENT 'Normal reference allele count.',
    `amino_acid_change` Nullable(String) COMMENT 'Amino acid change from mutation.',
    `annotation_json` Nullable(String) COMMENT 'JSON-formatted annotations.'
)
ENGINE = MergeTree
PRIMARY KEY (genetic_profile_id, entrez_gene_id)
ORDER BY (genetic_profile_id, entrez_gene_id)
SETTINGS index_granularity = 8192
COMMENT 'Mutation observations in specific samples and profiles. References mutation_event, gene, genetic_profile, and sample.';

-- Copy data into the new table. Columns are named on both sides so the copy does
-- not depend on the physical column order of the legacy table.
-- NOTE(review): assumes the legacy table uses these same column names -- confirm.
INSERT INTO mutation_BACKUP
(
    mutation_event_id,
    genetic_profile_id,
    sample_id,
    entrez_gene_id,
    center,
    sequencer,
    mutation_status,
    validation_status,
    tumor_seq_allele1,
    tumor_seq_allele2,
    matched_norm_sample_barcode,
    match_norm_seq_allele1,
    match_norm_seq_allele2,
    tumor_validation_allele1,
    tumor_validation_allele2,
    match_norm_validation_allele1,
    match_norm_validation_allele2,
    verification_status,
    sequencing_phase,
    sequence_source,
    validation_method,
    score,
    bam_file,
    tumor_alt_count,
    tumor_ref_count,
    normal_alt_count,
    normal_ref_count,
    amino_acid_change,
    annotation_json
)
SELECT
    mutation_event_id,
    genetic_profile_id,
    sample_id,
    entrez_gene_id,
    center,
    sequencer,
    mutation_status,
    validation_status,
    tumor_seq_allele1,
    tumor_seq_allele2,
    matched_norm_sample_barcode,
    match_norm_seq_allele1,
    match_norm_seq_allele2,
    tumor_validation_allele1,
    tumor_validation_allele2,
    match_norm_validation_allele1,
    match_norm_validation_allele2,
    verification_status,
    sequencing_phase,
    sequence_source,
    validation_method,
    score,
    bam_file,
    tumor_alt_count,
    tumor_ref_count,
    normal_alt_count,
    normal_ref_count,
    amino_acid_change,
    annotation_json
FROM mutation;

-- Switch the tables. Afterwards mutation is the keyed table and mutation_BACKUP
-- holds the original rows until the next run drops it.
EXCHANGE TABLES mutation_BACKUP AND mutation;


-- Adds a primary key to the genetic_alteration table (ClickHouse only): create a
-- keyed table, copy every row into it, then swap the two tables by name.
--
-- ENGINE is plain MergeTree, matching the derived tables defined earlier in this
-- script. ClickHouse Cloud converts MergeTree to SharedMergeTree on its own, while
-- an explicit SharedMergeTree(...) clause is rejected by self-managed ClickHouse.
DROP TABLE IF EXISTS genetic_alteration_BACKUP;
CREATE TABLE genetic_alteration_BACKUP
(
    `genetic_profile_id` Int64,
    `genetic_entity_id` Int64,
    `values` String
)
ENGINE = MergeTree
PRIMARY KEY (genetic_profile_id, genetic_entity_id)
ORDER BY (genetic_profile_id, genetic_entity_id)
SETTINGS index_granularity = 8192;

-- Copy the data. Columns are named on both sides so the copy does not depend on
-- the physical column order of the legacy table (both id columns are Int64, so a
-- positional mismatch would go unnoticed). `values` stays back-quoted because it
-- collides with the VALUES keyword.
-- NOTE(review): assumes the legacy table uses these same column names -- confirm.
INSERT INTO genetic_alteration_BACKUP
(
    `genetic_profile_id`,
    `genetic_entity_id`,
    `values`
)
SELECT
    `genetic_profile_id`,
    `genetic_entity_id`,
    `values`
FROM genetic_alteration;

-- Switch the tables. Afterwards genetic_alteration is the keyed table and
-- genetic_alteration_BACKUP holds the original rows until the next run drops it.
EXCHANGE TABLES genetic_alteration_BACKUP AND genetic_alteration;

--END: PRIMARY KEY ADDITIONS


-- Request an unscheduled merge of data parts for each derived table rather than
-- waiting for ClickHouse's background merges.
OPTIMIZE TABLE sample_to_gene_panel_derived;
OPTIMIZE TABLE gene_panel_to_gene_derived;
OPTIMIZE TABLE sample_derived;
Expand Down
Loading