-
Notifications
You must be signed in to change notification settings - Fork 128
Open
Description
We have observed that 88 nodes in the July 2023 version of the KG (before restricting to the largest connected component, LCC) have non-harmonized node names. For example, the same gene MT-ND5 is represented as either "ND5" or "MT-ND5", and the gallbladder anatomy node is represented as either "gallbladder" or "gall bladder". Below is a quick R script to merge these nodes:
# Harmonize duplicated node names in PrimeKG and rebuild the node indices.
#
# Prerequisites (not set up in this snippet): data.table (fread, `:=`,
# setnames, setcolorder, uniqueN), magrittr (%>%) and here (here()) must be
# attached, and `primeKG_dir` must be defined by the caller.

# load edges of updated PrimeKG
primeKG_edges = fread(here(primeKG_dir, "kg", "auxiliary", "kg_raw.csv"))
# remember the raw edge count so the index merges below can be sanity-checked
n_edges_raw = nrow(primeKG_edges)

# replace "off-label use" with off_label_use
primeKG_edges[relation == "off-label use", relation := "off_label_use"]

# construct node matrix from the x side of the edge list
primeKG_nodes = primeKG_edges %>%
  .[, .(x_id, x_type, x_name, x_source)] %>%
  unique()
# rename only the leading "x_" prefix (an unanchored "x" pattern would also
# rewrite any other "x" inside a column name)
setnames(primeKG_nodes, sub("^x_", "node_", colnames(primeKG_nodes)))

# find duplicate nodes: the same (ID, type) pair occurring in more than one row
primeKG_nodes[, joint_id := paste(node_id, node_type, sep = "_")]
dup_list = primeKG_nodes[duplicated(joint_id), joint_id]
dup_nodes = primeKG_nodes %>%
  .[joint_id %in% dup_list] %>%
  .[order(joint_id)]

# separate out duplicate genes and anatomy, manually investigate
dup_anatomy = dup_nodes[node_type == "anatomy"]
# the replacement name below is hard-coded for a single anatomy node; warn if
# it is about to be applied to several distinct anatomy IDs
if (uniqueN(dup_anatomy$node_id) > 1) {
  warning(
    "More than one duplicated anatomy ID found; all of them will be renamed ",
    "to \"gall bladder\". Review dup_anatomy before trusting the result.",
    call. = FALSE
  )
}
dup_anatomy[, final_name := "gall bladder"]
dup_gene = dup_nodes[node_type == "gene/protein"]

# read HGNC official IDs
hgnc_set = fread(here("Data", "ID_mappings", "hgnc_complete_set.txt"), sep = "\t") %>%
  .[, entrez_id := as.character(entrez_id)]
dup_gene = merge(
  dup_gene, hgnc_set[, .(symbol, entrez_id)],
  by.x = "node_id", by.y = "entrez_id", all.x = TRUE, all.y = FALSE
) %>%
  setnames("symbol", "final_name")

# genes without an HGNC match have final_name = NA; renaming with those rows
# would overwrite the existing names with NA, so leave such nodes untouched
unmapped_ids = dup_gene[is.na(final_name), unique(node_id)]
if (length(unmapped_ids) > 0) {
  warning(
    "No HGNC symbol found for duplicated gene ID(s): ",
    paste(unmapped_ids, collapse = ", "),
    "; their names are left unchanged.",
    call. = FALSE
  )
  dup_gene = dup_gene[!is.na(final_name)]
}

# combine back
dup_nodes = rbind(dup_anatomy, dup_gene, use.names = TRUE) %>%
  .[, node_name := final_name] %>%
  .[, final_name := NULL] %>%
  unique()

# replace names as necessary
# seq_len() makes this a no-op when there is nothing to rename
# (1:nrow() would iterate over c(1, 0) for an empty table)
for (i in seq_len(nrow(dup_nodes))) {
  id_i = dup_nodes[i, node_id]
  type_i = dup_nodes[i, node_type]
  name_i = dup_nodes[i, node_name]
  primeKG_nodes[node_id == id_i & node_type == type_i, node_name := name_i]
  primeKG_edges[x_id == id_i & x_type == type_i, x_name := name_i]
  primeKG_edges[y_id == id_i & y_type == type_i, y_name := name_i]
}

# drop duplicates from nodes
non_dup_rows = nrow(primeKG_nodes)
primeKG_nodes = unique(primeKG_nodes)
message("Removed ", non_dup_rows - nrow(primeKG_nodes), " duplicates")

# make indices (0-based)
primeKG_nodes[, node_index := seq_len(.N) - 1L]
setcolorder(primeKG_nodes, "node_index")

# add indices to edges
# the temporary join key includes the node type so that nodes of different
# types sharing ID, name and source cannot collide and multiply edges
primeKG_nodes[, node_string := paste(node_id, node_type, node_name, node_source, sep = "_")]
primeKG_nodes[, x_index := node_index]
primeKG_nodes[, y_index := node_index]
primeKG_edges[, x_string := paste(x_id, x_type, x_name, x_source, sep = "_")]
primeKG_edges[, y_string := paste(y_id, y_type, y_name, y_source, sep = "_")]

# merge back to edges
primeKG_edges = merge(primeKG_edges, primeKG_nodes[, .(node_string, x_index)], by.x = "x_string", by.y = "node_string", sort = FALSE)
primeKG_edges = merge(primeKG_edges, primeKG_nodes[, .(node_string, y_index)], by.x = "y_string", by.y = "node_string", sort = FALSE)

# these are inner joins: a changed row count means edges were dropped
# (endpoint missing from the node table) or multiplied (ambiguous key)
if (nrow(primeKG_edges) != n_edges_raw) {
  warning(
    "Edge count changed from ", n_edges_raw, " to ", nrow(primeKG_edges),
    " while attaching node indices.",
    call. = FALSE
  )
}

# drop merge columns (joint_id is intentionally left on the node table)
primeKG_nodes[, c("node_string", "x_index", "y_index") := NULL]
primeKG_edges[, c("x_string", "y_string") := NULL]
setcolorder(primeKG_edges, c("relation", "display_relation", "x_index", "x_id", "x_type", "x_name", "x_source", "y_index", "y_id", "y_type", "y_name", "y_source"))

# print node counts
message("Updated PrimeKG Nodes:\t", nrow(primeKG_nodes))
# the edge count is halved for reporting, as in the original script
message("Updated PrimeKG Edges:\t", nrow(primeKG_edges) / 2)
This script uses the file hgnc_complete_set.txt
(see source), which was downloaded from the HUGO Gene Nomenclature Committee (HGNC) and is used to resolve conflicting gene names by mapping each Entrez gene ID to its official symbol.
Metadata
Metadata
Assignees
Labels
No labels