Skip to content

88 duplicate nodes with conflicting names #19

@ayushnoori

Description

@ayushnoori

We have observed that 88 nodes in the July 2023 version of the KG (before restricting to the largest connected component, LCC) have non-harmonized names: the same node ID and type appears under more than one name. For example, the gene MT-ND5 is represented as both "ND5" and "MT-ND5," and the gallbladder anatomy node is represented as both "gallbladder" and "gall bladder." Below is a quick R script to merge these nodes:

# Read the raw PrimeKG edge list (one row per directed edge, with x_* and y_*
# endpoint columns) from the auxiliary KG directory.
kg_raw_path <- here(primeKG_dir, "kg", "auxiliary", "kg_raw.csv")
primeKG_edges <- fread(kg_raw_path)

# Normalise the one relation label that contains a hyphen and a space.
primeKG_edges[relation == "off-label use", relation := "off_label_use"]

# Build the node table from the x side of the edges: every distinct
# (id, type, name, source) combination becomes one row, with the x_* columns
# renamed to node_*.
primeKG_nodes <- unique(
  primeKG_edges[, .(
    node_id = x_id,
    node_type = x_type,
    node_name = x_name,
    node_source = x_source
  )]
)

# find and consolidate duplicate nodes
# A node is identified by its (node_id, node_type) pair; any pair that occurs
# on more than one row of the node table is treated as a duplicate.
primeKG_nodes[, joint_id := paste(node_id, node_type, sep = "_")]
dup_list <- primeKG_nodes[duplicated(joint_id), joint_id]
dup_nodes <- primeKG_nodes[joint_id %in% dup_list][order(joint_id)]

# separate out duplicate genes and anatomy, manually investigate
# The anatomy name is hard-coded from manual inspection, so it is only valid
# when the gall bladder node is the single duplicated anatomy node. Warn
# instead of silently renaming unrelated anatomy nodes.
dup_anatomy <- dup_nodes[node_type == "anatomy"]
if (length(unique(dup_anatomy[, joint_id])) > 1L) {
  warning(
    "More than one duplicated anatomy node found; all of them will be ",
    "renamed to \"gall bladder\". Inspect dup_anatomy before continuing.",
    call. = FALSE
  )
}
dup_anatomy[, final_name := "gall bladder"]
dup_gene <- dup_nodes[node_type == "gene/protein"]

# read HGNC official IDs
hgnc_set <- fread(
  here("Data", "ID_mappings", "hgnc_complete_set.txt"),
  sep = "\t"
)
hgnc_set[, entrez_id := as.character(entrez_id)]

# Keep one symbol per Entrez ID and drop rows without an Entrez ID so the
# left join below cannot multiply gene rows.
hgnc_symbols <- unique(
  hgnc_set[!is.na(entrez_id), .(symbol, entrez_id)],
  by = "entrez_id"
)
dup_gene <- merge(
  dup_gene, hgnc_symbols,
  by.x = "node_id", by.y = "entrez_id",
  all.x = TRUE, all.y = FALSE
)
setnames(dup_gene, "symbol", "final_name")

# Genes without an HGNC match come out of the left join with an NA symbol.
# Fall back to the first existing name of that node so the name is still
# harmonised and never overwritten with NA.
missing_symbol <- dup_gene[, is.na(final_name) | final_name == ""]
if (any(missing_symbol)) {
  warning(
    sum(missing_symbol), " duplicate gene row(s) have no HGNC symbol; ",
    "keeping the first existing name for each affected node.",
    call. = FALSE
  )
  dup_gene[, fallback_name := node_name[1L], by = joint_id]
  dup_gene[missing_symbol, final_name := fallback_name]
  dup_gene[, fallback_name := NULL]
}

# combine back
# Match columns by name rather than position, then promote the harmonised
# name to node_name and collapse rows that are now identical.
dup_nodes <- rbind(dup_anatomy, dup_gene, use.names = TRUE)
dup_nodes[, node_name := final_name]
dup_nodes[, final_name := NULL]
dup_nodes <- unique(dup_nodes)

# replace names as necessary
# One harmonised name per (node_id, node_type). Update joins rewrite the
# matching rows by reference in a single pass per column, and are a no-op
# when dup_nodes is empty (a `1:nrow()` loop would iterate over c(1, 0)).
name_map <- unique(dup_nodes[, .(node_id, node_type, node_name)])
primeKG_nodes[
  name_map,
  on = .(node_id, node_type),
  node_name := i.node_name
]
primeKG_edges[
  name_map,
  on = .(x_id = node_id, x_type = node_type),
  x_name := i.node_name
]
primeKG_edges[
  name_map,
  on = .(y_id = node_id, y_type = node_type),
  y_name := i.node_name
]

# drop duplicates from nodes
# Rows that differed only by name are now identical and collapse here.
non_dup_rows <- nrow(primeKG_nodes)
primeKG_nodes <- unique(primeKG_nodes)
message("Removed ", non_dup_rows - nrow(primeKG_nodes), " duplicates")

# make indices
# Zero-based, contiguous node index. seq_len(.N) stays correct (empty) for a
# zero-row table, where `1:nrow(x) - 1` would yield c(0, -1).
primeKG_nodes[, node_index := seq_len(.N) - 1L]
setcolorder(primeKG_nodes, "node_index")

# add indices to edges
# Temporary "<id>_<name>_<source>" keys on the nodes and on both edge
# endpoints; x_index / y_index are copies of node_index so each merge below
# brings in an appropriately named column.
primeKG_nodes[
  , node_string := paste(node_id, node_name, node_source, sep = "_")
]
primeKG_nodes[, x_index := node_index]
primeKG_nodes[, y_index := node_index]
primeKG_edges[, x_string := paste(x_id, x_name, x_source, sep = "_")]
primeKG_edges[, y_string := paste(y_id, y_name, y_source, sep = "_")]

# merge back to edges
# These are inner joins: an edge whose endpoint key is absent from the node
# table is dropped, and a non-unique node key duplicates edges. Compare row
# counts so either case is reported instead of passing silently.
n_edges_before <- nrow(primeKG_edges)
primeKG_edges <- merge(
  primeKG_edges, primeKG_nodes[, .(node_string, x_index)],
  by.x = "x_string", by.y = "node_string", sort = FALSE
)
primeKG_edges <- merge(
  primeKG_edges, primeKG_nodes[, .(node_string, y_index)],
  by.x = "y_string", by.y = "node_string", sort = FALSE
)
if (nrow(primeKG_edges) != n_edges_before) {
  warning(
    "Edge count changed from ", n_edges_before, " to ", nrow(primeKG_edges),
    " while attaching node indices; check for edge endpoints missing from ",
    "the node table or for non-unique node keys.",
    call. = FALSE
  )
}

# drop merge columns
primeKG_nodes[, node_string := NULL]
primeKG_nodes[, x_index := NULL]
primeKG_nodes[, y_index := NULL]
primeKG_edges[, x_string := NULL]
primeKG_edges[, y_string := NULL]
setcolorder(
  primeKG_edges,
  c(
    "relation", "display_relation",
    "x_index", "x_id", "x_type", "x_name", "x_source",
    "y_index", "y_id", "y_type", "y_name", "y_source"
  )
)

# Report the final graph size. The edge row count is halved before printing
# (presumably because each edge is stored once per direction; confirm
# against the raw KG).
node_total <- nrow(primeKG_nodes)
edge_total <- nrow(primeKG_edges) / 2
message("Updated PrimeKG Nodes:\t", node_total)
message("Updated PrimeKG Edges:\t", edge_total)

This script uses the file hgnc_complete_set.txt (see source), which was downloaded from the HUGO Gene Nomenclature Committee (HGNC), to resolve conflicting gene names.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions