88 duplicate nodes with conflicting names

We have observed that 88 nodes in the July 2023 version of the KG (before restricting to the largest connected component, LCC) have non-harmonized names: the same node ID and type appears under more than one name. For example, the gene MT-ND5 is represented as both "ND5" and "MT-ND5", and the gallbladder anatomy node is represented as both "gallbladder" and "gall bladder". Below is a quick R script to merge these nodes:

```R
# NOTE: this snippet assumes data.table, here, and a provider of `%>%`
# (e.g. magrittr) are already attached, and that `primeKG_dir` is defined.

# load edges of updated PrimeKG
primeKG_edges <- fread(here(primeKG_dir, "kg", "auxiliary", "kg_raw.csv"))

# replace "off-label use" with off_label_use
primeKG_edges[relation == "off-label use", relation := "off_label_use"]

# construct node matrix: one row per distinct combination of the x-side
# node columns found in the edge table
primeKG_nodes <- unique(primeKG_edges[, .(x_id, x_type, x_name, x_source)])

# rename x_* -> node_* by reference; listing the columns explicitly avoids
# rewriting any other "x" that might occur inside a column name
setnames(
  primeKG_nodes,
  c("x_id", "x_type", "x_name", "x_source"),
  c("node_id", "node_type", "node_name", "node_source")
)

# find and consolidate duplicate nodes: a node is identified by (id, type),
# so an (id, type) pair occurring on more than one row of primeKG_nodes is
# a duplicate
primeKG_nodes[, joint_id := paste(node_id, node_type, sep = "_")]
dup_list <- primeKG_nodes[duplicated(joint_id), joint_id]
dup_nodes <- primeKG_nodes[joint_id %in% dup_list][order(joint_id)]

# separate out duplicate genes and anatomy, manually investigate
# NOTE: every duplicated anatomy node is renamed to "gall bladder"; this
# assumes the gallbladder is the only duplicated anatomy node (re-check when
# running on another KG version). Duplicates of any other node type are not
# carried forward and therefore stay un-harmonized.
dup_anatomy <- dup_nodes[node_type == "anatomy"]
dup_anatomy[, final_name := "gall bladder"]
dup_gene <- dup_nodes[node_type == "gene/protein"]

# read HGNC official IDs; entrez_id is cast to character before it is used
# as the merge key against node_id
hgnc_set <- fread(
  here("Data", "ID_mappings", "hgnc_complete_set.txt"),
  sep = "\t"
)
hgnc_set[, entrez_id := as.character(entrez_id)]

# left join: keep every duplicated gene row, attach the official symbol
dup_gene <- merge(
  dup_gene, hgnc_set[, .(symbol, entrez_id)],
  by.x = "node_id", by.y = "entrez_id",
  all.x = TRUE, all.y = FALSE
)
setnames(dup_gene, "symbol", "final_name")

# a gene whose Entrez ID is absent from the HGNC file comes back with an NA
# symbol; keep its first observed name rather than overwriting the node
# name with NA further down
unmapped <- dup_gene[is.na(final_name), unique(node_id)]
if (length(unmapped) > 0) {
  warning(
    "No HGNC symbol found for Entrez ID(s): ",
    paste(unmapped, collapse = ", "),
    "; keeping the first observed name for these nodes",
    call. = FALSE
  )
  dup_gene[is.na(final_name), final_name := node_name[1L], by = joint_id]
}

# combine back: node_name now holds the harmonized name
dup_nodes <- rbind(dup_anatomy, dup_gene) %>%
  .[, node_name := final_name] %>%
  .[, final_name := NULL] %>%
  unique()

# replace names as necessary
# Update joins: every row whose (id, type) matches a row of dup_nodes takes
# that row's harmonized name, in the node table and on both edge endpoints.
# Unlike a `for (i in 1:nrow(dup_nodes))` loop, this is a no-op when
# dup_nodes is empty.
primeKG_nodes[
  dup_nodes,
  on = .(node_id, node_type),
  node_name := i.node_name
]
primeKG_edges[
  dup_nodes,
  on = .(x_id = node_id, x_type = node_type),
  x_name := i.node_name
]
primeKG_edges[
  dup_nodes,
  on = .(y_id = node_id, y_type = node_type),
  y_name := i.node_name
]

# drop duplicates from nodes
# non_dup_rows holds the row count *before* de-duplication, so the
# difference below is the number of rows removed
non_dup_rows <- nrow(primeKG_nodes)
primeKG_nodes <- unique(primeKG_nodes)
message("Removed ", non_dup_rows - nrow(primeKG_nodes), " duplicates")

# make indices: 0-based row index, moved to the first column
# (seq_len(.N) stays well-defined for an empty table, unlike 1:nrow(...))
primeKG_nodes[, node_index := seq_len(.N) - 1]
setcolorder(primeKG_nodes, "node_index")

# add indices to edges
# The lookup key is built from id, type, name and source, i.e. every column
# that distinguishes one row of primeKG_nodes from another. Leaving the type
# out would let two node types that share id, name and source collide and
# fan out edges in the merges below.
primeKG_nodes[
  ,
  node_string := paste(node_id, node_type, node_name, node_source, sep = "_")
]
primeKG_nodes[, `:=`(x_index = node_index, y_index = node_index)]
primeKG_edges[, `:=`(
  x_string = paste(x_id, x_type, x_name, x_source, sep = "_"),
  y_string = paste(y_id, y_type, y_name, y_source, sep = "_")
)]

# merge back to edges
# These are inner joins: an edge whose endpoint has no row in primeKG_nodes
# is dropped from the result.
primeKG_edges <- merge(
  primeKG_edges, primeKG_nodes[, .(node_string, x_index)],
  by.x = "x_string", by.y = "node_string", sort = FALSE
)
primeKG_edges <- merge(
  primeKG_edges, primeKG_nodes[, .(node_string, y_index)],
  by.x = "y_string", by.y = "node_string", sort = FALSE
)

# drop merge columns
primeKG_nodes[, c("node_string", "x_index", "y_index") := NULL]
primeKG_edges[, c("x_string", "y_string") := NULL]
setcolorder(primeKG_edges, c("relation", "display_relation", "x_index", "x_id", "x_type", "x_name", "x_source", "y_index", "y_id", "y_type", "y_name", "y_source"))

# print node counts
message(paste0("Updated PrimeKG Nodes:\t", nrow(primeKG_nodes)))
# the edge row count is halved for the report, presumably because each edge
# is stored once per direction (TODO confirm)
message(paste0("Updated PrimeKG Edges:\t", nrow(primeKG_edges) / 2))
```

This script uses the file `hgnc_complete_set.txt` (see [source](https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt)), downloaded from the [HUGO Gene Nomenclature Committee (HGNC)](https://www.genenames.org/download/statistics-and-files/), to resolve conflicting gene names: each duplicated gene node is renamed to the official HGNC symbol that matches its Entrez ID.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

88 duplicate nodes with conflicting names #19

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

88 duplicate nodes with conflicting names #19

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions