Maybe One‐Off commands
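# The snippets below are one-off maintenance commands. They are assumed to be
# run from a Flask shell where the models (BaseStudy, Study, Project) and the
# db session are already available.

# --- Find BaseStudy records that share the same DOI (only those with versions) ---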
from sqlalchemy import func, exists
# Ensure only studies with versions are included
same_dois = (
BaseStudy.query
.filter(exists().where(BaseStudy.id == Study.base_study_id)) # Filter to only include studies with versions
.group_by(BaseStudy.doi)
.having(func.count(BaseStudy.doi) > 1)
.with_entities(
BaseStudy.doi,
func.array_agg(BaseStudy.id).label("id_list")
)
.all()
)
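# --- Merge pairs of BaseStudy records that share a DOI, keeping the record with the most identifiers ---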
from sqlalchemy import or_
# Collect all IDs to batch query
all_ids = [id_ for _, id_list in same_dois if len(id_list) == 2 for id_ in id_list]
id_to_study = {s.id: s for s in BaseStudy.query.filter(BaseStudy.id.in_(all_ids)).all()}
keep = []
delete = []
for doi_value, id_list in same_dois:
    if len(id_list) != 2:
        continue
    first_id, second_id = id_list
first = id_to_study[first_id]
second = id_to_study[second_id]
# Determine priority based on number of identifiers
first_id_count = sum(bool(getattr(first, attr)) for attr in ["doi", "pmid", "pmcid"])
second_id_count = sum(bool(getattr(second, attr)) for attr in ["doi", "pmid", "pmcid"])
if second_id_count > first_id_count:
first, second = second, first # Swap so that first is always the higher priority study
# Merge identifiers (priority study retains its values unless missing)
first.pmid = first.pmid or second.pmid
first.pmcid = first.pmcid or second.pmcid
first.name = first.name or second.name
first.description = first.description or second.description
first.year = first.year or second.year
first.publication = first.publication or second.publication
first.metadata_ = first.metadata_ or second.metadata_
# Merge versions/studies
first.versions.extend(second.versions)
# Add to lists for keeping and deleting
keep.append(first)
delete.append(second)
# Optional logging
print(f"doi: {doi[0]}")
print(f"name: {first.name}")
print(f"pmid: {first.pmid}")
print(f"pmcid: {first.pmcid}")
print(f"studies: {len(first.versions)} versions merged")
print(f"first source: {first.versions[0].source if first.versions else 'No source'}")
# Optional: Delete duplicates after processing
#for study in delete:
# db.session.delete(study)
# Commit the changes
# db.session.commit()
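# --- Split BaseStudy records whose versions carry conflicting PMIDs into one BaseStudy per PMID ---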
from sqlalchemy.orm import joinedload

dup_bs = []
for bs in BaseStudy.query.options(joinedload(BaseStudy.versions)):
if len(bs.versions) < 1:
continue
pmid = bs.versions[0].pmid
for s in bs.versions[1:]:
if not pmid:
pmid = s.pmid
if s.pmid and s.pmid != pmid:
if bs not in dup_bs:
dup_bs.append(bs)
print(bs.id)
new_bs = []
for bs in dup_bs:
orig_pmid = bs.pmid
groups = {}
for v in bs.versions:
if v.pmid == orig_pmid:
continue
if v.pmid in groups:
groups[v.pmid].append(v)
else:
groups[v.pmid] = [v]
for pmid, studies in groups.items():
new_bs.append(BaseStudy(
name=next((x.name for x in studies if x.name), None),
pmid=pmid,
doi=next((x.doi for x in studies if x.doi), None),
authors=next((x.authors for x in studies if x.authors), None),
year=next((x.year for x in studies if x.year), None),
description=next((x.description for x in studies if x.description), None),
publication=next((x.publication for x in studies if x.publication), None),
            metadata_=next((x.metadata_ for x in studies if x.metadata_), None),
level="group",
public=True,
versions=studies,
))
db.session.add_all(new_bs)
db.session.commit()
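# --- Rename project provenance study statuses: COMPLETE -> completed, SAVEFORLATER -> savedforlater ---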
from sqlalchemy.orm.attributes import flag_modified
to_commit = []
for p in Project.query:
    if not (p.provenance or {}).get("extractionMetadata"):
        continue
    for ss in p.provenance["extractionMetadata"].get("studyStatusList", []):
if ss['status'] == "COMPLETE":
ss['status'] = 'completed'
elif ss['status'] == 'SAVEFORLATER':
ss['status'] = 'savedforlater'
flag_modified(p, "provenance")
to_commit.append(p)
db.session.add_all(to_commit)
db.session.commit()
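# --- Assign a stable `order` to a list of objects (e.g. a study's analyses), by table_id or name ---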
from sqlalchemy.orm import joinedload
def order_objects(objects):
    """Assign a sequential `order` to each object, sorting by table_id when available, otherwise by name."""
# Check if all objects have table_id defined
if all(hasattr(obj, 'table_id') and obj.table_id is not None for obj in objects):
# Check if all table_ids can be represented as integers
if all(obj.table_id.isdigit() for obj in objects):
# Convert table_ids to integers and sort based on value
objects.sort(key=lambda obj: int(obj.table_id))
else:
# Sort based on string comparison of table_ids
objects.sort(key=lambda obj: obj.table_id)
else:
# Check if all names can be represented as integers
if all(obj.name.isdigit() for obj in objects):
# Convert names to integers and sort based on value
objects.sort(key=lambda obj: int(obj.name))
else:
# Sort based on string comparison of names
objects.sort(key=lambda obj: obj.name)
# Assign order attribute to each object
for i, obj in enumerate(objects):
obj.order = i
studies = Study.query.options(joinedload(Study.analyses)).all()
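# Usage sketch (an assumption: order_objects is meant to be applied to the
# analyses loaded above; uncomment the commit once the resulting order looks right):
for s in studies:
    order_objects(s.analyses)
# db.session.add_all(studies)
# db.session.commit()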
# --- Resolve missing PMIDs from DOIs and merge any duplicate BaseStudies that surface ---
from sqlalchemy import and_, or_

# BaseStudies that have a DOI but no PMID
doi_no_pmid = (
    BaseStudy.query
    .filter(and_(BaseStudy.doi != None, BaseStudy.doi != ''))
    .filter(or_(BaseStudy.pmid == None, BaseStudy.pmid == ''))
    .all()
)
to_commit = []
to_delete = []
for bs in doi_no_pmid:
    # doi_to_pmid is a helper (defined elsewhere) that returns the PMIDs matching a DOI
    pmids = doi_to_pmid(bs.doi)
if len(pmids) == 1:
pmid = pmids[0]
# Find all studies with matching DOI or PMID
other_bss = BaseStudy.query.filter(
or_(BaseStudy.doi == bs.doi, BaseStudy.pmid == pmid)
).all()
# Combine with the current bs and remove duplicates
all_bss = {s.id: s for s in other_bss + [bs]}.values()
# Choose a base with both DOI and PMID if available
base = next((s for s in all_bss if s.doi and s.pmid), None)
if not base:
# Fall back to one with PMID, then DOI, then any
base = next((s for s in all_bss if s.pmid), None) or \
next((s for s in all_bss if s.doi), None) or \
list(all_bss)[0]
for other in all_bss:
if other.id == base.id:
continue
            for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
base_val = getattr(base, attr)
other_val = getattr(other, attr)
if not base_val and other_val:
setattr(base, attr, other_val)
# Merge versions
base.versions.extend(other.versions)
to_delete.append(other)
# Ensure base has both DOI and PMID
base.doi = base.doi or bs.doi
base.pmid = base.pmid or pmid
# Update versions with PMID
for v in base.versions:
if not v.pmid:
v.pmid = pmid
to_commit.append(v)
to_commit.append(base)
print(base.name)
# After loop:
# db.session.add_all(to_commit)
# for bs in to_delete:
# db.session.delete(bs)
# db.session.commit()
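# --- Backfill missing PMCIDs from PubMed via Entrez ELink for BaseStudies that have a PMID ---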
from Bio import Entrez
import time
Entrez.email = "jamesdkent21@gmail.com" # Replace with your email
BATCH_SIZE = 800
def fetch_pmcids_batch(pmids):
"""
Given a list of PMIDs, return a dict {pmid: pmcid}
"""
pmcid_map = {}
try:
handle = Entrez.elink(
dbfrom="pubmed",
db="pmc",
id=pmids,
linkname="pubmed_pmc"
)
        records = Entrez.read(handle)
        handle.close()
        # Assumes ELink returns one LinkSet record per submitted PMID
        # (requires a Biopython version that sends each elink id separately;
        # otherwise a single merged LinkSet is returned)
        for record, pmid in zip(records, pmids):
for linkset in record.get("LinkSetDb", []):
for link in linkset.get("Link", []):
pmcid_map[pmid] = f"PMC{link['Id']}"
except Exception as e:
print(f"Error during batch fetch: {e}")
return pmcid_map
# Query all BaseStudies with pmid but missing pmcid
studies = BaseStudy.query.filter(
BaseStudy.pmid != None,
(BaseStudy.pmcid == None) | (BaseStudy.pmcid == '')
).all()
# Map from pmid string to BaseStudy
pmid_to_study = {str(study.pmid): study for study in studies}
all_pmids = list(pmid_to_study.keys())
to_commit = []
for i in range(0, len(all_pmids), BATCH_SIZE):
batch_pmids = all_pmids[i:i + BATCH_SIZE]
print(f"Fetching batch {i} to {i+len(batch_pmids)}")
pmcid_map = fetch_pmcids_batch(batch_pmids)
for pmid, pmcid in pmcid_map.items():
study = pmid_to_study[pmid]
study.pmcid = pmcid
to_commit.append(study)
print(f"PMID {pmid} → PMCID {pmcid}")
# NCBI recommends 3 requests per second max
time.sleep(0.5)
# Commit all changes
if to_commit:
db.session.add_all(to_commit)
db.session.commit()
print(f"Committed {len(to_commit)} updates.")
else:
print("No updates made.")