Skip to content

Maybe One-Off commands

James Kent edited this page Apr 17, 2025 · 18 revisions

Find base_studies with the same DOI

from sqlalchemy import func, exists
from sqlalchemy.orm import aliased

# Find DOIs shared by more than one BaseStudy, restricted to base studies
# that actually have at least one Study version attached.
has_versions = exists().where(BaseStudy.id == Study.base_study_id)

same_dois = (
    BaseStudy.query
    .with_entities(
        BaseStudy.doi,
        func.array_agg(BaseStudy.id).label("id_list"),
    )
    .filter(has_versions)  # only include studies with versions
    .group_by(BaseStudy.doi)
    .having(func.count(BaseStudy.doi) > 1)
    .all()
)
from sqlalchemy import or_

# Gather the IDs from every DOI group that has exactly two duplicates,
# then load all of those BaseStudy rows in a single batched query.
all_ids = [
    study_id
    for doi_row in same_dois
    if len(doi_row[1]) == 2
    for study_id in doi_row[1]
]
id_to_study = {
    bs.id: bs
    for bs in BaseStudy.query.filter(BaseStudy.id.in_(all_ids)).all()
}

keep = []
delete = []


def _identifier_count(study):
    # Priority score: how many of doi/pmid/pmcid are populated.
    return sum(bool(getattr(study, field)) for field in ("doi", "pmid", "pmcid"))


for doi_row in same_dois:
    # Only handle exact pairs of duplicates; larger groups are skipped.
    if len(doi_row[1]) != 2:
        continue

    id_a, id_b = doi_row[1]
    primary, secondary = id_to_study[id_a], id_to_study[id_b]

    # Keep the study with more identifiers as the primary record.
    if _identifier_count(secondary) > _identifier_count(primary):
        primary, secondary = secondary, primary

    # Fill any missing fields on the primary from its duplicate
    # (the primary retains its own values when present).
    for field in ("pmid", "pmcid", "name", "description", "year", "publication", "metadata_"):
        setattr(primary, field, getattr(primary, field) or getattr(secondary, field))

    # Move the duplicate's versions onto the kept study.
    primary.versions.extend(secondary.versions)

    keep.append(primary)
    delete.append(secondary)

    # Optional logging
    print(f"doi: {doi_row[0]}")
    print(f"name: {primary.name}")
    print(f"pmid: {primary.pmid}")
    print(f"pmcid: {primary.pmcid}")
    print(f"studies: {len(primary.versions)} versions merged")
    print(f"first source: {primary.versions[0].source if primary.versions else 'No source'}")

# Optional: Delete duplicates after processing
#for study in delete:
#    db.session.delete(study)

# Commit the changes
# db.session.commit()

Divide combined studies

# Collect base studies whose attached versions disagree on PMID.
dup_bs = []
for bs in BaseStudy.query.options(joinedload(BaseStudy.versions)):
    versions = bs.versions
    if not versions:
        continue
    reference_pmid = versions[0].pmid
    for version in versions[1:]:
        # Adopt the first non-empty PMID seen as the reference.
        reference_pmid = reference_pmid or version.pmid
        if version.pmid and version.pmid != reference_pmid:
            if bs not in dup_bs:
                dup_bs.append(bs)
            print(bs.id)
# Split each combined base study: versions whose PMID differs from the
# original base study's PMID are regrouped into new BaseStudy records,
# one per distinct PMID.
new_bs = []
for bs in dup_bs:
    orig_pmid = bs.pmid
    groups = {}
    for v in bs.versions:
        if v.pmid == orig_pmid:
            continue
        groups.setdefault(v.pmid, []).append(v)
    for pmid, studies in groups.items():
        new_bs.append(BaseStudy(
            name=next((x.name for x in studies if x.name), None),
            pmid=pmid,
            doi=next((x.doi for x in studies if x.doi), None),
            authors=next((x.authors for x in studies if x.authors), None),
            year=next((x.year for x in studies if x.year), None),
            description=next((x.description for x in studies if x.description), None),
            publication=next((x.publication for x in studies if x.publication), None),
            # BUG FIX: read x.metadata_ (the model column, as used elsewhere in
            # this page), not x.metadata — on a SQLAlchemy declarative model
            # `.metadata` is the registry's MetaData object, not the column.
            metadata_=next((x.metadata_ for x in studies if x.metadata_), None),
            level="group",
            public=True,
            versions=studies,
        ))

db.session.add_all(new_bs)
db.session.commit()

Update provenance

from sqlalchemy.orm.attributes import flag_modified


# Normalize legacy study-status values inside each project's provenance JSON.
to_commit = []
for p in Project.query:
    # Guard: provenance may be NULL for old projects — calling .get on None
    # would raise AttributeError. Also tolerate a missing studyStatusList key.
    if not p.provenance or not p.provenance.get("extractionMetadata"):
        continue
    for ss in p.provenance['extractionMetadata'].get('studyStatusList', []):
        if ss['status'] == "COMPLETE":
            ss['status'] = 'completed'
        elif ss['status'] == 'SAVEFORLATER':
            ss['status'] = 'savedforlater'

    # In-place mutation of a JSON column is invisible to SQLAlchemy's change
    # tracking, so explicitly mark the attribute dirty.
    flag_modified(p, "provenance")
    to_commit.append(p)

db.session.add_all(to_commit)
db.session.commit()

Add order to existing analysis objects

from sqlalchemy.orm import joinedload

def order_objects(objects):
    """Sort *objects* in place and assign each a zero-based ``order`` attribute.

    Sort key preference: ``table_id`` when every object defines a non-None
    one, otherwise ``name``. Values that are all purely numeric strings are
    compared as integers; anything else is compared lexicographically.
    """
    every_has_table_id = all(
        getattr(obj, "table_id", None) is not None for obj in objects
    )
    key_attr = "table_id" if every_has_table_id else "name"

    values = [getattr(obj, key_attr) for obj in objects]
    if all(value.isdigit() for value in values):
        # Numeric sort, e.g. "10" after "2".
        sort_key = lambda obj: int(getattr(obj, key_attr))
    else:
        # Plain lexicographic sort on the raw strings.
        sort_key = lambda obj: getattr(obj, key_attr)
    objects.sort(key=sort_key)

    # Persist each object's final position.
    for position, obj in enumerate(objects):
        obj.order = position

# Eager-load each study's analyses in one query. NOTE(review): the snippet as
# pasted stops here — presumably each study's analyses were then passed to
# order_objects; confirm against the original one-off script.
studies = Study.query.options(joinedload(Study.analyses)).all()

Handle studies with a DOI but no PMID

# Base studies that have a DOI set but no PMID.
doi_no_pmid = (
    BaseStudy.query
    .filter(and_(BaseStudy.doi != None, BaseStudy.doi != ''))
    .filter(or_(BaseStudy.pmid == None, BaseStudy.pmid == ''))
    .all()
)

from sqlalchemy import or_

# For each DOI-only base study, resolve its PMID and merge any other base
# studies sharing that DOI or PMID into a single record.
to_commit = []
to_delete = []

for bs in doi_no_pmid:
    pmids = doi_to_pmid(bs.doi)
    # Only act on unambiguous resolutions (exactly one PMID for the DOI).
    if len(pmids) == 1:
        pmid = pmids[0]

        # Find all studies with matching DOI or PMID
        other_bss = BaseStudy.query.filter(
            or_(BaseStudy.doi == bs.doi, BaseStudy.pmid == pmid)
        ).all()

        # Combine with the current bs and remove duplicates
        all_bss = {s.id: s for s in other_bss + [bs]}.values()

        # Choose a base with both DOI and PMID if available
        base = next((s for s in all_bss if s.doi and s.pmid), None)
        if not base:
            # Fall back to one with PMID, then DOI, then any
            base = next((s for s in all_bss if s.pmid), None) or \
                   next((s for s in all_bss if s.doi), None) or \
                   list(all_bss)[0]

        for other in all_bss:
            if other.id == base.id:
                continue

            # BUG FIX: the column is 'metadata_', not 'metadata' — on a
            # SQLAlchemy declarative model getattr(base, 'metadata') returns
            # the registry MetaData object (always truthy), so the metadata
            # field was never actually merged.
            for attr in ['name', 'description', 'metadata_', 'publication', 'authors', 'year', 'level']:
                base_val = getattr(base, attr)
                other_val = getattr(other, attr)
                if not base_val and other_val:
                    setattr(base, attr, other_val)

            # Merge versions
            base.versions.extend(other.versions)
            to_delete.append(other)

        # Ensure base has both DOI and PMID
        base.doi = base.doi or bs.doi
        base.pmid = base.pmid or pmid

        # Update versions with PMID
        for v in base.versions:
            if not v.pmid:
                v.pmid = pmid
            to_commit.append(v)

        to_commit.append(base)
        print(base.name)

# After loop:
# db.session.add_all(to_commit)
# for bs in to_delete:
#     db.session.delete(bs)
# db.session.commit()

Add PMCIDs

from Bio import Entrez
import time

Entrez.email = "jamesdkent21@gmail.com"  # Replace with your email (NCBI requires a contact address)

BATCH_SIZE = 800  # number of PMIDs sent per elink request

def fetch_pmcids_batch(pmids):
    """
    Given a list of PMIDs, return a dict {pmid: pmcid}.

    BUG FIX: the previous version zipped the returned LinkSet records against
    the input PMID list, assuming ELink returns exactly one record per input
    ID in input order — NCBI does not guarantee that. Each LinkSet record
    carries the PMID it was built from in its "IdList", so read it from there.
    """
    pmcid_map = {}
    try:
        handle = Entrez.elink(
            dbfrom="pubmed",
            db="pmc",
            id=pmids,  # Biopython sends one id parameter per list element
            linkname="pubmed_pmc"
        )
        records = Entrez.read(handle)
        handle.close()
        for record in records:
            id_list = record.get("IdList", [])
            if not id_list:
                continue
            pmid = str(id_list[0])
            for linkset in record.get("LinkSetDb", []):
                for link in linkset.get("Link", []):
                    # If several PMC links exist, the last one wins (as before).
                    pmcid_map[pmid] = f"PMC{link['Id']}"
    except Exception as e:
        # Best-effort: a failed batch simply contributes no mappings.
        print(f"Error during batch fetch: {e}")
    return pmcid_map

# Query all BaseStudies with a PMID but a missing/empty PMCID.
studies = BaseStudy.query.filter(
    BaseStudy.pmid != None,
    (BaseStudy.pmcid == None) | (BaseStudy.pmcid == '')
).all()

# Map from pmid string to BaseStudy
pmid_to_study = {str(study.pmid): study for study in studies}
all_pmids = list(pmid_to_study.keys())

to_commit = []

for i in range(0, len(all_pmids), BATCH_SIZE):
    batch_pmids = all_pmids[i:i + BATCH_SIZE]
    print(f"Fetching batch {i} to {i+len(batch_pmids)}")

    pmcid_map = fetch_pmcids_batch(batch_pmids)

    for pmid, pmcid in pmcid_map.items():
        # Defensive lookup: if Entrez echoes an ID in a different format than
        # we sent, skip it instead of crashing the whole run with a KeyError.
        study = pmid_to_study.get(str(pmid))
        if study is None:
            print(f"Skipping unrecognized PMID {pmid}")
            continue
        study.pmcid = pmcid
        to_commit.append(study)
        print(f"PMID {pmid} → PMCID {pmcid}")

    # NCBI recommends 3 requests per second max (without an API key)
    time.sleep(0.5)

# Commit all changes
if to_commit:
    db.session.add_all(to_commit)
    db.session.commit()
    print(f"Committed {len(to_commit)} updates.")
else:
    print("No updates made.")
Clone this wiki locally