Maybe One‐Off commands

Find base_studies with the same doi

from sqlalchemy import func, exists
from sqlalchemy.orm import aliased

# Ensure only studies with versions are included
same_dois = (
    BaseStudy.query
    .filter(exists().where(BaseStudy.id == Study.base_study_id))  # Filter to only include studies with versions
    .group_by(BaseStudy.doi)
    .having(func.count(BaseStudy.doi) > 1)
    .with_entities(
        BaseStudy.doi,
        func.array_agg(BaseStudy.id).label("id_list")
    )
    .all()
)

from sqlalchemy import or_

# Collect all IDs to batch query
all_ids = [id_ for doi in same_dois if len(doi[1]) == 2 for id_ in doi[1]]
id_to_study = {s.id: s for s in BaseStudy.query.filter(BaseStudy.id.in_(all_ids)).all()}

keep = []
delete = []

for doi in same_dois:
    if len(doi[1]) != 2:
        continue

    first_id, second_id = doi[1]
    first = id_to_study[first_id]
    second = id_to_study[second_id]

    # Determine priority based on number of identifiers
    first_id_count = sum(bool(getattr(first, attr)) for attr in ["doi", "pmid", "pmcid"])
    second_id_count = sum(bool(getattr(second, attr)) for attr in ["doi", "pmid", "pmcid"])

    if second_id_count > first_id_count:
        first, second = second, first  # Swap so that first is always the higher priority study

    # Merge identifiers (priority study retains its values unless missing)
    first.pmid = first.pmid or second.pmid
    first.pmcid = first.pmcid or second.pmcid
    first.name = first.name or second.name
    first.description = first.description or second.description
    first.year = first.year or second.year
    first.publication = first.publication or second.publication
    first.metadata_ = first.metadata_ or second.metadata_

    # Merge versions/studies
    first.versions.extend(second.versions)

    # Add to lists for keeping and deleting
    keep.append(first)
    delete.append(second)

    # Optional logging
    print(f"doi: {doi[0]}")
    print(f"name: {first.name}")
    print(f"pmid: {first.pmid}")
    print(f"pmcid: {first.pmcid}")
    print(f"studies: {len(first.versions)} versions merged")
    print(f"first source: {first.versions[0].source if first.versions else 'No source'}")

# Optional: Delete duplicates after processing
#for study in delete:
#    db.session.delete(study)

# Commit the changes
# db.session.commit()

Divide combined studies

dup_bs = []
for bs in BaseStudy.query.options(joinedload(BaseStudy.versions)):
    if len(bs.versions) < 1:
        continue
    pmid = bs.versions[0].pmid
    for s in bs.versions[1:]:
        if not pmid:
            pmid = s.pmid
        if s.pmid and s.pmid != pmid:
            if bs not in dup_bs:
                dup_bs.append(bs)
            print(bs.id)

new_bs = []
for bs in dup_bs:
    orig_pmid = bs.pmid
    groups = {}
    for v in bs.versions:
        if v.pmid == orig_pmid:
            continue
        if v.pmid in groups:
            groups[v.pmid].append(v)
        else:
            groups[v.pmid] = [v]
    for pmid, studies in groups.items():
        new_bs.append(BaseStudy(
            name=next((x.name for x in studies if x.name), None),
            pmid=pmid,
            doi=next((x.doi for x in studies if x.doi), None),
            authors=next((x.authors for x in studies if x.authors), None),
            year=next((x.year for x in studies if x.year), None),
            description=next((x.description for x in studies if x.description), None),
            publication=next((x.publication for x in studies if x.publication), None),
            metadata_=next((x.metadata for x in studies if x.metadata_), None),
            level="group",
            public=True,
            versions=studies,
       ))

db.session.add_all(new_bs)
db.session.commit()

Update provenance

from sqlalchemy.orm.attributes import flag_modified


to_commit = []
for p in Project.query:
    if not p.provenance.get("extractionMetadata"):
        continue
    for ss in p.provenance['extractionMetadata']['studyStatusList']: 
        if ss['status'] == "COMPLETE":
            ss['status'] = 'completed'
        elif ss['status'] == 'SAVEFORLATER':
            ss['status'] = 'savedforlater'
   
    flag_modified(p, "provenance")
    to_commit.append(p)

db.session.add_all(to_commit)
db.session.commit()

Add order to existing analysis objects

from sqlalchemy.orm import joinedload

def order_objects(objects):
    # Check if all objects have table_id defined
    if all(hasattr(obj, 'table_id') and obj.table_id is not None for obj in objects):
        # Check if all table_ids can be represented as integers
        if all(obj.table_id.isdigit() for obj in objects):
            # Convert table_ids to integers and sort based on value
            objects.sort(key=lambda obj: int(obj.table_id))
        else:
            # Sort based on string comparison of table_ids
            objects.sort(key=lambda obj: obj.table_id)
    else:
        # Check if all names can be represented as integers
        if all(obj.name.isdigit() for obj in objects):
            # Convert names to integers and sort based on value
            objects.sort(key=lambda obj: int(obj.name))
        else:
            # Sort based on string comparison of names
            objects.sort(key=lambda obj: obj.name)
    
    # Assign order attribute to each object
    for i, obj in enumerate(objects):
        obj.order = i

studies = Study.query.options(joinedload(Study.analyses)).all()

handle studies with doi, but no pmid

doi_no_pmid = BaseStudy.query.filter(and_(BaseStudy.doi != None, BaseStudy.doi != '')).filter(or_(BaseStudy.pmid == None, BaseStudy.pmid == '')).all()

from sqlalchemy import or_

to_commit = []
to_delete = []

for bs in doi_no_pmid:
    pmids = doi_to_pmid(bs.doi)
    if len(pmids) == 1:
        pmid = pmids[0]

        # Find all studies with matching DOI or PMID
        other_bss = BaseStudy.query.filter(
            or_(BaseStudy.doi == bs.doi, BaseStudy.pmid == pmid)
        ).all()

        # Combine with the current bs and remove duplicates
        all_bss = {s.id: s for s in other_bss + [bs]}.values()

        # Choose a base with both DOI and PMID if available
        base = next((s for s in all_bss if s.doi and s.pmid), None)
        if not base:
            # Fall back to one with PMID, then DOI, then any
            base = next((s for s in all_bss if s.pmid), None) or \
                   next((s for s in all_bss if s.doi), None) or \
                   list(all_bss)[0]

        for other in all_bss:
            if other.id == base.id:
                continue

            for attr in ['name', 'description', 'metadata', 'publication', 'authors', 'year', 'level']:
                base_val = getattr(base, attr)
                other_val = getattr(other, attr)
                if not base_val and other_val:
                    setattr(base, attr, other_val)

            # Merge versions
            base.versions.extend(other.versions)
            to_delete.append(other)

        # Ensure base has both DOI and PMID
        base.doi = base.doi or bs.doi
        base.pmid = base.pmid or pmid

        # Update versions with PMID
        for v in base.versions:
            if not v.pmid:
                v.pmid = pmid
            to_commit.append(v)

        to_commit.append(base)
        print(base.name)

# After loop:
# db.session.add_all(to_commit)
# for bs in to_delete:
#     db.session.delete(bs)
# db.session.commit()

Add PMCIDs

from Bio import Entrez
import time

Entrez.email = "jamesdkent21@gmail.com"  # Replace with your email

BATCH_SIZE = 800

def fetch_pmcids_batch(pmids):
    """
    Given a list of PMIDs, return a dict {pmid: pmcid}
    """
    pmcid_map = {}
    try:
        handle = Entrez.elink(
            dbfrom="pubmed",
            db="pmc",
            id=pmids,
            linkname="pubmed_pmc"
        )
        records = Entrez.read(handle)
        for record, pmid in zip(records, pmids):
            for linkset in record.get("LinkSetDb", []):
                for link in linkset.get("Link", []):
                    pmcid_map[pmid] = f"PMC{link['Id']}"
    except Exception as e:
        print(f"Error during batch fetch: {e}")
    return pmcid_map

# Query all BaseStudies with pmid but missing pmcid
studies = BaseStudy.query.filter(
    BaseStudy.pmid != None,
    (BaseStudy.pmcid == None) | (BaseStudy.pmcid == '')
).all()

# Map from pmid string to BaseStudy
pmid_to_study = {str(study.pmid): study for study in studies}
all_pmids = list(pmid_to_study.keys())

to_commit = []

for i in range(0, len(all_pmids), BATCH_SIZE):
    batch_pmids = all_pmids[i:i + BATCH_SIZE]
    print(f"Fetching batch {i} to {i+len(batch_pmids)}")

    pmcid_map = fetch_pmcids_batch(batch_pmids)

    for pmid, pmcid in pmcid_map.items():
        study = pmid_to_study[pmid]
        study.pmcid = pmcid
        to_commit.append(study)
        print(f"PMID {pmid} → PMCID {pmcid}")

    # NCBI recommends 3 requests per second max
    time.sleep(0.5)

# Commit all changes
if to_commit:
    db.session.add_all(to_commit)
    db.session.commit()
    print(f"Committed {len(to_commit)} updates.")
else:
    print("No updates made.")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Maybe One‐Off commands

Find base_studies with the same doi

Divide combined studies

Update provenance

Add order to existing analysis objects

handle studies with doi, but no pmid

Add PMCIDs

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally