Skip to content

Add missing authors and journals

James Kent edited this page Apr 19, 2025 · 3 revisions

Refined script (cleaned-up version of the interactive session history recorded below)

from Bio import Entrez
from sqlalchemy import or_, func
from neurostore.models.data import BaseStudy, Study
from neurostore.database import db
from sqlalchemy.orm import joinedload

# Configure Entrez
Entrez.email = "jamesdkent21@gmail.com"  # Replace with your email
CHUNK_SIZE = 900  # PubMed API limit

def get_year_from_pubmed(pmid):
    """Get publication year from PubMed.

    Fetches the MEDLINE text record for ``pmid`` and reads the four
    characters that follow the ``DP  - `` prefix of its first ``DP`` line.

    Args:
        pmid: PubMed ID, passed to ``Entrez.efetch`` as the ``id`` argument.

    Returns:
        The year as an ``int``; ``None`` when there is no ``DP`` line, the
        four characters are not an integer, or the request fails.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = handle.read()
        finally:
            # Release the connection even when reading the response fails.
            handle.close()

        for line in record.splitlines():
            if line.startswith("DP  - "):
                try:
                    return int(line[6:10])
                except ValueError:
                    return None
        return None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller skip this record.
        print(f"Error fetching year for PMID {pmid}: {e}")
        return None

def _parse_medline_entry(entry):
    """Return (pmid, journal title, full author names) parsed from one MEDLINE text record."""
    pmid = None
    journal_name = None
    authors = []
    tag = None
    for line in entry.splitlines():
        if line[4:6] == "- ":
            # "TAG - value": the tag is padded to four characters.
            tag, value = line[:4].strip(), line[6:].strip()
            if tag == "PMID":
                pmid = value
            elif tag == "JT":
                journal_name = value
            elif tag == "FAU":
                authors.append(value)
        elif line.startswith("      ") and line.strip():
            # An indented line continues the previous field; without this a
            # journal title that wraps onto a second line is truncated.
            if tag == "JT" and journal_name is not None:
                journal_name = f"{journal_name} {line.strip()}"
            elif tag == "FAU" and authors:
                authors[-1] = f"{authors[-1]} {line.strip()}"
    return pmid, journal_name, authors


def get_metadata_from_pubmed(bss):
    """Get journal and authors from PubMed for multiple records.

    Args:
        bss: Iterable of study records exposing ``pmid``, ``publication``,
            ``authors`` and ``versions``. All PMIDs go into one request.

    Returns:
        A list with every study, followed by each of its versions, that
        received a journal title and/or author list. Empty when ``bss`` is
        empty or the request fails.
    """
    to_commit = []
    try:
        # Key by string so integer PMIDs can be joined and still match the
        # text IDs in the response; keep a list per PMID so duplicates are
        # all updated.
        pmid_dict = {}
        for bs in bss:
            pmid_dict.setdefault(str(bs.pmid).strip(), []).append(bs)
        if not pmid_dict:
            return to_commit  # nothing to look up; skip the network request

        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Release the connection even when reading the response fails.
            handle.close()

        # Records in the MEDLINE text output are separated by blank lines.
        for entry in records.split("\n\n"):
            pmid, journal_name, authors = _parse_medline_entry(entry)
            if not pmid or not (journal_name or authors):
                continue
            author_str = ";".join(authors)

            # .get(): skip a record whose PMID is not in this batch instead of
            # aborting the whole chunk with a KeyError.
            for bs in pmid_dict.get(pmid, ()):
                # Update the study itself, then its versions too.
                for record in (bs, *bs.versions):
                    if journal_name:
                        record.publication = journal_name
                    if authors:
                        record.authors = author_str
                    to_commit.append(record)

        return to_commit
    except Exception as e:
        print(f"Error fetching metadata: {e}")
        return []

def fix_metadata():
    """Main function to fix missing metadata.

    Runs two passes over ``BaseStudy`` rows that have a PMID:

    1. Rows whose ``year`` is NULL, before 1900 or later than the current
       year get the year returned by ``get_year_from_pubmed``.
    2. Rows whose ``publication`` or ``authors`` is NULL or blank are passed
       to ``get_metadata_from_pubmed`` in chunks of ``CHUNK_SIZE``.

    Year changes are copied onto each study's ``versions``. Everything is
    committed in a single transaction, which is rolled back if the commit
    fails.
    """
    from datetime import date

    # Fix years
    # "In the future" is judged against the day the script runs rather than a
    # hard-coded year that silently goes stale.
    current_year = date.today().year
    bad_years = BaseStudy.query.filter(
        or_(
            BaseStudy.year == None,  # noqa: E711 - SQLAlchemy renders this as IS NULL
            BaseStudy.year < 1900,
            BaseStudy.year > current_year,
        )
    ).filter(BaseStudy.pmid != None).all()  # noqa: E711

    print(f"Found {len(bad_years)} records with invalid years")

    for bs in bad_years:
        year = get_year_from_pubmed(bs.pmid)
        if year:
            bs.year = year
            for v in bs.versions:
                v.year = year
            db.session.add(bs)
            db.session.add_all(bs.versions)

    # Fix journals and authors
    # trim() makes whitespace-only values count as missing, as well as ''.
    bad_metadata = BaseStudy.query.filter(
        or_(
            BaseStudy.publication == None,  # noqa: E711
            func.trim(BaseStudy.publication) == '',
            BaseStudy.authors == None,  # noqa: E711
            func.trim(BaseStudy.authors) == '',
        )
    ).filter(
        BaseStudy.pmid != None  # noqa: E711
    ).options(
        # Load versions with the studies instead of one query per study later.
        joinedload(BaseStudy.versions)
    ).all()

    print(f"Found {len(bad_metadata)} records with missing metadata")

    # Process in chunks due to API limits
    for start in range(0, len(bad_metadata), CHUNK_SIZE):
        to_commit = get_metadata_from_pubmed(bad_metadata[start:start + CHUNK_SIZE])
        if to_commit:
            db.session.add_all(to_commit)

    # Commit all changes
    try:
        db.session.commit()
        print("Successfully updated metadata")
    except Exception as e:
        db.session.rollback()
        print(f"Error committing changes: {e}")

if __name__ == "__main__":
    fix_metadata()
# --- Interactive session history, kept as typed (failed attempts included) ---
# Misspelled module name ("sqlachemy"); corrected on the next line.
from sqlachemy import func
from sqlalchemy import func
from sqlalchemy import or_
# Attempts at a per-year record count via BaseStudy.query; the form used from
# here on goes through a session query with func.count instead.
BaseStudy.query.count(BaseStudy.id).group_by(BaseStudy.year)
BaseStudy.query.count().group_by(BaseStudy.year)
BaseStudy.query.count()
# `session` is not defined anywhere in the visible history; the retry below
# uses db.session.
query = session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)

# Executing the query and fetching the results
results = query.all()
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)

# Executing the query and fetching the results
results = query.all()
results
# Same count, now ordered by year.
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)


# Executing the query and fetching the results
results = query.all()
results
from sqlalchemy import in_
from sqlalchemy import in
# Year values treated as invalid (presumably picked from the per-year counts
# above - confirm).
values_to_check = [None, 0, 1, 3, 9, 13, 16, 19]
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).count()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).all()
BaseStudy.query.filter(BaseStudy.year==None)
BaseStudy.query.filter(BaseStudy.year==None).count()
# From here the None entry is sliced off ([1:]) and missing years are matched
# with an explicit == None clause instead.
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).all()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).count()
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None))
bad_year
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
# Final selection: only records that have a PMID to look up.
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).filter(BaseStudy.pmid != None).all()
len(bad_year)
from Bio import Entrez
from Bio.Entrez.Parser import ValidationError

def get_publication_year(pmid):
    """
    Retrieves the publication year for a given PMID using BioPython.

    Args:
    - pmid (str): PubMed ID (PMID) of the publication.

    Returns:
    - int or None: The publication year if found, otherwise None.
    """
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"

        # Fetch PubMed record for the given PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # The year is taken from the four characters after the "DP  - " prefix
        for line in record.splitlines():
            if line.startswith("DP  - "):
                try:
                    return int(line[6:10])
                except ValueError:
                    return None  # Invalid year format
        return None  # Publication year not found

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve publication year.")
        return None

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return None
# Spot-check the lookup on the first three records before running it in bulk.
get_publication_year(bad_year[0].pmid)
get_publication_year(bad_year[1].pmid)
get_publication_year(bad_year[2].pmid)
# Set the fetched year on every study and on each of its versions.
to_commit = []
for bs in bad_year:
    year = get_publication_year(bs.pmid)
    if not year:
        print(f"NO YEAR FOR {bs.pmid}")
        continue
    bs.year = year
    to_commit.append(bs)
    for v in bs.versions:
        v.year = year
        to_commit.append(v)
to_commit
# First attempt passes the whole list to add(); the next line retries with add_all().
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
# Re-run the selection and the per-year counts to check the result.
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)


# Executing the query and fetching the results
results = query.all()
results
# Journals next. This first attempt compares BaseStudy.year (not publication)
# with ''; the following lines use publication and are preceded by a rollback
# (presumably to clear the session after the failed query - confirm).
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.year=='', func.trim(BaseStudy.year)=='')).all()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
db.session.rollback()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
len(bad_journal)
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).all()
len(bad_journal)
# Three tries at importing joinedload; the last one (sqlalchemy.orm) matches
# the import used by the refined script.
from sqlalchemy import joinload
from sqlalchemy import joinedload
from sqlalchemy.orm import joinedload
# ".opetions" is a typo for ".options"; the second line is the corrected query.
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).opetions(joinedload(BaseStudy.versions)).all()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()

def get_journal_names(pmids):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
      Empty when `pmids` is empty or the request fails.
    """
    journal_names = {}
    if not pmids:
        # Nothing to look up; skip the network request
        return journal_names
    try:
        # Provide your email address to comply with NCBI's usage policies
        # NOTE(review): this is still the placeholder address
        Entrez.email = "your.email@example.com"

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmids)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            tag = None
            for line in entry.splitlines():
                if line[4:6] == "- ":
                    # "TAG - value" starts a new field
                    tag = line[:4].strip()
                    if tag == "PMID":
                        pmid = line[6:]
                    elif tag == "JT":
                        journal_name = line[6:]
                elif tag == "JT" and line.startswith("      "):
                    # Indented line continues a wrapped journal title
                    journal_name += " " + line.strip()
            if pmid and journal_name:
                journal_names[pmid] = journal_name

        return journal_names

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return {}

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return {}
# Split the records into batches of CHUNK_SIZE, one PubMed request per batch.
CHUNK_SIZE = 900
# `studies` is not defined in the visible history; the third attempt uses
# bad_journal for both the slice and the length.
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(studies), CHUNK_SIZE)]
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
chunks = [bad_journal[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
chunks

def get_journal_names(bs):
    """
    Sets the journal name of each study, and of its versions, from PubMed.

    Args:
    - bs (list): Study records to update; each must expose `pmid`,
      `publication` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"

        # Index the studies by PMID so each fetched record finds its study
        pmid_dict = {str(study.pmid): study for study in bs}
        if not pmid_dict:
            return to_commit

        # Fetch PubMed records for all PMIDs in one request
        handle = Entrez.efetch(db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            tag = None
            for line in entry.splitlines():
                if line[4:6] == "- ":
                    # "TAG - value" starts a new field
                    tag = line[:4].strip()
                    if tag == "PMID":
                        pmid = line[6:]
                    elif tag == "JT":
                        journal_name = line[6:]
                elif tag == "JT" and line.startswith("      "):
                    # Indented line continues a wrapped journal title
                    journal_name += " " + line.strip()
            # Skip records without a title or whose PMID was not requested
            study = pmid_dict.get(pmid)
            if study is None or not journal_name:
                continue
            study.publication = journal_name
            to_commit.append(study)
            for v in study.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

def get_journal_names(bs):
    """
    Sets the journal name of each study, and of its versions, from PubMed.

    Args:
    - bs (list): Study records to update; each must expose `pmid`,
      `publication` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"

        # Map each PMID to its study; the PMIDs also form the request
        studies_by_pmid = {str(study.pmid): study for study in bs}
        if not studies_by_pmid:
            return to_commit

        # Fetch PubMed records for all PMIDs in one request
        handle = Entrez.efetch(db="pubmed", id=",".join(studies_by_pmid), rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            tag = None
            for line in entry.splitlines():
                if line[4:6] == "- ":
                    # "TAG - value" starts a new field
                    tag = line[:4].strip()
                    if tag == "PMID":
                        pmid = line[6:]
                    elif tag == "JT":
                        journal_name = line[6:]
                elif tag == "JT" and line.startswith("      "):
                    # Indented line continues a wrapped journal title
                    journal_name += " " + line.strip()
            # Skip records without a title or whose PMID was not requested
            study = studies_by_pmid.get(pmid)
            if study is None or not journal_name:
                continue
            study.publication = journal_name
            to_commit.append(study)
            for v in study.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Collect the records changed by get_journal_names across all chunks.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Sets the journal name of each study, and of its versions, from PubMed.

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `publication` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            return to_commit
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            tag = None
            for line in entry.splitlines():
                if line[4:6] == "- ":
                    # "TAG - value" starts a new field
                    tag = line[:4].strip()
                    if tag == "PMID":
                        pmid = line[6:]
                    elif tag == "JT":
                        journal_name = line[6:]
                elif tag == "JT" and line.startswith("      "):
                    # Indented line continues a wrapped journal title
                    journal_name += " " + line.strip()
            # Skip records without a title or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not journal_name:
                continue
            bs.publication = journal_name
            to_commit.append(bs)
            for v in bs.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Re-run with the redefined get_journal_names. to_commit is not reset here, so
# results are appended to whatever the earlier loop left in it.
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Sets the journal name of each study, and of its versions, from PubMed.

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `publication` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            return to_commit

        # Fetch PubMed records for all PMIDs, comma-separated, in one request
        handle = Entrez.efetch(db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            tag = None
            for line in entry.splitlines():
                if line[4:6] == "- ":
                    # "TAG - value" starts a new field
                    tag = line[:4].strip()
                    if tag == "PMID":
                        pmid = line[6:]
                    elif tag == "JT":
                        journal_name = line[6:]
                elif tag == "JT" and line.startswith("      "):
                    # Indented line continues a wrapped journal title
                    journal_name += " " + line.strip()
            if not (pmid and journal_name):
                continue
            # Ignore records whose PMID was not part of this batch
            bs = pmid_dict.get(pmid)
            if bs is None:
                continue
            bs.publication = journal_name
            to_commit.append(bs)
            for v in bs.versions:
                v.publication = journal_name
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Another re-run after redefining get_journal_names; to_commit is still the
# list created before the earlier loops, so results keep accumulating in it.
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Sets the journal name of each study, and of its versions, from PubMed.

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `publication` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        # String keys: PMIDs come back from PubMed as text
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            # Nothing to look up; skip the network request
            return to_commit
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract the journal name for each entry
        for entry in entries:
            pmid = None
            journal_name = None
            tag = None
            for line in entry.splitlines():
                if line[4:6] == "- ":
                    # "TAG - value" starts a new field
                    tag = line[:4].strip()
                    if tag == "PMID":
                        pmid = line[6:]
                    elif tag == "JT":
                        journal_name = line[6:]
                elif tag == "JT" and line.startswith("      "):
                    # Indented line continues a wrapped journal title
                    journal_name += " " + line.strip()
            if pmid and journal_name:
                # .get(): an unrequested PMID must not abort the whole chunk
                bs = pmid_dict.get(pmid)
                if bs is None:
                    continue
                bs.publication = journal_name
                to_commit.append(bs)
                for v in bs.versions:
                    v.publication = journal_name
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Run the latest get_journal_names over every chunk (to_commit is not reset
# first), then spot-check a few entries before saving.
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))
to_commit
len(to_commit)
to_commit[10]
to_commit[10].publication
to_commit[100].publication
to_commit[1000].publication
to_commit[1500].publication
to_commit[2000].publication
to_commit[2002].publication
db.session.add_all(to_commit)
db.session.commit()
# Re-query to see how many records still lack a journal, then select the
# records that lack authors for the next step.
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_journal)
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)

def get_journal_names(bss):
    """
    Sets the author list of each study, and of its versions, from PubMed.

    Despite its name, this revision fills in `authors`, not the journal.

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `authors` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            return to_commit
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    # Tag is "FAU" plus ONE space; "FAU  - " never matches
                    authors.append(line[6:])
            # Skip records without authors or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not authors:
                continue
            # Names contain commas ("Last, First"), so separate them with ";"
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Rebuild the batches, this time from the records that lack authors.
chunks = [bad_authors[i:i+CHUNK_SIZE] for i in range(0, len(bad_authors), CHUNK_SIZE)]
chunks

def get_author_names(bss):
    """
    Sets the author list of each study, and of its versions, from PubMed.

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `authors` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            return to_commit
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    # Tag is "FAU" plus ONE space; "FAU  - " never matches
                    authors.append(line[6:])
            # Skip records without authors or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not authors:
                continue
            # Names contain commas ("Last, First"), so separate them with ";"
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Start a fresh list and collect the records changed by get_author_names.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit

def get_author_names(bss):
    """
    Sets the author list of each study, and of its versions, from PubMed.

    Each author name is also printed as it is read (debugging aid).

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `authors` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            return to_commit

        # Fetch PubMed records for all PMIDs, comma-separated, in one request
        handle = Entrez.efetch(db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    # Tag is "FAU" plus ONE space; "FAU  - " never matches
                    authors.append(line[6:])
                    print(f"{line[6:]}")
            # Skip records without authors or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not authors:
                continue
            # Names contain commas ("Last, First"), so separate them with ";"
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Fresh list, then re-run with the get_author_names version that prints each name.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))

def get_author_names(bss):
    """
    Sets the author list of each study, and of its versions, from PubMed.

    Each author name is also printed as it is read (debugging aid).

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `authors` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            return to_commit
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    authors.append(line[6:])
                    print(f"{line[6:]}")
            if not (pmid and authors):
                continue
            # Ignore records whose PMID was not part of this batch
            bs = pmid_dict.get(pmid)
            if bs is None:
                continue
            # Names contain commas ("Last, First"), so separate them with ";"
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

def get_author_names(bss):
    """
    Sets the author list of each study, and of its versions, from PubMed.

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `authors` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        # String keys: PMIDs come back from PubMed as text
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            # Nothing to look up; skip the network request
            return to_commit

        # Fetch PubMed records for all PMIDs, comma-separated, in one request
        handle = Entrez.efetch(db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Records are separated by blank lines
        for entry in records.split("\n\n"):
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    authors.append(line[6:])
            # Skip records without authors or whose PMID was not requested
            bs = pmid_dict.get(pmid)
            if bs is None or not authors:
                continue
            # Names contain commas ("Last, First"), so separate them with ";"
            authors = ";".join(authors)
            bs.authors = authors
            to_commit.append(bs)
            for v in bs.versions:
                v.authors = authors
                to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Fresh list, re-run, then inspect the first record's author string.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit
len(to_commit)
to_commit[0].authors

def get_author_names(bss):
    """
    Sets the author list of each study, and of its versions, from PubMed.

    Full author names ("FAU" fields) are joined with ";" into one string.

    Args:
    - bss (list): Study records to update; each must expose `pmid`,
      `authors` and `versions`.

    Returns:
    - list: The studies and versions that were modified (empty on error).
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        # String keys: PMIDs come back from PubMed as text
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        if not pmid_dict:
            # Nothing to look up; skip the network request
            return to_commit
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmid_dict)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        try:
            records = handle.read()
        finally:
            # Close the connection even if reading the response raises
            handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract the author names for each entry
        for entry in entries:
            pmid = None
            authors = []
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    authors.append(line[6:])
            if pmid and authors:
                # .get(): an unrequested PMID must not abort the whole chunk
                bs = pmid_dict.get(pmid)
                if bs is None:
                    continue
                authors = ";".join(authors)
                bs.authors = authors
                to_commit.append(bs)
                for v in bs.versions:
                    v.authors = authors
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
# Final author run with the ";"-joined version, spot-checked before saving.
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit
to_commit[0].authors
to_commit[1000].authors
to_commit[5000].authors
# First attempt passes the whole list to add(); the next line retries with add_all().
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
# Clear the application cache after the commit (presumably so cached responses
# do not keep serving the old metadata - confirm).
from neurostore.core import cache
cache.clear()
# Re-query to see how many records still lack authors.
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)
# Bare name left over from the command that dumped this session.
history
Clone this wiki locally