Add missing authors and journals

refined script

from Bio import Entrez
from sqlalchemy import or_, func
from neurostore.models.data import BaseStudy, Study
from neurostore.database import db
from sqlalchemy.orm import joinedload

# Configure Entrez
Entrez.email = "jamesdkent21@gmail.com"  # Replace with your email
CHUNK_SIZE = 900  # PubMed API limit

def get_year_from_pubmed(pmid):
    """Get publication year from PubMed."""
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        record = handle.read()
        handle.close()

        for line in record.splitlines():
            if line.startswith("DP  - "):
                year_str = line[6:10]
                try:
                    return int(year_str)
                except ValueError:
                    return None
        return None
    except Exception as e:
        print(f"Error fetching year for PMID {pmid}: {str(e)}")
        return None

def get_metadata_from_pubmed(bss):
    """Get journal and authors from PubMed for multiple records."""
    to_commit = []
    try:
        pmid_dict = {bs.pmid: bs for bs in bss}
        pmids_str = ",".join(list(pmid_dict.keys()))

        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        for entry in records.split("\n\n"):
            pmid = None
            journal_name = None
            authors = []
            
            for line in entry.splitlines():
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("JT  - "):
                    journal_name = line[6:]
                elif line.startswith("FAU - "):
                    authors.append(line[6:])

            if pmid and (journal_name or authors):
                bs = pmid_dict[pmid]
                if journal_name:
                    bs.publication = journal_name
                if authors:
                    bs.authors = ";".join(authors)
                to_commit.append(bs)
                
                # Update versions too
                for v in bs.versions:
                    if journal_name:
                        v.publication = journal_name
                    if authors:
                        v.authors = ";".join(authors)
                    to_commit.append(v)
                    
        return to_commit
    except Exception as e:
        print(f"Error fetching metadata: {str(e)}")
        return []

def fix_metadata():
    """Main function to fix missing metadata."""
    # Fix years
    bad_years = BaseStudy.query.filter(
        or_(
            BaseStudy.year == None,
            BaseStudy.year < 1900,
            BaseStudy.year > 2025
        )
    ).filter(BaseStudy.pmid != None).all()
    
    print(f"Found {len(bad_years)} records with invalid years")
    
    for bs in bad_years:
        year = get_year_from_pubmed(bs.pmid)
        if year:
            bs.year = year
            for v in bs.versions:
                v.year = year
            db.session.add(bs)
            db.session.add_all(bs.versions)
    
    # Fix journals and authors
    bad_metadata = BaseStudy.query.filter(
        or_(
            BaseStudy.publication == None,
            BaseStudy.publication == '',
            BaseStudy.authors == None,
            BaseStudy.authors == ''
        )
    ).filter(
        BaseStudy.pmid != None
    ).options(
        joinedload(BaseStudy.versions)
    ).all()
    
    print(f"Found {len(bad_metadata)} records with missing metadata")
    
    # Process in chunks due to API limits
    chunks = [bad_metadata[i:i+CHUNK_SIZE] for i in range(0, len(bad_metadata), CHUNK_SIZE)]
    
    for chunk in chunks:
        to_commit = get_metadata_from_pubmed(chunk)
        if to_commit:
            db.session.add_all(to_commit)
    
    # Commit all changes
    try:
        db.session.commit()
        print("Successfully updated metadata")
    except Exception as e:
        db.session.rollback()
        print(f"Error committing changes: {str(e)}")

if __name__ == "__main__":
    fix_metadata()

from sqlachemy import func
from sqlalchemy import func
from sqlalchemy import or_
BaseStudy.query.count(BaseStudy.id).group_by(BaseStudy.year)
BaseStudy.query.count().group_by(BaseStudy.year)
BaseStudy.query.count()
query = session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)

# Executing the query and fetching the results
results = query.all()
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)

# Executing the query and fetching the results
results = query.all()
results
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)


# Executing the query and fetching the results
results = query.all()
results
from sqlalchemy import in_
from sqlalchemy import in
values_to_check = [None, 0, 1, 3, 9, 13, 16, 19]
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).count()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).all()
BaseStudy.query.filter(BaseStudy.year==None)
BaseStudy.query.filter(BaseStudy.year==None).count()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).all()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).count()
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None))
bad_year
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).filter(BaseStudy.pmid != None).all()
len(bad_year)
from Bio import Entrez
from Bio.Entrez.Parser import ValidationError

def get_publication_year(pmid):
    """
    Retrieves the publication year for a given PMID using BioPython.

    Args:
    - pmid (str): PubMed ID (PMID) of the publication.

    Returns:
    - int or None: The publication year if found, otherwise None.
    """
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"

        # Fetch PubMed record for the given PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        record = handle.read()
        handle.close()

        # Extract publication year from the record
        lines = record.splitlines()
        for line in lines:
            if line.startswith("DP  - "):
                year_str = line[6:10]
                try:
                    year = int(year_str)
                    return year
                except ValueError:
                    return None  # Invalid year format
        return None  # Publication year not found

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve publication year.")
        return None

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return None
get_publication_year(bad_year[0].pmid)
get_publication_year(bad_year[1].pmid)
get_publication_year(bad_year[2].pmid)
to_commit = []
for bs in bad_year:
    year = get_publication_year(bs.pmid)
    if not year:
        print(f"NO YEAR FOR {bs.pmid}")
        continue
    bs.year = year
    to_commit.append(bs)
    for v in bs.versions:
        v.year = year
        to_commit.append(v)
to_commit
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)


# Executing the query and fetching the results
results = query.all()
results
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.year=='', func.trim(BaseStudy.year)=='')).all()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
db.session.rollback()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
len(bad_journal)
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).all()
len(bad_journal)
from sqlalchemy import joinload
from sqlalchemy import joinedload
from sqlalchemy.orm import joinedload
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).opetions(joinedload(BaseStudy.versions)).all()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()

def get_journal_names(pmids):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    journal_names = {}
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "your.email@example.com"

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmids)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            journal_name = None
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("JT  - "):
                    journal_name = line[6:]
            if pmid and journal_name:
                journal_names[pmid] = journal_name

        return journal_names

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return {}

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return {}
CHUNK_SIZE = 900
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(studies), CHUNK_SIZE)]
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
chunks = [bad_journal[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
chunks

def get_journal_names(bs):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmids)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            journal_name = None
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("JT  - "):
                    journal_name = line[6:]
            if pmid and journal_name:
                bs.publication = journal_name
                to_commit.append(bs)
                for v in bs.versions:
                    v.publication = journal
                    to_commmit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return {}

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return {}

def get_journal_names(bs):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"

        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(pmids)

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            journal_name = None
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("JT  - "):
                    journal_name = line[6:]
            if pmid and journal_name:
                bs.publication = journal_name
                to_commit.append(bs)
                for v in bs.versions:
                    v.publication = journal
                    to_commmit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
to_commit = []
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            journal_name = None
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("JT  - "):
                    journal_name = line[6:]
            if pmid and journal_name:
                bs = pmid_dict[pmid]
                bs.publication = journal_name
                to_commit.append(bs)
                for v in bs.versions:
                    v.publication = journal
                    to_commmit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            journal_name = None
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("JT  - "):
                    journal_name = line[6:]
            if pmid and journal_name:
                bs = pmid_dict[pmid]
                bs.publication = journal_name
                to_commit.append(bs)
                for v in bs.versions:
                    v.publication = journal_name
                    to_commmit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))

def get_journal_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            journal_name = None
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("JT  - "):
                    journal_name = line[6:]
            if pmid and journal_name:
                bs = pmid_dict[pmid]
                bs.publication = journal_name
                to_commit.append(bs)
                for v in bs.versions:
                    v.publication = journal_name
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
for chunk in chunks:
    to_commit.extend(get_journal_names(chunk))
to_commit
len(to_commit)
to_commit[10]
to_commit[10].publication
to_commit[100].publication
to_commit[1000].publication
to_commit[1500].publication
to_commit[2000].publication
to_commit[2002].publication
db.session.add_all(to_commit)
db.session.commit()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_journal)
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)

def get_journal_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            authors = []
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU  - "):
                    authors.append(line[6:])
            if pmid and authors:
                authors = ",".join(authors)
                bs = pmid_dict[pmid]
                bs.authors = authors
                to_commit.append(bs)
                for v in bs.versions:
                    v.authors = authors
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
chunks = [bad_authors[i:i+CHUNK_SIZE] for i in range(0, len(bad_authors), CHUNK_SIZE)]
chunks

def get_author_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            authors = []
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU  - "):
                    authors.append(line[6:])
            if pmid and authors:
                authors = ",".join(authors)
                bs = pmid_dict[pmid]
                bs.authors = authors
                to_commit.append(bs)
                for v in bs.versions:
                    v.authors = authors
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit

def get_author_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            authors = []
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU  - "):
                    authors.append(line[6:])
                    print(f"{line[6:]}")
            if pmid and authors:
                authors = ",".join(authors)
                bs = pmid_dict[pmid]
                bs.authors = authors
                to_commit.append(bs)
                for v in bs.versions:
                    v.authors = authors
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))

def get_author_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            authors = []
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    authors.append(line[6:])
                    print(f"{line[6:]}")
            if pmid and authors:
                authors = ",".join(authors)
                bs = pmid_dict[pmid]
                bs.authors = authors
                to_commit.append(bs)
                for v in bs.versions:
                    v.authors = authors
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

def get_author_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            authors = []
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    authors.append(line[6:])
                    # print(f"{line[6:]}")
            if pmid and authors:
                authors = ",".join(authors)
                bs = pmid_dict[pmid]
                bs.authors = authors
                to_commit.append(bs)
                for v in bs.versions:
                    v.authors = authors
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit
len(to_commit)
to_commit[0].authors

def get_author_names(bss):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.

    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.

    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
    """
    to_commit = []
    try:
        # Provide your email address to comply with NCBI's usage policies
        Entrez.email = "jamesdkent21@gmail.com"
        pmid_dict = {bs.pmid: bs for bs in bss}
        # Join PMIDs into a comma-separated string
        pmids_str = ",".join(list(pmid_dict.keys()))

        # Fetch PubMed records for the given PMIDs
        handle = Entrez.efetch(db="pubmed", id=pmids_str, rettype="medline", retmode="text")
        records = handle.read()
        handle.close()

        # Split records by individual entries
        entries = records.split("\n\n")

        # Extract journal names for each entry
        for entry in entries:
            lines = entry.splitlines()
            pmid = None
            authors = []
            for line in lines:
                if line.startswith("PMID- "):
                    pmid = line[6:]
                elif line.startswith("FAU - "):
                    authors.append(line[6:])
                    # print(f"{line[6:]}")
            if pmid and authors:
                authors = ";".join(authors)
                bs = pmid_dict[pmid]
                bs.authors = authors
                to_commit.append(bs)
                for v in bs.versions:
                    v.authors = authors
                    to_commit.append(v)
        return to_commit

    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []

    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []
to_commit = []
for chunk in chunks:
    to_commit.extend(get_author_names(chunk))
to_commit
to_commit[0].authors
to_commit[1000].authors
to_commit[5000].authors
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
from neurostore.core import cache
cache.clear()
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)
history

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Add missing authors and journals

refined script

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally