-
Notifications
You must be signed in to change notification settings - Fork 6
Add missing authors and journals
James Kent edited this page Apr 19, 2025
·
3 revisions
from Bio import Entrez
from sqlalchemy import or_, func
from neurostore.models.data import BaseStudy, Study
from neurostore.database import db
from sqlalchemy.orm import joinedload
# Configure Entrez: the contact address sent to NCBI with every request
Entrez.email = "jamesdkent21@gmail.com" # Replace with your email
# Number of records fix_metadata() passes to PubMed in one batched efetch call
CHUNK_SIZE = 900 # PubMed API limit
def get_year_from_pubmed(pmid):
    """Get publication year from PubMed.

    Fetches the MEDLINE-formatted record for ``pmid`` and reads the year from
    the first four characters of its ``DP`` field.

    Args:
        pmid: PubMed ID of the publication.

    Returns:
        The publication year as an ``int``, or ``None`` when the request
        fails, the record has no ``DP`` field, or the field does not start
        with a number.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = handle.read()
        finally:
            # Release the connection even if reading the response fails.
            handle.close()
    except Exception as e:
        print(f"Error fetching year for PMID {pmid}: {str(e)}")
        return None

    for line in record.splitlines():
        # MEDLINE lines are "<tag padded to 4 chars>- <value>", so the value
        # always starts at column 6 whatever the length of the tag.
        if line[:4].strip() == "DP" and line[4:6] == "- ":
            try:
                return int(line[6:10])
            except ValueError:
                return None
    return None
def _parse_medline_entry(entry):
    """Return ``(pmid, journal_name, authors)`` parsed from one MEDLINE record."""
    pmid = None
    journal_name = None
    authors = []
    last_tag = None
    for line in entry.splitlines():
        if line[4:6] == "- ":
            # Field line: "<tag padded to 4 chars>- <value>".
            last_tag = line[:4].strip()
            value = line[6:].strip()
            if last_tag == "PMID":
                pmid = value
            elif last_tag == "JT":
                journal_name = value
            elif last_tag == "FAU":
                authors.append(value)
        elif line.startswith("      ") and line.strip():
            # Continuation line: long values wrap onto lines indented by six
            # spaces, so glue the text back onto the field it belongs to.
            if last_tag == "JT" and journal_name is not None:
                journal_name = f"{journal_name} {line.strip()}"
            elif last_tag == "FAU" and authors:
                authors[-1] = f"{authors[-1]} {line.strip()}"
    return pmid, journal_name, authors


def get_metadata_from_pubmed(bss):
    """Get journal and authors from PubMed for multiple records.

    All PMIDs are sent in a single efetch request. For every returned record
    that matches one of ``bss``, the journal title (``JT``) is written to
    ``publication`` and the ``;``-joined full author names (``FAU``) to
    ``authors``, on the study and on each of its ``versions``.

    Args:
        bss: Study objects exposing ``pmid``, ``publication``, ``authors``
            and ``versions``.

    Returns:
        A list of every study and version that was modified. Empty when
        ``bss`` is empty or the request fails.
    """
    # Key by string: that is how the PMID comes back in the response text.
    pmid_dict = {str(bs.pmid): bs for bs in bss}
    if not pmid_dict:
        return []

    try:
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Release the connection even if reading the response fails.
            handle.close()
    except Exception as e:
        print(f"Error fetching metadata: {str(e)}")
        return []

    to_commit = []
    # Records in the response are separated by blank lines.
    for entry in records.split("\n\n"):
        pmid, journal_name, authors = _parse_medline_entry(entry)
        bs = pmid_dict.get(pmid)
        # Skip records we did not ask for instead of aborting the whole batch.
        if bs is None or not (journal_name or authors):
            continue
        author_str = ";".join(authors)
        # Update versions too
        for obj in (bs, *bs.versions):
            if journal_name:
                obj.publication = journal_name
            if authors:
                obj.authors = author_str
            to_commit.append(obj)
    return to_commit
def fix_metadata():
    """Main function to fix missing metadata.

    Runs two passes over ``BaseStudy`` rows that have a PMID:

    1. Rows whose ``year`` is NULL, before 1900 or later than the current
       year get their year re-read from PubMed, one request per record.
    2. Rows whose ``publication`` or ``authors`` is NULL or blank are sent to
       ``get_metadata_from_pubmed`` in batches of ``CHUNK_SIZE``.

    Values are also copied onto each row's ``versions``. Everything is
    committed in one transaction at the end; if the commit fails the session
    is rolled back and the error is printed.
    """
    # Local import: only this function needs the current date.
    from datetime import date

    # Fix years
    latest_valid_year = date.today().year
    bad_years = (
        BaseStudy.query.filter(
            or_(
                BaseStudy.year.is_(None),
                BaseStudy.year < 1900,
                BaseStudy.year > latest_valid_year,
            )
        )
        .filter(BaseStudy.pmid.isnot(None))
        .all()
    )
    print(f"Found {len(bad_years)} records with invalid years")
    for bs in bad_years:
        year = get_year_from_pubmed(bs.pmid)
        if not year:
            continue
        bs.year = year
        for v in bs.versions:
            v.year = year
        db.session.add(bs)
        db.session.add_all(bs.versions)

    # Fix journals and authors; trim() also catches whitespace-only values
    bad_metadata = (
        BaseStudy.query.filter(
            or_(
                BaseStudy.publication.is_(None),
                func.trim(BaseStudy.publication) == "",
                BaseStudy.authors.is_(None),
                func.trim(BaseStudy.authors) == "",
            )
        )
        .filter(BaseStudy.pmid.isnot(None))
        .options(joinedload(BaseStudy.versions))
        .all()
    )
    print(f"Found {len(bad_metadata)} records with missing metadata")

    # Process in chunks due to API limits
    for start in range(0, len(bad_metadata), CHUNK_SIZE):
        to_commit = get_metadata_from_pubmed(bad_metadata[start:start + CHUNK_SIZE])
        if to_commit:
            db.session.add_all(to_commit)

    # Commit all changes
    try:
        db.session.commit()
        print("Successfully updated metadata")
    except Exception as e:
        db.session.rollback()
        print(f"Error committing changes: {str(e)}")
# Run the repair only when this file is executed as a script, not on import.
if __name__ == "__main__":
    fix_metadata()
from sqlachemy import func
from sqlalchemy import func
from sqlalchemy import or_
BaseStudy.query.count(BaseStudy.id).group_by(BaseStudy.year)
BaseStudy.query.count().group_by(BaseStudy.year)
BaseStudy.query.count()
query = session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)
# Executing the query and fetching the results
results = query.all()
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year)
# Executing the query and fetching the results
results = query.all()
results
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)
# Executing the query and fetching the results
results = query.all()
results
from sqlalchemy import in_
from sqlalchemy import in
values_to_check = [None, 0, 1, 3, 9, 13, 16, 19]
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).count()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check)).all()
BaseStudy.query.filter(BaseStudy.year==None)
BaseStudy.query.filter(BaseStudy.year==None).count()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).all()
BaseStudy.query.filter(BaseStudy.year.in_(values_to_check[1:])).count()
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None))
bad_year
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).filter(BaseStudy.pmid != None).all()
len(bad_year)
from Bio import Entrez
from Bio.Entrez.Parser import ValidationError
def get_publication_year(pmid):
    """
    Retrieves the publication year for a given PMID using BioPython.
    Args:
    - pmid (str): PubMed ID (PMID) of the publication.
    Returns:
    - int or None: The publication year if found, otherwise None.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Fetch PubMed record for the given PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve publication year.")
        return None
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return None

    # Extract publication year from the record
    for line in record.splitlines():
        # MEDLINE lines are "<tag padded to 4 chars>- <value>": value starts at column 6
        if line[:4].strip() == "DP" and line[4:6] == "- ":
            try:
                return int(line[6:10])
            except ValueError:
                return None  # Invalid year format
    return None  # Publication year not found
get_publication_year(bad_year[0].pmid)
get_publication_year(bad_year[1].pmid)
get_publication_year(bad_year[2].pmid)
to_commit = []
for bs in bad_year:
year = get_publication_year(bs.pmid)
if not year:
print(f"NO YEAR FOR {bs.pmid}")
continue
bs.year = year
to_commit.append(bs)
for v in bs.versions:
v.year = year
to_commit.append(v)
to_commit
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
bad_year = BaseStudy.query.filter(or_(BaseStudy.year.in_(values_to_check[1:]), BaseStudy.year==None)).all()
len(bad_year)
query = db.session.query(BaseStudy.year, func.count(BaseStudy.id)).group_by(BaseStudy.year).order_by(BaseStudy.year)
# Executing the query and fetching the results
results = query.all()
results
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.year=='', func.trim(BaseStudy.year)=='')).all()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
db.session.rollback()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).all()
len(bad_journal)
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).all()
len(bad_journal)
from sqlalchemy import joinload
from sqlalchemy import joinedload
from sqlalchemy.orm import joinedload
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).opetions(joinedload(BaseStudy.versions)).all()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
def get_journal_names(pmids):
    """
    Retrieves the journal names for a list of PMIDs using BioPython.
    Args:
    - pmids (list of str): List of PubMed IDs (PMIDs) of the publications.
    Returns:
    - dict: A dictionary where keys are PMIDs and values are journal names.
      Empty if the request fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "your.email@example.com"
    try:
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmids), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return {}
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return {}

    journal_names = {}
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        journal_name = None
        for line in entry.splitlines():
            # MEDLINE lines are "<tag padded to 4 chars>- <value>"
            if line[4:6] != "- ":
                continue
            tag = line[:4].strip()
            if tag == "PMID":
                pmid = line[6:]
            elif tag == "JT":
                journal_name = line[6:]
        if pmid and journal_name:
            journal_names[pmid] = journal_name
    return journal_names
CHUNK_SIZE = 900
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(studies), CHUNK_SIZE)]
chunks = [studies[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
chunks = [bad_journal[i:i+CHUNK_SIZE] for i in range(0, len(bad_journal), CHUNK_SIZE)]
chunks
def get_journal_names(bs):
    """
    Fills in the journal name of each study (and its versions) from PubMed using BioPython.
    Args:
    - bs (list): Study objects exposing `pmid`, `publication` and `versions`.
    Returns:
    - list: Every study and version whose `publication` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(study.pmid): study for study in bs}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        journal_name = None
        for line in entry.splitlines():
            # MEDLINE lines are "<tag padded to 4 chars>- <value>"
            if line[4:6] != "- ":
                continue
            tag = line[:4].strip()
            if tag == "PMID":
                pmid = line[6:]
            elif tag == "JT":
                journal_name = line[6:]
        # Skip records that were not requested or carry no journal title
        study = pmid_dict.get(pmid)
        if study is None or not journal_name:
            continue
        study.publication = journal_name
        to_commit.append(study)
        for v in study.versions:
            v.publication = journal_name
            to_commit.append(v)
    return to_commit
def get_journal_names(bs):
    """
    Fills in the journal name of each study (and its versions) from PubMed using BioPython.
    Args:
    - bs (list): Study objects exposing `pmid`, `publication` and `versions`.
    Returns:
    - list: Every study and version whose `publication` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(study.pmid): study for study in bs}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        journal_name = None
        for line in entry.splitlines():
            # MEDLINE lines are "<tag padded to 4 chars>- <value>"
            if line[4:6] != "- ":
                continue
            tag = line[:4].strip()
            if tag == "PMID":
                pmid = line[6:]
            elif tag == "JT":
                journal_name = line[6:]
        # Skip records that were not requested or carry no journal title
        study = pmid_dict.get(pmid)
        if study is None or not journal_name:
            continue
        study.publication = journal_name
        to_commit.append(study)
        for v in study.versions:
            v.publication = journal_name
            to_commit.append(v)
    return to_commit
to_commit = []
for chunk in chunks:
to_commit.extend(get_journal_names(chunk))
def get_journal_names(bss):
    """
    Fills in the journal name of each study (and its versions) from PubMed using BioPython.
    Args:
    - bss (list): Study objects exposing `pmid`, `publication` and `versions`.
    Returns:
    - list: Every study and version whose `publication` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        journal_name = None
        for line in entry.splitlines():
            # MEDLINE lines are "<tag padded to 4 chars>- <value>"
            if line[4:6] != "- ":
                continue
            tag = line[:4].strip()
            if tag == "PMID":
                pmid = line[6:]
            elif tag == "JT":
                journal_name = line[6:]
        # Skip records that were not requested or carry no journal title
        study = pmid_dict.get(pmid)
        if study is None or not journal_name:
            continue
        study.publication = journal_name
        to_commit.append(study)
        for v in study.versions:
            v.publication = journal_name
            to_commit.append(v)
    return to_commit
for chunk in chunks:
to_commit.extend(get_journal_names(chunk))
def get_journal_names(bss):
    """
    Fills in the journal name of each study (and its versions) from PubMed using BioPython.
    Args:
    - bss (list): Study objects exposing `pmid`, `publication` and `versions`.
    Returns:
    - list: Every study and version whose `publication` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        journal_name = None
        for line in entry.splitlines():
            # MEDLINE lines are "<tag padded to 4 chars>- <value>"
            if line[4:6] != "- ":
                continue
            tag = line[:4].strip()
            if tag == "PMID":
                pmid = line[6:]
            elif tag == "JT":
                journal_name = line[6:]
        # Skip records that were not requested or carry no journal title
        study = pmid_dict.get(pmid)
        if study is None or not journal_name:
            continue
        study.publication = journal_name
        to_commit.append(study)
        for v in study.versions:
            v.publication = journal_name
            to_commit.append(v)
    return to_commit
for chunk in chunks:
to_commit.extend(get_journal_names(chunk))
def get_journal_names(bss):
    """
    Fills in the journal name of each study (and its versions) from PubMed using BioPython.
    Args:
    - bss (list): Study objects exposing `pmid`, `publication` and `versions`.
    Returns:
    - list: Every study and version whose `publication` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve journal names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        journal_name = None
        for line in entry.splitlines():
            # MEDLINE lines are "<tag padded to 4 chars>- <value>"
            if line[4:6] != "- ":
                continue
            tag = line[:4].strip()
            if tag == "PMID":
                pmid = line[6:]
            elif tag == "JT":
                journal_name = line[6:]
        # Skip records that were not requested or carry no journal title
        study = pmid_dict.get(pmid)
        if study is None or not journal_name:
            continue
        study.publication = journal_name
        to_commit.append(study)
        for v in study.versions:
            v.publication = journal_name
            to_commit.append(v)
    return to_commit
for chunk in chunks:
to_commit.extend(get_journal_names(chunk))
to_commit
len(to_commit)
to_commit[10]
to_commit[10].publication
to_commit[100].publication
to_commit[1000].publication
to_commit[1500].publication
to_commit[2000].publication
to_commit[2002].publication
db.session.add_all(to_commit)
db.session.commit()
bad_journal = BaseStudy.query.filter(or_(BaseStudy.publication==None, BaseStudy.publication=='', func.trim(BaseStudy.publication)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_journal)
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)
def get_journal_names(bss):
    """
    Fills in the authors of each study (and its versions) from PubMed using BioPython.
    Despite its name, this definition reads author (FAU) fields, not journal names.
    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.
    Returns:
    - list: Every study and version whose `authors` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        names = []
        for line in entry.splitlines():
            if line.startswith("PMID- "):
                pmid = line[6:]
            elif line.startswith("FAU - "):
                names.append(line[6:])
        # Skip records that were not requested or list no authors
        study = pmid_dict.get(pmid)
        if study is None or not names:
            continue
        # FAU values are "Surname, Forename", so join with ";" to keep them splittable
        authors = ";".join(names)
        study.authors = authors
        to_commit.append(study)
        for v in study.versions:
            v.authors = authors
            to_commit.append(v)
    return to_commit
chunks = [bad_authors[i:i+CHUNK_SIZE] for i in range(0, len(bad_authors), CHUNK_SIZE)]
chunks
def get_author_names(bss):
    """
    Fills in the authors of each study (and its versions) from PubMed using BioPython.
    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.
    Returns:
    - list: Every study and version whose `authors` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        names = []
        for line in entry.splitlines():
            if line.startswith("PMID- "):
                pmid = line[6:]
            elif line.startswith("FAU - "):
                names.append(line[6:])
        # Skip records that were not requested or list no authors
        study = pmid_dict.get(pmid)
        if study is None or not names:
            continue
        # FAU values are "Surname, Forename", so join with ";" to keep them splittable
        authors = ";".join(names)
        study.authors = authors
        to_commit.append(study)
        for v in study.versions:
            v.authors = authors
            to_commit.append(v)
    return to_commit
to_commit = []
for chunk in chunks:
to_commit.extend(get_author_names(chunk))
to_commit
def get_author_names(bss):
    """
    Fills in the authors of each study (and its versions) from PubMed using BioPython.
    Prints every parsed author name as a debugging aid.
    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.
    Returns:
    - list: Every study and version whose `authors` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        names = []
        for line in entry.splitlines():
            if line.startswith("PMID- "):
                pmid = line[6:]
            elif line.startswith("FAU - "):
                names.append(line[6:])
                # Debug trace of each author name as it is parsed
                print(f"{line[6:]}")
        # Skip records that were not requested or list no authors
        study = pmid_dict.get(pmid)
        if study is None or not names:
            continue
        # FAU values are "Surname, Forename", so join with ";" to keep them splittable
        authors = ";".join(names)
        study.authors = authors
        to_commit.append(study)
        for v in study.versions:
            v.authors = authors
            to_commit.append(v)
    return to_commit
to_commit = []
for chunk in chunks:
to_commit.extend(get_author_names(chunk))
def get_author_names(bss):
    """
    Fills in the authors of each study (and its versions) from PubMed using BioPython.
    Prints every parsed author name as a debugging aid.
    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.
    Returns:
    - list: Every study and version whose `authors` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        names = []
        for line in entry.splitlines():
            if line.startswith("PMID- "):
                pmid = line[6:]
            elif line.startswith("FAU - "):
                names.append(line[6:])
                # Debug trace of each author name as it is parsed
                print(f"{line[6:]}")
        # Skip records that were not requested or list no authors
        study = pmid_dict.get(pmid)
        if study is None or not names:
            continue
        # FAU values are "Surname, Forename", so join with ";" to keep them splittable
        authors = ";".join(names)
        study.authors = authors
        to_commit.append(study)
        for v in study.versions:
            v.authors = authors
            to_commit.append(v)
    return to_commit
def get_author_names(bss):
    """
    Fills in the authors of each study (and its versions) from PubMed using BioPython.
    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.
    Returns:
    - list: Every study and version whose `authors` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        names = []
        for line in entry.splitlines():
            if line.startswith("PMID- "):
                pmid = line[6:]
            elif line.startswith("FAU - "):
                names.append(line[6:])
        # Skip records that were not requested or list no authors
        study = pmid_dict.get(pmid)
        if study is None or not names:
            continue
        # FAU values are "Surname, Forename", so join with ";" to keep them splittable
        authors = ";".join(names)
        study.authors = authors
        to_commit.append(study)
        for v in study.versions:
            v.authors = authors
            to_commit.append(v)
    return to_commit
to_commit = []
for chunk in chunks:
to_commit.extend(get_author_names(chunk))
to_commit
len(to_commit)
to_commit[0].authors
def get_author_names(bss):
    """
    Fills in the authors of each study (and its versions) from PubMed using BioPython.
    Args:
    - bss (list): Study objects exposing `pmid`, `authors` and `versions`.
    Returns:
    - list: Every study and version whose `authors` was set; empty if the fetch fails.
    """
    # Provide your email address to comply with NCBI's usage policies
    Entrez.email = "jamesdkent21@gmail.com"
    try:
        # Map each PMID back to its study so records can be matched up
        pmid_dict = {str(bs.pmid): bs for bs in bss}
        # Fetch PubMed records for all PMIDs (comma-separated) in one request
        handle = Entrez.efetch(
            db="pubmed", id=",".join(pmid_dict), rettype="medline", retmode="text"
        )
        try:
            records = handle.read()
        finally:
            # Close the handle even when reading the response fails
            handle.close()
    except ValidationError:
        # Handle validation errors from Entrez
        print("Validation error occurred. Unable to retrieve author names.")
        return []
    except Exception as e:
        # Handle any other unexpected errors
        print(f"An error occurred: {str(e)}")
        return []

    to_commit = []
    # Records are separated by blank lines
    for entry in records.split("\n\n"):
        pmid = None
        names = []
        for line in entry.splitlines():
            if line.startswith("PMID- "):
                pmid = line[6:]
            elif line.startswith("FAU - "):
                names.append(line[6:])
        # Skip records that were not requested or list no authors
        study = pmid_dict.get(pmid)
        if study is None or not names:
            continue
        # Joined with ";" because the names themselves contain commas
        authors = ";".join(names)
        study.authors = authors
        to_commit.append(study)
        for v in study.versions:
            v.authors = authors
            to_commit.append(v)
    return to_commit
to_commit = []
for chunk in chunks:
to_commit.extend(get_author_names(chunk))
to_commit
to_commit[0].authors
to_commit[1000].authors
to_commit[5000].authors
db.session.add(to_commit)
db.session.add_all(to_commit)
db.session.commit()
from neurostore.core import cache
cache.clear()
bad_authors = BaseStudy.query.filter(or_(BaseStudy.authors==None, BaseStudy.authors=='', func.trim(BaseStudy.authors)=='')).filter(BaseStudy.pmid != None).options(joinedload(BaseStudy.versions)).all()
len(bad_authors)
history