
Finding missing identifiers

from Bio import Entrez
from sqlalchemy import and_, or_
from neurostore.models.data import BaseStudy, Study
from neurostore.database import db
import re
import time
import requests
from urllib.parse import quote

# Configure Entrez email (required for NCBI API access)
Entrez.email = "jamesdkent21@gmail.com"

def exponential_backoff_request(url, max_retries=5, initial_delay=1):
    """
    Make a request with exponential backoff for rate limiting
    
    Args:
        url: URL to request
        max_retries: Maximum number of retry attempts
        initial_delay: Initial delay in seconds
        
    Returns:
        Response object if successful, None if all retries failed
    """
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=30)  # avoid hanging indefinitely
            if response.status_code == 429:  # Too Many Requests
                if attempt == max_retries - 1:  # Last attempt
                    print(f"Rate limit exceeded after {max_retries} retries")
                    return None
                    
                wait_time = delay * (2 ** attempt)  # Exponential backoff
                print(f"Rate limit hit, waiting {wait_time} seconds...")
                time.sleep(wait_time)
                continue
                
            response.raise_for_status()
            return response
            
        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:  # Last attempt
                print(f"Request failed after {max_retries} retries: {str(e)}")
                return None
            time.sleep(delay * (2 ** attempt))
            
    return None
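
# With the defaults (max_retries=5, initial_delay=1), successive 429 waits
# grow as 1, 2, 4, and 8 seconds across the five attempts; other request
# failures back off on the same schedule before giving up.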

def clean_string(s):
    """Remove punctuation and convert to lowercase for matching"""
    if not s:
        return ""
    return re.sub(r'[^\w\s]', '', s).lower().strip()

def titles_match(title1, title2, threshold=0.9):
    """
    Compare two titles after cleaning and return True if they are similar enough.
    Uses word-overlap ratio for fuzzy matching.
    """
    clean1 = clean_string(title1)
    clean2 = clean_string(title2)
    
    if not clean1 or not clean2:
        return False
        
    # Convert to sets of words for comparison
    words1 = set(clean1.split())
    words2 = set(clean2.split())
    
    # Calculate word overlap ratio
    overlap = len(words1.intersection(words2))
    total = max(len(words1), len(words2))
    
    return (overlap / total) >= threshold
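
# Example: with the default 0.9 threshold, punctuation and case differences
# still match while unrelated titles do not:
#   titles_match("The Brain at Rest.", "the brain at rest")  # True  (4/4 words)
#   titles_match("The Brain at Rest.", "A different paper")  # False (0/4 words)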

def search_semantic_scholar(title):
    """Search Semantic Scholar API for a paper using title."""
    try:
        url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={quote(title)}&fields=abstract,externalIds"
        response = exponential_backoff_request(url)
        
        if not response:
            return None
            
        data = response.json()
        if not data.get('data'):
            return None
            
        # Check each result for title match
        for paper in data['data']:
            if titles_match(title, paper.get('title', '')):
                result = {}
                if 'externalIds' in paper:
                    ids = paper['externalIds']
                    if 'DOI' in ids:
                        result['doi'] = ids['DOI']
                    if 'PubMed' in ids:
                        result['pmid'] = ids['PubMed']
                    if 'PubMedCentral' in ids:
                        result['pmcid'] = ids['PubMedCentral']
                if paper.get('abstract'):
                    result['abstract'] = paper['abstract']
                return result if result else None
                
        return None
        
    except Exception as e:
        print(f"Error searching Semantic Scholar: {str(e)}")
        return None
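
# On a title match, search_semantic_scholar returns whichever identifiers the
# record carries (values below are illustrative only), e.g.
#   {'doi': '10.1000/example', 'pmid': '12345678', 'abstract': '...'}
# and None when nothing matches or the request ultimately fails.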

def get_article_details(pmid):
    """Get full article details from PubMed by PMID"""
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="xml")
        articles = Entrez.read(handle)['PubmedArticle']
        handle.close()
        
        if not articles:
            return None
            
        article = articles[0]
        
        # Extract article details
        result = {'pmid': pmid}
        
        # Get title
        article_data = article['MedlineCitation']['Article']
        result['title'] = article_data.get('ArticleTitle', '')
        
        # Get DOI and PMCID
        for id_obj in article['PubmedData'].get('ArticleIdList', []):
            if id_obj.attributes.get('IdType') == 'doi':
                result['doi'] = str(id_obj)
            elif id_obj.attributes.get('IdType') == 'pmc':
                result['pmcid'] = str(id_obj)
                
        return result
        
    except Exception as e:
        print(f"Error fetching article details: {str(e)}")
        return None
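
# get_article_details similarly returns the identifiers PubMed reports for a
# record (illustrative values), e.g.
#   {'pmid': '12345678', 'title': '...', 'doi': '10.1000/example',
#    'pmcid': 'PMC1234567'}
# with doi/pmcid present only when PubMed lists them.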

def search_pubmed(title, authors=None):
    """
    Search PubMed for a paper using title and optionally authors.
    Returns a dict with pmid, title, and (when available) doi and pmcid.
    """
    try:
        # Construct search query
        query = f'{title}[Title]'
        if authors:
            # Add first author to query to improve accuracy
            first_author = authors.split(';')[0].split(',')[0].strip()
            query += f' AND "{first_author}"[Author]'

        # Search PubMed
        handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
        record = Entrez.read(handle)
        handle.close()

        if not record['IdList']:
            return None

        # Check each result for title match
        for pmid in record['IdList']:
            article = get_article_details(pmid)
            if not article:
                continue
                
            # Verify title match
            if titles_match(title, article['title']):
                return article
                
        return None

    except Exception as e:
        print(f"Error searching PubMed: {str(e)}")
        return None
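
# For title "Neural correlates of rest" and authors "Smith, J; Doe, A", the
# query built above would be (hypothetical example):
#   'Neural correlates of rest[Title] AND "Smith"[Author]'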

def find_existing_study(result):
    """
    Check if a study with any of the found identifiers already exists
    Returns the existing study if found, None otherwise
    """
    if not result:
        return None
        
    query = []
    if result.get('pmid'):
        query.append(BaseStudy.pmid == result['pmid'])
    if result.get('doi'):
        query.append(BaseStudy.doi == result['doi'])
    if result.get('pmcid'):
        query.append(BaseStudy.pmcid == result['pmcid'])
        
    if not query:
        return None
        
    return BaseStudy.query.filter(or_(*query)).first()

def verify_merge(source_study_id, target_study):
    """
    Verify that all versions were properly moved to the target study
    
    Args:
        source_study_id: ID of the original base study
        target_study: BaseStudy object versions were moved to
        
    Returns:
        bool: True if merge was successful, False otherwise
    """
    # Find any orphaned studies (still pointing to old base_study_id)
    orphaned = Study.query.filter_by(base_study_id=source_study_id).count()
    if orphaned > 0:
        print(f"Warning: {orphaned} studies still reference old base study {source_study_id}")
        return False
        
    # Verify versions were moved to target
    moved = Study.query.filter_by(base_study_id=target_study.id).count()
    if moved == 0:
        print(f"Warning: No studies found under target base study {target_study.id}")
        return False
        
    return True

def merge_base_studies(source_study, target_study, identifiers):
    """
    Merge source base study into target base study, moving all versions
    and updating with new identifiers
    
    Args:
        source_study: The BaseStudy to merge from
        target_study: The BaseStudy to merge into
        identifiers: Dict containing pmid, doi, pmcid to update with
    
    Returns:
        Tuple[bool, list]: (success status, list of objects to commit)
    """
    try:
        to_commit = []
        source_id = source_study.id
        version_count = len(source_study.versions)
        
        print(f"Moving {version_count} versions from {source_id} to {target_study.id}")
        
        # Update target study with any missing identifiers
        if identifiers.get('pmid') and not target_study.pmid:
            target_study.pmid = identifiers['pmid']
            to_commit.append(target_study)
        if identifiers.get('doi') and not target_study.doi:
            target_study.doi = identifiers['doi']
            to_commit.append(target_study)
        if identifiers.get('pmcid') and not target_study.pmcid:
            target_study.pmcid = identifiers['pmcid']
            to_commit.append(target_study)
            
        # Move all versions from source to target; iterate over a copy since
        # re-parenting can mutate source_study.versions mid-loop
        for version in list(source_study.versions):
            # Update version identifiers
            if identifiers.get('pmid'):
                version.pmid = identifiers['pmid']
            if identifiers.get('doi'):
                version.doi = identifiers['doi']
            if identifiers.get('pmcid'):
                version.pmcid = identifiers['pmcid']
            
            # Add to target study's versions
            target_study.versions.append(version)
            # Update base_study_id
            version.base_study_id = target_study.id
            
            to_commit.append(version)
            
        # Flush changes to verify merge
        db.session.add_all(to_commit)
        db.session.flush()
        
        # Verify versions were moved successfully
        if not verify_merge(source_id, target_study):
            db.session.rollback()
            return False, []
            
        # Mark source study for deletion after successful merge
        db.session.delete(source_study)
        
        return True, to_commit
        
    except Exception as e:
        print(f"Error during merge: {str(e)}")
        db.session.rollback()
        return False, []
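
# merge_base_studies deliberately flushes rather than commits, so verify_merge
# can query the re-parented versions while the transaction is still open; the
# final commit (or rollback) is left to the caller.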

def find_missing_identifiers():
    """Find and update studies missing identifiers"""
    
    print("Starting identifier search...")

    # Get studies with no identifiers
    no_ids = BaseStudy.query.filter(
        and_(
            or_(BaseStudy.pmid.is_(None), BaseStudy.pmid == ''),
            or_(BaseStudy.doi.is_(None), BaseStudy.doi == ''),
            or_(BaseStudy.pmcid.is_(None), BaseStudy.pmcid == '')
        )
    ).all()

    print(f"Found {len(no_ids)} studies with missing identifiers")
    
    updates = 0
    skipped = 0
    merged = 0
    merge_failed = 0
    to_commit = []
    
    for bs in no_ids:
        if not bs.name:  # Skip if no title to search with
            skipped += 1
            continue
            
        # Clean title for searching
        clean_title = clean_string(bs.name)
        if len(clean_title) < 10:  # Skip very short titles
            skipped += 1
            continue
            
        print(f"\nSearching for: {bs.name}")
        
        # Try PubMed first
        result = search_pubmed(bs.name)
        
        # If not found in PubMed, try Semantic Scholar
        if not result:
            print("Not found in PubMed, trying Semantic Scholar...")
            result = search_semantic_scholar(bs.name)
        
        if result:
            # Check if study already exists with these identifiers
            existing = find_existing_study(result)
            if existing and existing.id != bs.id:
                print(f"Found existing study with matching identifiers (ID: {existing.id})")
                print("Merging studies...")
                
                # If we got an abstract from Semantic Scholar, save it
                if result.get('abstract') and not existing.description:
                    existing.description = result['abstract']
                
                # Merge the studies
                success, merge_updates = merge_base_studies(bs, existing, result)
                if success:
                    to_commit.extend(merge_updates)
                    merged += 1
                    print("Merge successful")
                else:
                    merge_failed += 1
                    print("Merge failed")
                continue
                
            # Update base study
            if result.get('pmid'):
                bs.pmid = result['pmid']
            if result.get('doi'):
                bs.doi = result['doi']
            if result.get('pmcid'):
                bs.pmcid = result['pmcid']
            if result.get('abstract') and not bs.description:
                bs.description = result['abstract']
                
            # Update all versions
            for v in bs.versions:
                if result.get('pmid'):
                    v.pmid = result['pmid']
                if result.get('doi'):
                    v.doi = result['doi']
                if result.get('pmcid'):
                    v.pmcid = result['pmcid']
                    
            to_commit.append(bs)
            updates += 1
            
            print(f"Found identifiers: {result}")
        else:
            print("No matching article found in either PubMed or Semantic Scholar")
            skipped += 1
    
    print(f"\nFound {updates} studies to update")
    print(f"Successfully merged {merged} duplicate studies")
    print(f"Failed to merge {merge_failed} studies")
    print(f"Skipped {skipped} studies (no title, title too short, or no match found)")
    
    return to_commit
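
# Note: find_missing_identifiers only stages objects in the session (merges
# are flushed, not committed); the __main__ block below performs the commit.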

if __name__ == "__main__":
    try:
        # Find studies to update; the session begins its transaction
        # automatically, so no explicit db.session.begin() is needed
        # (calling it with a transaction already active raises an error
        # under SQLAlchemy 1.4+)
        to_commit = find_missing_identifiers()

        # Persist the staged updates and merges
        if to_commit:
            print(f"\nCommitting {len(to_commit)} updated studies")
            db.session.add_all(to_commit)
            db.session.commit()
        else:
            print("\nNo studies to update")

    except Exception as e:
        db.session.rollback()
        print(f"\nError occurred: {str(e)}")
        raise
    finally:
        print("Identifier search complete!")