|
37 | 37 | import pandas as pd
|
38 | 38 | import shutil
|
39 | 39 | import tempfile
|
| 40 | +import json |
40 | 41 |
|
41 | 42 | def parse_args():
|
42 | 43 | parser = argparse.ArgumentParser(description='Extract RNA-Seq assay table from ISA.zip')
|
@@ -312,6 +313,175 @@ def extract_and_find_assay(outdir, glds_accession):
|
312 | 313 | # Return both the dataframe and the filename
|
313 | 314 | return pd.read_csv(assay_path, sep='\t'), matched_file
|
314 | 315 |
|
| 316 | +def add_read_counts(df, outdir, glds_accession, assay_suffix, runsheet_df=None): |
| 317 | + """Add the read counts column to the dataframe. |
| 318 | + |
| 319 | + Args: |
| 320 | + df: The assay table dataframe |
| 321 | + outdir: The output directory |
| 322 | + glds_accession: The GLDS accession number |
| 323 | + assay_suffix: The assay suffix for MultiQC report files |
| 324 | + runsheet_df: Optional runsheet dataframe with sample information |
| 325 | + |
| 326 | + Returns: |
| 327 | + The modified dataframe |
| 328 | + """ |
| 329 | + column_name = "Parameter Value[Read Count]" |
| 330 | + |
| 331 | + # Determine if paired-end from runsheet |
| 332 | + is_paired_end = is_paired_end_data(runsheet_df) |
| 333 | + print(f"Data is {'paired-end' if is_paired_end else 'single-end'} based on runsheet") |
| 334 | + |
| 335 | + # Path to raw MultiQC data zip |
| 336 | + fastqc_dir = os.path.join(outdir, "00-RawData", "FastQC_Reports") |
| 337 | + multiqc_data_zip = os.path.join(fastqc_dir, f"raw_multiqc{assay_suffix}_data.zip") |
| 338 | + |
| 339 | + if not os.path.exists(multiqc_data_zip): |
| 340 | + print(f"WARNING: MultiQC data zip file not found at {multiqc_data_zip}") |
| 341 | + # If zip not found, just add placeholder values |
| 342 | + df[column_name] = "N/A" |
| 343 | + column_changes.append(f"Added: {column_name} (with placeholder values)") |
| 344 | + return df |
| 345 | + |
| 346 | + print(f"Found MultiQC data zip: {multiqc_data_zip}") |
| 347 | + |
| 348 | + # Find the sample name column in the assay table |
| 349 | + sample_col = next((col for col in df.columns if 'Sample Name' in col), None) |
| 350 | + if not sample_col: |
| 351 | +        print("WARNING: Could not find Sample Name column in assay table") |
| 352 | + # If no sample column, just add placeholder values |
| 353 | + df[column_name] = "N/A" |
| 354 | + column_changes.append(f"Added: {column_name} (with placeholder values)") |
| 355 | + return df |
| 356 | + |
| 357 | + # Get sample names from assay table |
| 358 | + assay_sample_names = df[sample_col].tolist() |
| 359 | + |
| 360 | + # Get read counts from MultiQC data |
| 361 | + read_counts = {} |
| 362 | + |
| 363 | + # Create a temporary directory to extract files |
| 364 | + with tempfile.TemporaryDirectory() as temp_dir: |
| 365 | + try: |
| 366 | + # Extract the zip file |
| 367 | + with zipfile.ZipFile(multiqc_data_zip, 'r') as zip_ref: |
| 368 | + zip_ref.extractall(temp_dir) |
| 369 | + |
| 370 | + # Expected path to the JSON file in the extracted data directory |
| 371 | + expected_dir = f"raw_multiqc{assay_suffix}_data" |
| 372 | + json_path = os.path.join(temp_dir, expected_dir, "multiqc_data.json") |
| 373 | + |
| 374 | + # Check if JSON file exists in the expected location |
| 375 | + if not os.path.exists(json_path): |
| 376 | + # Fallback: try to find it elsewhere |
| 377 | + print(f"JSON not found at expected path: {json_path}, searching elsewhere...") |
| 378 | + |
| 379 | + # Try directly in the temp directory |
| 380 | + direct_path = os.path.join(temp_dir, "multiqc_data.json") |
| 381 | + if os.path.exists(direct_path): |
| 382 | + json_path = direct_path |
| 383 | + else: |
| 384 | + # Search all subdirectories |
| 385 | + for root, dirs, files in os.walk(temp_dir): |
| 386 | + if "multiqc_data.json" in files: |
| 387 | + json_path = os.path.join(root, "multiqc_data.json") |
| 388 | + print(f"Found JSON at: {json_path}") |
| 389 | + break |
| 390 | + |
| 391 | + if not os.path.exists(json_path): |
| 392 | +                print("ERROR: Could not find multiqc_data.json in the extracted zip") |
| 393 | + # Add placeholder values |
| 394 | + df[column_name] = "N/A" |
| 395 | + column_changes.append(f"Added: {column_name} (with placeholder values)") |
| 396 | + return df |
| 397 | + |
| 398 | + print(f"Using MultiQC data from: {json_path}") |
| 399 | + |
| 400 | + # Parse the MultiQC JSON |
| 401 | + with open(json_path, 'r') as f: |
| 402 | + multiqc_data = json.load(f) |
| 403 | + |
| 404 | + # First, try to extract directly from report_general_stats_data |
| 405 | + if 'report_general_stats_data' in multiqc_data and multiqc_data['report_general_stats_data']: |
| 406 | + stats_module = multiqc_data['report_general_stats_data'][0] # Use first module |
| 407 | + print("Extracting read counts directly from report_general_stats_data") |
| 408 | + |
| 409 | + for sample_name, sample_data in stats_module.items(): |
| 410 | + if 'total_sequences' in sample_data: |
| 411 | + read_count = int(sample_data['total_sequences']) |
| 412 | + print(f"Found count for {sample_name}: {read_count}") |
| 413 | + read_counts[sample_name] = read_count |
| 414 | + |
| 415 | + # Fallback to FastQC module specific extraction if needed |
| 416 | + elif ('report_data_sources' in multiqc_data and |
| 417 | + 'FastQC' in multiqc_data['report_data_sources']): |
| 418 | + |
| 419 | + # Find the index for FastQC in the general stats data |
| 420 | + fastqc_index = None |
| 421 | + for i, module_data in enumerate(multiqc_data.get('report_general_stats_data', [])): |
| 422 | + if module_data and any('total_sequences' in sample_data for sample_data in module_data.values()): |
| 423 | + fastqc_index = i |
| 424 | + break |
| 425 | + |
| 426 | + if fastqc_index is not None: |
| 427 | + fastqc_stats = multiqc_data['report_general_stats_data'][fastqc_index] |
| 428 | + |
| 429 | + # Process each sample to extract read counts |
| 430 | + for sample_name, sample_data in fastqc_stats.items(): |
| 431 | + if 'total_sequences' in sample_data: |
| 432 | + read_count = int(sample_data['total_sequences']) |
| 433 | + print(f"Found count for {sample_name}: {read_count}") |
| 434 | + read_counts[sample_name] = read_count |
| 435 | + |
| 436 | + if not read_counts: |
| 437 | + print("WARNING: Could not extract any read counts from MultiQC data") |
| 438 | + |
| 439 | + # Debug output to show read counts found |
| 440 | + print(f"Successfully extracted {len(read_counts)} read counts:") |
| 441 | + for sample_name, count in read_counts.items(): |
| 442 | + print(f" - {sample_name}: {count}") |
| 443 | + |
| 444 | + except Exception as e: |
| 445 | + print(f"Error extracting read counts from MultiQC data: {str(e)}") |
| 446 | + # If error occurs, add placeholder values |
| 447 | + df[column_name] = "N/A" |
| 448 | + column_changes.append(f"Added: {column_name} (with placeholder values)") |
| 449 | + return df |
| 450 | + |
| 451 | + # Debug output to show sample names in assay table |
| 452 | + print("Sample names in assay table:") |
| 453 | + for sample_name in assay_sample_names: |
| 454 | + print(f" - {sample_name}") |
| 455 | + |
| 456 | + # Generate read count values for each sample in the assay table |
| 457 | + values = [] |
| 458 | + for assay_sample in assay_sample_names: |
| 459 | + print(f"Looking for read count for sample: {assay_sample}") |
| 460 | + |
| 461 | + # Try direct match first |
| 462 | + if assay_sample in read_counts: |
| 463 | + print(f"Direct match found for {assay_sample}") |
| 464 | + values.append(str(read_counts[assay_sample])) |
| 465 | + else: |
| 466 | + # Try a more flexible match if direct match fails |
| 467 | + found_match = False |
| 468 | + for mqc_sample, count in read_counts.items(): |
| 469 | + # Check if assay sample name is contained in MultiQC sample name or vice versa |
| 470 | + if assay_sample in mqc_sample or mqc_sample in assay_sample: |
| 471 | + values.append(str(count)) |
| 472 | + found_match = True |
| 473 | + print(f"Flexible match found: {assay_sample} -> {mqc_sample} = {count}") |
| 474 | + break |
| 475 | + |
| 476 | + if not found_match: |
| 477 | + print(f"WARNING: No read count found for sample {assay_sample}") |
| 478 | + values.append("N/A") |
| 479 | + |
| 480 | + # Add the column to the dataframe |
| 481 | + df = update_column(df, column_name, values) |
| 482 | + |
| 483 | + return df |
| 484 | + |
315 | 485 | def add_parameter_column(df, column_name, value, prefix=None):
|
316 | 486 | """Add a parameter column to the dataframe if it doesn't exist already.
|
317 | 487 |
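Note on the data source for the new column: MultiQC writes its aggregated metrics to multiqc_data.json, where report_general_stats_data is a list with one dict per module, each keyed by sample name, and FastQC exposes the raw read count as total_sequences. A minimal standalone sketch of the lookup the added code performs (function name and example path are illustrative, not part of this diff):

    import json

    def collect_total_sequences(json_path):
        # Gather per-sample 'total_sequences' from a MultiQC data JSON,
        # scanning every module listed in report_general_stats_data.
        with open(json_path) as f:
            data = json.load(f)
        counts = {}
        for module_stats in data.get("report_general_stats_data", []):
            for sample, metrics in module_stats.items():
                if "total_sequences" in metrics:
                    counts[sample] = int(metrics["total_sequences"])
        return counts

    # e.g. counts = collect_total_sequences("raw_multiqc_data/multiqc_data.json")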
|
@@ -972,6 +1142,9 @@ def main():
|
972 | 1142 |
|
973 | 1143 | # Following the original star workflow order:
|
974 | 1144 |
|
| 1145 | + # 0. Add read counts from MultiQC report (new) |
| 1146 | + assay_df = add_read_counts(assay_df, args.outdir, args.glds_accession, args.assay_suffix, runsheet_df) |
| 1147 | + |
975 | 1148 | # 1. Trimmed Sequence Data column
|
976 | 1149 | assay_df = add_trimmed_data_column(assay_df, glds_prefix, runsheet_df=runsheet_df)
|
977 | 1150 |
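The sample matching inside add_read_counts() is deliberately loose: FastQC/MultiQC sample names are derived from file names and often carry read or lane suffixes that the assay table's Sample Name values lack. Pulled out as a standalone sketch (helper name is illustrative), the fallback looks like this:

    def lookup_read_count(assay_sample, read_counts):
        # Exact key match first, then a substring match in either direction,
        # mirroring the fallback used when populating Parameter Value[Read Count].
        if assay_sample in read_counts:
            return str(read_counts[assay_sample])
        for mqc_sample, count in read_counts.items():
            if assay_sample in mqc_sample or mqc_sample in assay_sample:
                return str(count)
        return "N/A"

Samples with no match keep the "N/A" placeholder, so the column is always fully populated.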
|
|