Commit 5b942ce

add read count col to update assay table
1 parent 35fdd28 commit 5b942ce

File tree

1 file changed: +173 -0 lines changed

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/update_assay_table.py

Lines changed: 173 additions & 0 deletions
@@ -37,6 +37,7 @@
 import pandas as pd
 import shutil
 import tempfile
+import json

 def parse_args():
     parser = argparse.ArgumentParser(description='Extract RNA-Seq assay table from ISA.zip')
@@ -312,6 +313,175 @@ def extract_and_find_assay(outdir, glds_accession):
     # Return both the dataframe and the filename
     return pd.read_csv(assay_path, sep='\t'), matched_file

+def add_read_counts(df, outdir, glds_accession, assay_suffix, runsheet_df=None):
+    """Add the read counts column to the dataframe.
+
+    Args:
+        df: The assay table dataframe
+        outdir: The output directory
+        glds_accession: The GLDS accession number
+        assay_suffix: The assay suffix for MultiQC report files
+        runsheet_df: Optional runsheet dataframe with sample information
+
+    Returns:
+        The modified dataframe
+    """
+    column_name = "Parameter Value[Read Count]"
+
+    # Determine if paired-end from runsheet
+    is_paired_end = is_paired_end_data(runsheet_df)
+    print(f"Data is {'paired-end' if is_paired_end else 'single-end'} based on runsheet")
+
+    # Path to raw MultiQC data zip
+    fastqc_dir = os.path.join(outdir, "00-RawData", "FastQC_Reports")
+    multiqc_data_zip = os.path.join(fastqc_dir, f"raw_multiqc{assay_suffix}_data.zip")
+
+    if not os.path.exists(multiqc_data_zip):
+        print(f"WARNING: MultiQC data zip file not found at {multiqc_data_zip}")
+        # If zip not found, just add placeholder values
+        df[column_name] = "N/A"
+        column_changes.append(f"Added: {column_name} (with placeholder values)")
+        return df
+
+    print(f"Found MultiQC data zip: {multiqc_data_zip}")
+
+    # Find the sample name column in the assay table
+    sample_col = next((col for col in df.columns if 'Sample Name' in col), None)
+    if not sample_col:
+        print("Warning: Could not find Sample Name column in assay table")
+        # If no sample column, just add placeholder values
+        df[column_name] = "N/A"
+        column_changes.append(f"Added: {column_name} (with placeholder values)")
+        return df
+
+    # Get sample names from assay table
+    assay_sample_names = df[sample_col].tolist()
+
+    # Get read counts from MultiQC data
+    read_counts = {}
+
+    # Create a temporary directory to extract files
+    with tempfile.TemporaryDirectory() as temp_dir:
+        try:
+            # Extract the zip file
+            with zipfile.ZipFile(multiqc_data_zip, 'r') as zip_ref:
+                zip_ref.extractall(temp_dir)
+
+            # Expected path to the JSON file in the extracted data directory
+            expected_dir = f"raw_multiqc{assay_suffix}_data"
+            json_path = os.path.join(temp_dir, expected_dir, "multiqc_data.json")
+
+            # Check if JSON file exists in the expected location
+            if not os.path.exists(json_path):
+                # Fallback: try to find it elsewhere
+                print(f"JSON not found at expected path: {json_path}, searching elsewhere...")
+
+                # Try directly in the temp directory
+                direct_path = os.path.join(temp_dir, "multiqc_data.json")
+                if os.path.exists(direct_path):
+                    json_path = direct_path
+                else:
+                    # Search all subdirectories
+                    for root, dirs, files in os.walk(temp_dir):
+                        if "multiqc_data.json" in files:
+                            json_path = os.path.join(root, "multiqc_data.json")
+                            print(f"Found JSON at: {json_path}")
+                            break
+
+            if not os.path.exists(json_path):
+                print("ERROR: Could not find multiqc_data.json in the extracted zip")
+                # Add placeholder values
+                df[column_name] = "N/A"
+                column_changes.append(f"Added: {column_name} (with placeholder values)")
+                return df
+
+            print(f"Using MultiQC data from: {json_path}")
+
+            # Parse the MultiQC JSON
+            with open(json_path, 'r') as f:
+                multiqc_data = json.load(f)
+
+            # First, try to extract directly from report_general_stats_data
+            if 'report_general_stats_data' in multiqc_data and multiqc_data['report_general_stats_data']:
+                stats_module = multiqc_data['report_general_stats_data'][0]  # Use first module
+                print("Extracting read counts directly from report_general_stats_data")
+
+                for sample_name, sample_data in stats_module.items():
+                    if 'total_sequences' in sample_data:
+                        read_count = int(sample_data['total_sequences'])
+                        print(f"Found count for {sample_name}: {read_count}")
+                        read_counts[sample_name] = read_count
+
+            # Fallback to FastQC module-specific extraction if needed
+            elif ('report_data_sources' in multiqc_data and
+                  'FastQC' in multiqc_data['report_data_sources']):
+
+                # Find the index for FastQC in the general stats data
+                fastqc_index = None
+                for i, module_data in enumerate(multiqc_data.get('report_general_stats_data', [])):
+                    if module_data and any('total_sequences' in sample_data for sample_data in module_data.values()):
+                        fastqc_index = i
+                        break
+
+                if fastqc_index is not None:
+                    fastqc_stats = multiqc_data['report_general_stats_data'][fastqc_index]
+
+                    # Process each sample to extract read counts
+                    for sample_name, sample_data in fastqc_stats.items():
+                        if 'total_sequences' in sample_data:
+                            read_count = int(sample_data['total_sequences'])
+                            print(f"Found count for {sample_name}: {read_count}")
+                            read_counts[sample_name] = read_count
+
+            if not read_counts:
+                print("WARNING: Could not extract any read counts from MultiQC data")
+
+            # Debug output to show read counts found
+            print(f"Successfully extracted {len(read_counts)} read counts:")
+            for sample_name, count in read_counts.items():
+                print(f" - {sample_name}: {count}")
+
+        except Exception as e:
+            print(f"Error extracting read counts from MultiQC data: {str(e)}")
+            # If an error occurs, add placeholder values
+            df[column_name] = "N/A"
+            column_changes.append(f"Added: {column_name} (with placeholder values)")
+            return df
+
+    # Debug output to show sample names in assay table
+    print("Sample names in assay table:")
+    for sample_name in assay_sample_names:
+        print(f" - {sample_name}")
+
+    # Generate read count values for each sample in the assay table
+    values = []
+    for assay_sample in assay_sample_names:
+        print(f"Looking for read count for sample: {assay_sample}")
+
+        # Try direct match first
+        if assay_sample in read_counts:
+            print(f"Direct match found for {assay_sample}")
+            values.append(str(read_counts[assay_sample]))
+        else:
+            # Try a more flexible match if direct match fails
+            found_match = False
+            for mqc_sample, count in read_counts.items():
+                # Check if assay sample name is contained in MultiQC sample name or vice versa
+                if assay_sample in mqc_sample or mqc_sample in assay_sample:
+                    values.append(str(count))
+                    found_match = True
+                    print(f"Flexible match found: {assay_sample} -> {mqc_sample} = {count}")
+                    break
+
+            if not found_match:
+                print(f"WARNING: No read count found for sample {assay_sample}")
+                values.append("N/A")
+
+    # Add the column to the dataframe
+    df = update_column(df, column_name, values)
+
+    return df
+
 def add_parameter_column(df, column_name, value, prefix=None):
     """Add a parameter column to the dataframe if it doesn't exist already.
@@ -972,6 +1142,9 @@ def main():

     # Following the original star workflow order:

+    # 0. Add read counts from MultiQC report (new)
+    assay_df = add_read_counts(assay_df, args.outdir, args.glds_accession, args.assay_suffix, runsheet_df)
+
     # 1. Trimmed Sequence Data column
     assay_df = add_trimmed_data_column(assay_df, glds_prefix, runsheet_df=runsheet_df)
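
For reference, the new Parameter Value[Read Count] column is populated from the total_sequences field that MultiQC records per sample under report_general_stats_data in multiqc_data.json. A minimal standalone sketch of that lookup, assuming an already-extracted multiqc_data.json (the path below is a hypothetical placeholder, not part of this commit):

import json

# Sketch only: pull per-sample read counts from an extracted MultiQC data JSON.
# The path is a hypothetical placeholder for an unzipped raw_multiqc*_data directory.
json_path = "raw_multiqc_data/multiqc_data.json"

with open(json_path) as f:
    multiqc_data = json.load(f)

read_counts = {}
stats_modules = multiqc_data.get("report_general_stats_data", [])
if stats_modules:
    # Each entry is a dict keyed by sample name; the first module is used here,
    # mirroring the primary extraction path in add_read_counts()
    for sample_name, sample_stats in stats_modules[0].items():
        if "total_sequences" in sample_stats:
            read_counts[sample_name] = int(sample_stats["total_sequences"])

for sample, count in read_counts.items():
    print(f"{sample}\t{count}")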
