Updates rubric tester to generate CSV for the results data. #100

Draft · wants to merge 1 commit into base: main

2 changes: 1 addition & 1 deletion bin/prepare.rb
@@ -34,7 +34,7 @@
project_link = row.first.last

# Get the project id
project_link = project_link.match(/\/([^\/]+)\/view/)[1]
project_link = project_link.match(/\/([^\/]+)(?:\/view|\/?$)/)[1]
tokens << project_link
new_row = [project_link]

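For reference, a minimal sketch of what the broadened pattern accepts, shown in Python with the equivalent regex (the rest of this PR is Python). The new alternative `(?:\/view|\/?$)` lets the project id be extracted from links ending in `/view`, in the bare id, or in a trailing slash; the URLs below are hypothetical examples.

```python
import re

# Equivalent of the updated pattern in bin/prepare.rb: capture the last path
# segment whether the link ends in "/view", the bare id, or a trailing slash.
PROJECT_ID_PATTERN = re.compile(r'/([^/]+)(?:/view|/?$)')

# Hypothetical example links; the real ones come from the input CSV.
links = [
    "https://studio.code.org/projects/applab/abc123/view",
    "https://studio.code.org/projects/applab/abc123",
    "https://studio.code.org/projects/applab/abc123/",
]

for link in links:
    match = PROJECT_ID_PATTERN.search(link)
    print(match.group(1) if match else None)  # "abc123" in all three cases
```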
132 changes: 131 additions & 1 deletion lib/assessment/report.py
@@ -1,8 +1,12 @@
import os
import csv
import hashlib
import io
import json
import math
import os
import re
from datetime import datetime
from itertools import product, chain
from typing import List, Dict, Any
from lib.assessment.config import VALID_LABELS, PASSING_LABELS

@@ -83,6 +87,132 @@ def _generate_confusion_table(self, confusion_matrix, labels):
        confusion_table += '</table>'
        return confusion_table

    def _label_to_number(self, label):
        if 'Extensive' in label:
            return 3
        elif 'Convincing' in label:
            return 2
        elif 'Limited' in label:
            return 1

        return 0

    def _key_concept_to_filename(self, key_concept):
        ret = re.sub(r'[^\w\s]', '', key_concept)
        ret = re.sub(r'\s+', '-', ret)
        ret = re.sub(r'^-+|-+$', '', ret)
        return ret.lower()

    def generate_csv_output(self, output_path, prompt, rubric, accuracy=None, predicted_labels=None, actual_labels=None, is_pass_fail=False, accuracy_by_criteria=None, errors=[], input_params={}, confusion_by_criteria=None, overall_confusion=None, label_names=None):
        # Note: CSV files are opened with newline='' as required by the csv module

        # Keeps track of the rubric changes
        rubric_hash = hashlib.sha1(rubric.encode('utf-8')).hexdigest()

        # Keeps track of prompt changes
        prompt_hash = hashlib.sha1(prompt.encode('utf-8')).hexdigest()

        # String to append based on pass/fail vs. exact match
        matching_type = "partial" if is_pass_fail else "exact"

        # Get the list of key concepts
        key_concepts = list(map(lambda label: label['Key Concept'], list(predicted_labels.values())[0]))

        # Convert accuracy to a writable string
        accuracy = 'NaN' if accuracy is None or math.isnan(accuracy) else str(accuracy)

        output_file = os.path.join(output_path, f"{input_params['lesson_name']}-{matching_type}-metadata.csv")
        with open(output_file, 'w+', newline='') as file:
            csv_writer = csv.writer(file)

            # Write Header
            csv_writer.writerow(["RUBRIC_HASH", "PROMPT_HASH", "DATE", "IS_PASS_FAIL", "ERRORS", "ACCURACY"])

            # Write data
            is_pass_fail_value = "TRUE" if is_pass_fail else "FALSE"
            csv_writer.writerow([rubric_hash, prompt_hash, datetime.now().isoformat(), is_pass_fail_value, ';'.join(errors), accuracy])

        # Write sample report and aggregate
        output_file = os.path.join(output_path, f"{input_params['lesson_name']}-sample-accuracy.csv")
        with open(output_file, 'w+', newline='') as file:
            csv_writer = csv.writer(file)

            # Write Header
            csv_writer.writerow(["STUDENT_ID", "LEARNING_GOAL", "ACTUAL", "PREDICTED", "PASS_FAIL_DIFF", "DIFF"])

            # Go through each student and each label
            for student_id, labels in predicted_labels.items():
                for label in labels:
                    criteria = label['Key Concept']
                    actual = actual_labels[student_id][criteria]
                    predicted = label['Label']
                    diff = self._label_to_number(predicted) - self._label_to_number(actual)
                    pass_fail_diff = (self._label_to_number(predicted) // 2) - (self._label_to_number(actual) // 2)
                    csv_writer.writerow([student_id, criteria, actual, predicted, pass_fail_diff, diff])

        output_file = os.path.join(output_path, f"{input_params['lesson_name']}-{matching_type}-accuracy.csv")
        with open(output_file, 'w+', newline='') as file:
            csv_writer = csv.writer(file)

            # Write Header
            csv_writer.writerow(["LEARNING_GOAL", "ACCURACY"])

            # Write the overall accuracy (repeated from the metadata report)
            csv_writer.writerow(["OVERALL", accuracy])

            # For each learning goal, print the accuracy
            for key_concept in key_concepts:
                cur_accuracy = accuracy_by_criteria.get(key_concept)
                cur_accuracy = 'NaN' if cur_accuracy is None or math.isnan(cur_accuracy) else str(cur_accuracy)
                csv_writer.writerow([key_concept, cur_accuracy])

        labels = ["EXTENSIVE", "CONVINCING", "LIMITED", "NO"]

        # First write confusion matrix for all goals
        output_file = os.path.join(output_path, f"{input_params['lesson_name']}-{matching_type}-confusion.csv")
        with open(output_file, 'w+', newline='') as file:
            csv_writer = csv.writer(file)

            # Write Header
            if is_pass_fail:
                csv_writer.writerow(["KEY_CONCEPT", "TRUE_POSITIVE", "FALSE_NEGATIVE", "FALSE_POSITIVE", "TRUE_NEGATIVE"])
            else:
                # Take the Cartesian product of the labels with themselves to form the header
                items = list(map(lambda lst: '/'.join(lst), product(labels, labels)))
                csv_writer.writerow(["KEY_CONCEPT", *items])

            # chain() flattens the 2D matrix into a row-ordered list
            # Write all the values in the overall matrix into the CSV
            csv_writer.writerow(["OVERALL", *list(chain(*overall_confusion))])

            # Write a row for each concept as well
            for key_concept, confusion_matrix in confusion_by_criteria.items():
                csv_writer.writerow([key_concept, *list(chain(*confusion_matrix))])

        # Write learning goal accuracy reports
        for key_concept in key_concepts:
            slug = self._key_concept_to_filename(key_concept)
            output_file = os.path.join(output_path, f"{input_params['lesson_name']}-sample-accuracy-{slug}.csv")

            with open(output_file, 'w+', newline='') as file:
                csv_writer = csv.writer(file)

                # Write Header
                csv_writer.writerow(["STUDENT_ID", "ACTUAL", "PREDICTED", "PASS_FAIL_DIFF", "DIFF"])

                # Search the report data for just info relevant to this key concept
                for student_id, labels in predicted_labels.items():
                    for label in labels:
                        criteria = label['Key Concept']
                        if criteria != key_concept:
                            continue

                        actual = actual_labels[student_id][criteria]
                        predicted = label['Label']
                        diff = self._label_to_number(predicted) - self._label_to_number(actual)
                        pass_fail_diff = (self._label_to_number(predicted) // 2) - (self._label_to_number(actual) // 2)
                        csv_writer.writerow([student_id, actual, predicted, pass_fail_diff, diff])

    def generate_html_output(self, output_file, prompt, rubric, accuracy=None, predicted_labels=None, actual_labels=None, is_pass_fail=False, accuracy_by_criteria=None, errors=[], input_params={}, confusion_by_criteria=None, overall_confusion=None, label_names=None, prefix='sample_code'):
        link_base_url = f'file://{os.getcwd()}/{prefix}'
        title_suffix = 'pass-fail' if is_pass_fail else 'exact-match'
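A minimal usage sketch of the new method with toy data, assuming Report is importable from lib.assessment.report. The argument shapes mirror what rubric_tester.py passes in, but the student, concept, and accuracy values below are made up.

```python
from lib.assessment.report import Report  # assumed import path

# Made-up inputs shaped like the real ones from rubric_tester.py.
predicted = {
    "student-1": [{"Key Concept": "Program Development", "Label": "Extensive Evidence"}],
}
actual = {
    "student-1": {"Program Development": "Convincing Evidence"},
}

report = Report()
report.generate_csv_output(
    "reports",                      # output directory; must already exist
    "example prompt text",          # hashed into PROMPT_HASH
    "example rubric text",          # hashed into RUBRIC_HASH
    accuracy=50.0,
    predicted_labels=predicted,
    actual_labels=actual,
    is_pass_fail=True,
    accuracy_by_criteria={"Program Development": 50.0},
    errors=[],
    input_params={"lesson_name": "example-lesson"},
    confusion_by_criteria={"Program Development": [[0, 1], [0, 0]]},
    overall_confusion=[[0, 1], [0, 0]],
    label_names=["Extensive Evidence", "Convincing Evidence"],
)
# Writes example-lesson-partial-metadata.csv, example-lesson-sample-accuracy.csv,
# example-lesson-partial-accuracy.csv, example-lesson-partial-confusion.csv, and
# example-lesson-sample-accuracy-program-development.csv into reports/.
```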
17 changes: 17 additions & 0 deletions lib/assessment/rubric_tester.py
@@ -32,6 +32,7 @@
standard_rubric_file = 'standard_rubric.csv'
actual_labels_file = 'actual_labels.csv'
output_dir_name = 'output'
report_dir_name = 'reports'
datasets_dir = 'datasets'
cache_dir_name = 'cached_responses'
accuracy_threshold_file = 'accuracy_thresholds.json'
@@ -363,6 +364,7 @@ def main():

    # set up output and cache directories
    os.makedirs(os.path.join(params_lesson_prefix, output_dir_name), exist_ok=True)
    os.makedirs(os.path.join(params_lesson_prefix, report_dir_name), exist_ok=True)
    os.makedirs(os.path.join(params_lesson_prefix, cache_dir_name), exist_ok=True)
    if not options.use_cached:
        for file in glob.glob(f'{os.path.join(params_lesson_prefix, cache_dir_name)}/*'):
@@ -402,6 +404,21 @@ def main():
        }
    }
    report = Report()
    report.generate_csv_output(
        os.path.join(params_lesson_prefix, report_dir_name),
        prompt,
        rubric,
        accuracy=overall_accuracy_percent,
        predicted_labels=predicted_labels,
        actual_labels=actual_labels,
        is_pass_fail=is_pass_fail,
        accuracy_by_criteria=accuracy_by_criteria_percent,
        errors=errors,
        input_params=input_params,
        confusion_by_criteria=confusion_by_criteria,
        overall_confusion=overall_confusion,
        label_names=label_names,
    )
    report.generate_html_output(
        output_file,
        prompt,
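The DIFF and PASS_FAIL_DIFF columns in the sample-accuracy CSVs come from the numeric label mapping added in report.py; below is a small standalone sketch of that arithmetic (mirroring _label_to_number and the row writers), with a made-up predicted/actual pair.

```python
# Standalone sketch of the DIFF / PASS_FAIL_DIFF arithmetic used by the
# sample-accuracy reports (mirrors Report._label_to_number).
def label_to_number(label):
    if 'Extensive' in label:
        return 3
    if 'Convincing' in label:
        return 2
    if 'Limited' in label:
        return 1
    return 0

predicted, actual = "Extensive Evidence", "Limited Evidence"  # made-up pair
diff = label_to_number(predicted) - label_to_number(actual)                      # 3 - 1 = 2
pass_fail_diff = label_to_number(predicted) // 2 - label_to_number(actual) // 2  # 1 - 0 = 1
print(diff, pass_fail_diff)  # 2 1: over-graded by two levels and across the pass/fail boundary
```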