
Commit 1d8cd42

Feat: DBDM with tests and docker and python API fixed
1 parent b15994b commit 1d8cd42

File tree

9 files changed: +361 -84 lines changed


.github/workflows/ci.yml

Lines changed: 0 additions & 29 deletions
This file was deleted.

.github/workflows/docker-publish.yml renamed to .github/workflows/cicd.yaml

Lines changed: 25 additions & 3 deletions
@@ -1,13 +1,36 @@
-name: Build and Push Docker Image
+name: CI and Docker Build
 
 on:
   push:
     branches:
       - main
+  pull_request:
 
 jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run tests
+        run: |
+          python -m unittest discover tests
+
   build:
     runs-on: ubuntu-latest
+    needs: test  # This ensures the build job only runs if the test job succeeds
 
     steps:
       - name: Checkout repository
@@ -27,5 +50,4 @@ jobs:
       with:
         context: .
         push: true
-        tags: ${{ secrets.DOCKER_USERNAME }}/simplatab-machine-learning-automator:0.7.2-TestVersion
-
+        tags: ${{ secrets.DOCKER_USERNAME }}/simplatab-machine-learning-automator:0.7.3-TestVersion
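The new test job gates the Docker build on `python -m unittest discover tests`, so any unittest module under tests/ is picked up. A minimal sketch of such a test; the file name tests/test_data_checks.py is hypothetical, and it assumes Helpers.data_checks exposes the DataChecker static methods introduced later in this commit:

# tests/test_data_checks.py -- hypothetical example of a test the new CI job would discover.
# Assumes the DataChecker static methods added in Helpers/data_checks.py in this commit.
import unittest

import pandas as pd

from Helpers.data_checks import DataChecker


class TestDataChecker(unittest.TestCase):
    def test_check_target_column_rejects_non_binary_values(self):
        df = pd.DataFrame({"ID": [1, 2], "Target": [0, 2]})
        with self.assertRaises(ValueError):
            DataChecker.check_target_column(df)

    def test_set_index_column_prefers_id(self):
        df = pd.DataFrame({"ID": [1, 2], "Target": [0, 1]})
        out = DataChecker.set_index_column(df)
        self.assertEqual(out.index.name, "ID")


if __name__ == "__main__":
    unittest.main()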

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ radiomics_setup.py
 *.so
 .vscode
 
+
 # Folders
 Feature Selection.txt
 Data

Helpers/data_checks.py

Lines changed: 42 additions & 12 deletions
@@ -18,23 +18,52 @@ def load_data(self):
 
         return train, test
 
-    def check_target_column(self, df, target_col="Target"):
+    @staticmethod
+    def check_target_column(df, target_col="Target"):
         if target_col not in df.columns:
             raise ValueError(f"The target column '{target_col}' is not present in the dataframe.")
 
         if not df[target_col].isin([0, 1]).all():
             raise ValueError(f"The target column '{target_col}' does not contain binary values 0 and 1.")
 
-    def set_index_column(self, df, index_col="ID"):
-        if index_col not in df.columns and "patient_id" not in df.columns:
-            raise ValueError(f"The index column '{index_col}' is not present in the dataframe.")
-        try:
+    @staticmethod
+    def set_index_column(df, index_col="ID"):
+
+        if index_col in df.columns.to_list():
             df.set_index(index_col, inplace=True)
-        except KeyError:
+            return df
+        elif "patient_id" in df.columns.to_list():
             df.set_index("patient_id", inplace=True)
-
-    def remove_nan_rows(self, df):
+            return df
+        else:
+            raise ValueError(f"Neither '{index_col}' nor 'patient_id' is present in the dataframe.")
+
+
+    @staticmethod
+    def remove_nan_rows(df):
         df.dropna(inplace=True)
+        return df
+
+    @staticmethod
+    def check_categorical_features(train, test):
+        categorical_cols = train.select_dtypes(include=['object', 'category']).columns
+        if categorical_cols.empty:
+            # No categorical columns to process
+            return train, test, []
+
+        cols_to_drop = []
+
+        for col in categorical_cols:
+            if col in test.columns:
+                train_unique_values = set(train[col].dropna().unique())
+                test_unique_values = set(test[col].dropna().unique())
+                if train_unique_values != test_unique_values:
+                    cols_to_drop.append(col)
+
+        train.drop(columns=cols_to_drop, inplace=True)
+        test.drop(columns=cols_to_drop, inplace=True)
+
+        return train, test, cols_to_drop
 
     def process_data(self):
         train, test = self.load_data()
@@ -43,11 +72,12 @@ def process_data(self):
         self.check_target_column(train)
 
         # Set index column
-        self.set_index_column(train)
-        self.set_index_column(test)
+        train = self.set_index_column(train)
+        test = self.set_index_column(test)
 
         # Remove rows with NaN values
-        self.remove_nan_rows(train)
-        self.remove_nan_rows(test)
+        train = self.remove_nan_rows(train)
+        test = self.remove_nan_rows(test)
+        train, test, cols_to_drop = self.check_categorical_features(train, test)
 
         return train, test
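A short sketch, not part of the commit, of how the new static methods compose on toy DataFrames; the column names and values below are invented for illustration:

# Illustration only: exercising the DataChecker static methods added above.
import pandas as pd

from Helpers.data_checks import DataChecker

train = pd.DataFrame({
    "ID": [1, 2, 3],
    "Sex": ["M", "F", "M"],      # categorical column present in both sets
    "Target": [0, 1, 1],
})
test = pd.DataFrame({
    "ID": [4, 5],
    "Sex": ["M", "Other"],       # value set differs from train, so the column gets dropped
    "Target": [1, 0],
})

DataChecker.check_target_column(train)        # raises ValueError unless Target is binary 0/1
train = DataChecker.set_index_column(train)   # indexes on 'ID' (falls back to 'patient_id')
test = DataChecker.set_index_column(test)
train = DataChecker.remove_nan_rows(train)
test = DataChecker.remove_nan_rows(test)
train, test, dropped = DataChecker.check_categorical_features(train, test)
print(dropped)  # ['Sex'], because its categories differ between train and test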

__main__.py

Lines changed: 36 additions & 3 deletions
@@ -4,10 +4,12 @@
 from Helpers.pipelines_main import train_k_fold, external_test, read_yaml
 from Helpers.data_checks import DataChecker
 from tkinter import Tk, filedialog
+from Helpers import DBDM
 
 def get_user_input():
     params = {}
-
+    params["BiasAssessment"] = input("Enable bias assessment (true/false): ").lower() == 'true'
+    params["Feature"] = str(input("Enter feature from your train and test sets columns for bias assessment: "))
     params["number_of_k_folds"] = int(input("Enter number of k-folds: "))
 
     params["apply_grid_search"] = {}
@@ -50,9 +52,40 @@ def select_folder(prompt):
     root.destroy()
     return folder_selected
 
-def main(input_folder, output_folder):
+
+def main(input_folder, output_folder, params):
     read_yaml(input_folder)
 
+    # Perform Bias Assessment
+    if params["BiasAssessment"]:
+        print("------------- \n", " Bias Detection Started \n", "-------------")
+        try:
+            print("------------- \n", " Bias Detection Started for Train.csv \n", "-------------")
+            DBDM.bias_config(
+                file_path=os.path.join(input_folder, "Train.csv"),
+                subgroup_analysis=0, # default is 0
+                facet=params["Feature"],
+                outcome='Target',
+                subgroup_col='', # default is ''
+                label_value=1, # default is 1
+            )
+            print("------------- \n", " Bias Detection Finished for Train.csv \n", "-------------")
+        except:
+            pass
+        try:
+            print("------------- \n", " Bias Detection Started for Test.csv \n", "-------------")
+            DBDM.bias_config(
+                file_path=os.path.join(input_folder, "Test.csv"),
+                subgroup_analysis=0, # default is 0
+                facet=params["Feature"],
+                outcome='Target',
+                subgroup_col='', # default is ''
+                label_value=1, # default is 1
+            )
+            print("------------- \n", " Bias Detection Finished for Test.csv \n", "-------------")
        except:
+            pass
+
     # Load data
     print("------------- \n", "Loading Data \n", "-------------")
     data_checker = DataChecker(input_folder)
@@ -92,4 +125,4 @@ def main(input_folder, output_folder):
     yaml_path = os.path.join(input_folder, "machine_learning_parameters.yaml")
     save_yaml(params, yaml_path)
 
-    main(input_folder, output_folder)
+    main(input_folder, output_folder, params)
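Both __main__.py and app.py wrap the DBDM.bias_config calls in a bare except: pass, so a failing bias run is silent. A standalone sketch of the same call with the exception surfaced instead; 'Sex' is a placeholder for the column normally supplied via params["Feature"]:

# Sketch only: the DBDM.bias_config call from the diff above, but with failures
# printed instead of silently swallowed. 'Sex' is a placeholder facet column.
import os
import traceback

from Helpers import DBDM

input_folder = "./input_data"
try:
    DBDM.bias_config(
        file_path=os.path.join(input_folder, "Train.csv"),
        subgroup_analysis=0,   # default is 0
        facet="Sex",           # placeholder; normally params["Feature"] or the selected facet
        outcome='Target',
        subgroup_col='',       # default is ''
        label_value=1,         # default is 1
    )
except Exception:
    traceback.print_exc()      # the committed code uses a bare `except: pass` here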

app.py

Lines changed: 54 additions & 35 deletions
@@ -14,14 +14,14 @@
 
 
 def get_train_columns(input_folder):
-    train_file_path = os.path.join(input_folder, "Train.csv")
+    train_file_path = os.path.join(input_folder, "Test.csv")
     df = pd.read_csv(train_file_path)
     return df.columns.tolist()
 
 
 @app.route('/')
 def index():
-    input_folder = './input_data' # specify the input folder path
+    input_folder = './input_data'
     columns = get_train_columns(input_folder)
     return render_template('index.html', columns=columns)
 
@@ -36,6 +36,7 @@ def pipeline_status_route():
 def submit():
     # Retrieve the selected facet from the form
     selected_facet = request.form.get('facet')
+    bias_assess = request.form["bias_assess"].lower() == 'true'
 
     input_folder = './input_data'
     output_folder = './Materials'
@@ -69,50 +70,64 @@ def submit():
         yaml.dump(params, file)
 
     # Redirect to run pipeline
-    return redirect(url_for('run_pipeline', selected_facet=selected_facet))
+    return redirect(url_for('run_pipeline', selected_facet=selected_facet, bias_assess=bias_assess))
 
 @app.route('/run_pipeline')
 def run_pipeline():
     global pipeline_status_message
     input_folder = "./input_data"
     output_folder = "./Materials"
     selected_facet = request.args.get('selected_facet')
+    bias_assess = request.args.get('bias_assess').lower() == 'true'
     try:
         import threading
         pipeline_status_message = "Running"
         # Run the main function asynchronously
-        threading.Thread(target=main, args=(input_folder, output_folder, selected_facet)).start()
+        threading.Thread(target=main, args=(input_folder, output_folder, selected_facet, bias_assess)).start()
         return redirect(url_for('pipeline_status'))
     except Exception as e:
         pipeline_status_message = f"Error: {e}"
         return f"An error occurred: {e}"
 
+
 @app.route('/pipeline_status')
 def pipeline_status():
     return render_template('status.html', status=pipeline_status_message)
 
-def main(input_folder, output_folder, selected_facet):
+
+def main(input_folder, output_folder, selected_facet, bias_assess=False):
     global pipeline_status_message
     # Load parameters from YAML file
     read_yaml(input_folder)
-
-    DBDM.bias_config(
-        file_path=os.path.join(input_folder, "Train.csv"),
-        subgroup_analysis=0, # default is 0
-        facet=selected_facet,
-        outcome='Target',
-        subgroup_col='', # default is ''
-        label_value=1, # default is 1
-    )
-
-    DBDM.bias_config(
-        file_path=os.path.join(input_folder, "Test.csv"),
-        subgroup_analysis=0, # default is 0
-        facet=selected_facet,
-        outcome='Target',
-        subgroup_col='', # default is ''
-        label_value=1, # default is 1
-    )
+    if bias_assess:
+        print("------------- \n", " Bias Detection Started \n", "-------------")
+        try:
+            print("------------- \n", " Bias Detection Started for Train.csv \n", "-------------")
+            DBDM.bias_config(
+                file_path=os.path.join(input_folder, "Train.csv"),
+                subgroup_analysis=0, # default is 0
+                facet=selected_facet,
+                outcome='Target',
+                subgroup_col='', # default is ''
+                label_value=1, # default is 1
+            )
+            print("------------- \n", " Bias Detection Finished for Train.csv \n", "-------------")
+        except:
+            pass
+        try:
+            print("------------- \n", " Bias Detection Started for Test.csv \n", "-------------")
+            DBDM.bias_config(
+                file_path=os.path.join(input_folder, "Test.csv"),
+                subgroup_analysis=0, # default is 0
+                facet=selected_facet,
+                outcome='Target',
+                subgroup_col='', # default is ''
+                label_value=1, # default is 1
+            )
+            print("------------- \n", " Bias Detection Finished for Test.csv \n", "-------------")
        except:
+            pass
+        print("------------- \n", " Bias Detection Finished \n", "-------------")
     # Load data
     print("------------- \n", " Loading Data \n", "-------------")
     data_checker = DataChecker(input_folder)
@@ -125,22 +140,26 @@ def main(input_folder, output_folder, selected_facet):
         print(e)
     except ValueError as e:
         print(e)
-    X_train = train.drop('Target', axis=1) # Drop the 'Target' column for X_train
-    y_train = train['Target']
-    X_test = test.drop('Target', axis=1) # Drop the 'Target' column for X_test
-    y_test = test['Target']
-    print("------------- \n", " Data Loaded successfully \n", "-------------")
+    try:
+        X_train = train.drop('Target', axis=1) # Drop the 'Target' column for X_train
+        y_train = train['Target']
+        X_test = test.drop('Target', axis=1) # Drop the 'Target' column for X_test
+        y_test = test['Target']
+        print("------------- \n", " Data Loaded successfully \n", "-------------")
 
-    # Run the pipeline
-    print("------------- \n", " Training on K-Fold cross validation with Train.csv file and parameters set on machine_learning_parameters.yaml file \n", "-------------")
+        # Run the pipeline
+        print("------------- \n", " Training on K-Fold cross validation with Train.csv file and parameters set on machine_learning_parameters.yaml file \n", "-------------")
 
-    params_dict, scores_storage, thresholds, _ = train_k_fold(X_train, y_train)
-    print("------------- \n", " Training on K-Fold cross validation with Train.csv file and parameters set on machine_learning_parameters.yaml file completed successfully \n", "-------------")
+        params_dict, scores_storage, thresholds, _ = train_k_fold(X_train, y_train)
+        print("------------- \n", " Training on K-Fold cross validation with Train.csv file and parameters set on machine_learning_parameters.yaml file completed successfully \n", "-------------")
 
-    print("------------- \n", " Evaluating algorithms on Test.csv \n", "-------------")
-    external_test(X_train, y_train, X_test, y_test, params_dict, thresholds)
+        print("------------- \n", " Evaluating algorithms on Test.csv \n", "-------------")
+        external_test(X_train, y_train, X_test, y_test, params_dict, thresholds)
 
-    pipeline_status_message = "Completed"
+        pipeline_status_message = "Completed"
+    except UnboundLocalError as e:
+        print(e)
+        pipeline_status_message = f"Error: {e}"
 
 
 if __name__ == "__main__":
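A rough sketch, again not part of the commit, of exercising the new bias_assess flag end to end through Flask's test client; it assumes app.py exposes the Flask app object at module level and that ./input_data already contains Train.csv, Test.csv, and the YAML the pipeline expects:

# Sketch only: drives /run_pipeline with the new bias_assess query parameter.
# Assumes `app` is the module-level Flask instance defined in app.py.
from app import app

with app.test_client() as client:
    # run_pipeline parses selected_facet and bias_assess from the query string,
    # starts main(...) in a background thread, then redirects to /pipeline_status.
    resp = client.get("/run_pipeline?selected_facet=Sex&bias_assess=true")
    print(resp.status_code)  # expect 302 (redirect to the status page); 'Sex' is a placeholder facet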

0 commit comments
