Skip to content

Commit 968458b

Browse files
committed
Training Process
1 parent 5c5eda9 commit 968458b

File tree

2 files changed

+648
-20
lines changed

2 files changed

+648
-20
lines changed

projects/aries-fl/notebooks/hospital/Hospital.ipynb

Lines changed: 271 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": 1,
6-
"id": "comfortable-blowing",
6+
"id": "intended-medicaid",
77
"metadata": {},
88
"outputs": [
99
{
@@ -12,6 +12,17 @@
1212
"text": [
1313
"IPython autoawait is `on`, and set to use `asyncio`\n"
1414
]
15+
},
16+
{
17+
"ename": "ModuleNotFoundError",
18+
"evalue": "No module named 'aries_basic_controller'",
19+
"output_type": "error",
20+
"traceback": [
21+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
22+
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
23+
"\u001b[0;32m<ipython-input-1-cf8eac349ea5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0masyncio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0maries_basic_controller\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maries_controller\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAriesAgentController\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mWEBHOOK_HOST\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"0.0.0.0\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
24+
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'aries_basic_controller'"
25+
]
1526
}
1627
],
1728
"source": [
@@ -32,11 +43,10 @@
3243
{
3344
"cell_type": "code",
3445
"execution_count": 4,
35-
"id": "weird-lunch",
46+
"id": "approved-reverse",
3647
"metadata": {},
3748
"outputs": [],
3849
"source": [
39-
"\n",
4050
"loop = asyncio.get_event_loop()\n",
4151
"loop.create_task(agent_controller.listen_webhooks())\n",
4252
"\n",
@@ -67,7 +77,7 @@
6777
},
6878
{
6979
"cell_type": "markdown",
70-
"id": "anonymous-president",
80+
"id": "amino-router",
7181
"metadata": {},
7282
"source": [
7383
"## Copy Invite from Researcher"
@@ -76,7 +86,7 @@
7686
{
7787
"cell_type": "code",
7888
"execution_count": 5,
79-
"id": "expired-double",
89+
"id": "tired-atmosphere",
8090
"metadata": {},
8191
"outputs": [],
8292
"source": [
@@ -87,7 +97,7 @@
8797
{
8898
"cell_type": "code",
8999
"execution_count": 6,
90-
"id": "finished-milan",
100+
"id": "diagnostic-colombia",
91101
"metadata": {},
92102
"outputs": [
93103
{
@@ -117,7 +127,7 @@
117127
{
118128
"cell_type": "code",
119129
"execution_count": null,
120-
"id": "objective-fabric",
130+
"id": "straight-frame",
121131
"metadata": {},
122132
"outputs": [],
123133
"source": [
@@ -130,16 +140,265 @@
130140
},
131141
{
132142
"cell_type": "code",
133-
"execution_count": null,
134-
"id": "spanish-officer",
143+
"execution_count": 6,
144+
"id": "similar-tracy",
135145
"metadata": {},
136-
"outputs": [],
137-
"source": []
146+
"outputs": [
147+
{
148+
"data": {
149+
"text/plain": [
150+
"333"
151+
]
152+
},
153+
"execution_count": 6,
154+
"metadata": {},
155+
"output_type": "execute_result"
156+
}
157+
],
158+
"source": [
159+
"# Data pre-processing\n",
160+
"\n",
161+
"import numpy as np # linear algebra\n",
162+
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
163+
"import matplotlib.pyplot as plt\n",
164+
"import seaborn as sns\n",
165+
"\n",
166+
"\n",
167+
"# prep\n",
168+
"from sklearn.model_selection import train_test_split\n",
169+
"from sklearn import preprocessing\n",
170+
"from sklearn.datasets import make_classification\n",
171+
"from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler\n",
172+
"\n",
173+
"#Read in Data\n",
174+
"train_df = pd.read_csv('hospital1.csv')\n",
175+
"\n",
176+
"\n",
177+
"########## START DATA CLEANING ###############\n",
178+
"\n",
179+
"\n",
180+
"#dealing with missing data\n",
181+
"#Let's get rid of the variables \"Timestamp\", \"comments\", \"state\" just to make our lives easier.\n",
182+
"train_df = train_df.drop(['comments'], axis= 1)\n",
183+
"train_df = train_df.drop(['state'], axis= 1)\n",
184+
"train_df = train_df.drop(['Timestamp'], axis= 1)\n",
185+
"\n",
186+
"# Assign default values for each data type\n",
187+
"defaultInt = 0\n",
188+
"defaultString = 'NaN'\n",
189+
"defaultFloat = 0.0\n",
190+
"\n",
191+
"# Create lists by data type\n",
192+
"intFeatures = ['Age']\n",
193+
"stringFeatures = ['Gender', 'Country', 'self_employed', 'family_history', 'treatment', 'work_interfere',\n",
194+
" 'no_employees', 'remote_work', 'tech_company', 'anonymity', 'leave', 'mental_health_consequence',\n",
195+
" 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview',\n",
196+
" 'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options', 'wellness_program',\n",
197+
" 'seek_help']\n",
198+
"floatFeatures = []\n",
199+
"\n",
200+
"# Clean the NaN's\n",
201+
"for feature in train_df:\n",
202+
" if feature in intFeatures:\n",
203+
" train_df[feature] = train_df[feature].fillna(defaultInt)\n",
204+
" elif feature in stringFeatures:\n",
205+
" train_df[feature] = train_df[feature].fillna(defaultString)\n",
206+
" elif feature in floatFeatures:\n",
207+
" train_df[feature] = train_df[feature].fillna(defaultFloat)\n",
208+
" else:\n",
209+
" print('Error: Feature %s not recognized.' % feature)\n",
210+
"\n",
211+
"#clean 'Gender'\n",
212+
"#Lower-case all of the column's elements\n",
213+
"gender = train_df['Gender'].str.lower()\n",
214+
"#print(gender)\n",
215+
"\n",
216+
"#Select unique elements\n",
217+
"gender = train_df['Gender'].unique()\n",
218+
"\n",
219+
"#Make gender groups\n",
220+
"male_str = [\"male\", \"m\", \"male-ish\", \"maile\", \"mal\", \"male (cis)\", \"make\", \"male \", \"man\",\"msle\", \"mail\", \"malr\",\"cis man\", \"Cis Male\", \"cis male\"]\n",
221+
"trans_str = [\"trans-female\", \"something kinda male?\", \"queer/she/they\", \"non-binary\",\"nah\", \"all\", \"enby\", \"fluid\", \"genderqueer\", \"androgyne\", \"agender\", \"male leaning androgynous\", \"guy (-ish) ^_^\", \"trans woman\", \"neuter\", \"female (trans)\", \"queer\", \"ostensibly male, unsure what that really means\"]\n",
222+
"female_str = [\"cis female\", \"f\", \"female\", \"woman\", \"femake\", \"female \",\"cis-female/femme\", \"female (cis)\", \"femail\"]\n",
223+
"\n",
224+
"for (row, col) in train_df.iterrows():\n",
225+
"\n",
226+
" if str.lower(col.Gender) in male_str:\n",
227+
" train_df['Gender'].replace(to_replace=col.Gender, value='male', inplace=True)\n",
228+
"\n",
229+
" if str.lower(col.Gender) in female_str:\n",
230+
" train_df['Gender'].replace(to_replace=col.Gender, value='female', inplace=True)\n",
231+
"\n",
232+
" if str.lower(col.Gender) in trans_str:\n",
233+
" train_df['Gender'].replace(to_replace=col.Gender, value='trans', inplace=True)\n",
234+
"\n",
235+
"#Get rid of junk 'Gender' entries\n",
236+
"stk_list = ['A little about you', 'p']\n",
237+
"train_df = train_df[~train_df['Gender'].isin(stk_list)]\n",
238+
"\n",
239+
"#complete missing age with median\n",
240+
"train_df['Age'].fillna(train_df['Age'].median(), inplace = True)\n",
241+
"\n",
242+
"# Replace values < 18 and > 120 with median()\n",
243+
"s = pd.Series(train_df['Age'])\n",
244+
"s[s<18] = train_df['Age'].median()\n",
245+
"train_df['Age'] = s\n",
246+
"s = pd.Series(train_df['Age'])\n",
247+
"s[s>120] = train_df['Age'].median()\n",
248+
"train_df['Age'] = s\n",
249+
"\n",
250+
"#Ranges of Age\n",
251+
"train_df['age_range'] = pd.cut(train_df['Age'], [0,20,30,65,100], labels=[\"0-20\", \"21-30\", \"31-65\", \"66-100\"], include_lowest=True)\n",
252+
"\n",
253+
"#There are only 0.20% of work_interfere values missing, so let's change NaN to \"Don't know\"\n",
254+
"#Replace \"NaN\" string from defaultString\n",
255+
"\n",
256+
"train_df['work_interfere'] = train_df['work_interfere'].replace([defaultString], 'Don\\'t know' )\n",
257+
"\n",
258+
"#Encoding data\n",
259+
"labelDict = {}\n",
260+
"for feature in train_df:\n",
261+
" le = preprocessing.LabelEncoder()\n",
262+
" le.fit(train_df[feature])\n",
263+
" le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))\n",
264+
" train_df[feature] = le.transform(train_df[feature])\n",
265+
" # Get labels\n",
266+
" labelKey = 'label_' + feature\n",
267+
" labelValue = [*le_name_mapping]\n",
268+
" labelDict[labelKey] =labelValue\n",
269+
"\n",
270+
"#Get rid of 'Country'\n",
271+
"train_df = train_df.drop(['Country'], axis= 1)\n",
272+
"\n",
273+
"# Scaling Age\n",
274+
"scaler = MinMaxScaler()\n",
275+
"train_df['Age'] = scaler.fit_transform(train_df[['Age']])\n",
276+
"\n",
277+
"# define X and y\n",
278+
"feature_cols = ['Age', 'Gender', 'family_history', 'benefits', 'care_options', 'anonymity', 'leave', 'work_interfere']\n",
279+
"X = train_df[feature_cols]\n",
280+
"y = train_df.treatment\n",
281+
"\n",
282+
"# use all of X and y as the training set (no train/test split is performed here)\n",
283+
"X_train, y_train = X, y\n",
284+
"\n",
285+
"# Transform pandas dataframe to torch tensor for DL\n",
286+
"\n",
287+
"x_train_data = torch.from_numpy(X_train.values)\n",
288+
"x_train_data = x_train_data.float()\n",
289+
"\n",
290+
"y_train_data = []\n",
291+
"for data in y_train.values:\n",
292+
" y_train_data.append([data])\n",
293+
"y_train_data = torch.tensor(y_train_data).float()\n",
294+
"\n",
295+
"len(y_train_data)"
296+
]
297+
},
298+
{
299+
"cell_type": "code",
300+
"execution_count": 7,
301+
"id": "interesting-marathon",
302+
"metadata": {},
303+
"outputs": [
304+
{
305+
"name": "stdout",
306+
"output_type": "stream",
307+
"text": [
308+
"/Users/pavlito/PyDentity/projects/aries-fl/notebooks/hospital/../model.pt\n",
309+
"HOSPITAL MODEL LOADED\n",
310+
"HOSPITAL IS TRAINING\n",
311+
"loss at epoch 0 : tensor(0.2684)\n",
312+
"loss at epoch 5000 : tensor(0.1264)\n",
313+
"loss at epoch 10000 : tensor(0.1152)\n",
314+
"loss at epoch 15000 : tensor(0.1114)\n",
315+
"loss at epoch 20000 : tensor(0.1083)\n",
316+
"loss at epoch 25000 : tensor(0.1058)\n",
317+
"loss at epoch 30000 : tensor(0.1040)\n",
318+
"loss at epoch 35000 : tensor(0.1023)\n",
319+
"loss at epoch 40000 : tensor(0.1008)\n",
320+
"loss at epoch 45000 : tensor(0.0996)\n"
321+
]
322+
}
323+
],
324+
"source": [
325+
"import torch\n",
326+
"import traceback\n",
327+
"\n",
328+
"# models\n",
329+
"from torch import nn\n",
330+
"from torch import optim\n",
331+
"from torch.autograd import Variable\n",
332+
"import os\n",
333+
"import sys\n",
334+
"\n",
335+
"# Receive model from the Researcher and train it\n",
336+
"\n",
337+
"model_dir = os.getcwd() + \"/../model.pt\"\n",
338+
"\n",
339+
"print(model_dir)\n",
340+
"\n",
341+
"# Pull in model\n",
342+
"try:\n",
343+
" model = torch.load(model_dir)\n",
344+
"except Exception as e:\n",
345+
" print(\"HOSPITAL FAILED TO LOAD MODEL\")\n",
346+
" print(\"Exception Value: \",e)\n",
347+
" print(\"Traceback \",traceback.format_exc())\n",
348+
"# return False\n",
349+
"\n",
350+
"print(\"HOSPITAL MODEL LOADED\")\n",
351+
"\n",
352+
"\n",
353+
"# Training Logic\n",
354+
"print(\"HOSPITAL IS TRAINING\")\n",
355+
"\n",
356+
"# Define Optimizer\n",
357+
"opt = optim.SGD(params=model.parameters(), lr=0.1)\n",
358+
" \n",
359+
"# opt = torch.optim.SGD(model.parameters(), lr=0.05)\n",
360+
"\n",
361+
"\n",
362+
"# Apply Differential Privacy\n",
363+
"\n",
364+
"#privacy_engine = PrivacyEngine(model, batch_size=333, sample_size=1000, alphas=[10, 100], \n",
365+
"# noise_multiplier=1.3, max_grad_norm=1.0)\n",
366+
"\n",
367+
"# privacy_engine.attach(opt)\n",
368+
"\n",
369+
"for iter in range(50000):\n",
370+
"\n",
371+
" # 1) erase previous gradients (if they exist)\n",
372+
" opt.zero_grad()\n",
373+
" # log_msg(\"TRAIN DATA\", x_train_data)\n",
374+
"\n",
375+
" # 2) make a prediction\n",
376+
" pred = model(x_train_data)\n",
377+
"\n",
378+
" # 3) calculate how much we missed\n",
379+
" loss = (((y_train_data - pred) ** 2).sum()) / len(x_train_data)\n",
380+
"\n",
381+
" # 4) figure out which weights caused us to miss\n",
382+
" loss.backward()\n",
383+
"\n",
384+
" # 5) change those weights\n",
385+
" opt.step()\n",
386+
"\n",
387+
" # 6) log_msg our progress\n",
388+
" if (iter % 5000 == 0):\n",
389+
" print(\"loss at epoch \", iter, \": \", loss.data)\n",
390+
"\n",
391+
"torch.save(model, \"../trained_model.pt\")\n",
392+
"\n",
393+
"# Detach the Differential Privacy library from the Optimizer (we may not need this at all)\n",
394+
"# privacy_engine.detach()\n",
395+
"\n"
396+
]
138397
},
139398
{
140399
"cell_type": "code",
141400
"execution_count": null,
142-
"id": "laughing-demand",
401+
"id": "martial-military",
143402
"metadata": {},
144403
"outputs": [],
145404
"source": [

0 commit comments

Comments
 (0)