didx-xyz
diff --git a/‎projects/aries-fl/notebooks/hospital/Hospital.ipynb
Lines changed: 271 additions & 12 deletions b/‎projects/aries-fl/notebooks/hospital/Hospital.ipynb
Lines changed: 271 additions & 12 deletions
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "comfortable-blowing",
+   "id": "intended-medicaid",
    "metadata": {},
    "outputs": [
     {
@@ -12,6 +12,17 @@
      "text": [
       "IPython autoawait is `on`, and set to use `asyncio`\n"
      ]
+    },
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'aries_basic_controller'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-1-cf8eac349ea5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0masyncio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0maries_basic_controller\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maries_controller\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAriesAgentController\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mWEBHOOK_HOST\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"0.0.0.0\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'aries_basic_controller'"
+     ]
     }
    ],
    "source": [
@@ -32,11 +43,10 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "weird-lunch",
+   "id": "approved-reverse",
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "loop = asyncio.get_event_loop()\n",
     "loop.create_task(agent_controller.listen_webhooks())\n",
     "\n",
@@ -67,7 +77,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "anonymous-president",
+   "id": "amino-router",
    "metadata": {},
    "source": [
     "## Copy Invite from Researcher"
@@ -76,7 +86,7 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "expired-double",
+   "id": "tired-atmosphere",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -87,7 +97,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "finished-milan",
+   "id": "diagnostic-colombia",
    "metadata": {},
    "outputs": [
     {
@@ -117,7 +127,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "objective-fabric",
+   "id": "straight-frame",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -130,16 +140,265 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "spanish-officer",
+   "execution_count": 6,
+   "id": "similar-tracy",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "333"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Data pre-processing\n",
+    "\n",
+    "import numpy as np # linear algebra\n",
+    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "\n",
+    "# prep\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn import preprocessing\n",
+    "from sklearn.datasets import make_classification\n",
+    "from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler\n",
+    "\n",
+    "# Load this hospital's local dataset from the notebook's working directory\n",
+    "train_df = pd.read_csv('hospital1.csv')\n",
+    "\n",
+    "\n",
+    "########## START DATA CLEANING ###############\n",
+    "\n",
+    "\n",
+    "# Drop the 'comments', 'state' and 'Timestamp' columns in a single call;\n",
+    "# a missing column still raises KeyError, as the separate drops did.\n",
+    "train_df = train_df.drop(columns=['comments', 'state', 'Timestamp'])\n",
+    "\n",
+    "# Placeholder substituted for a missing value, one per kind of column\n",
+    "defaultInt = 0\n",
+    "defaultString = 'NaN'\n",
+    "defaultFloat = 0.0\n",
+    "\n",
+    "# Column names grouped by the placeholder they receive\n",
+    "intFeatures = ['Age']\n",
+    "stringFeatures = [\n",
+    "    'Gender', 'Country', 'self_employed', 'family_history', 'treatment', 'work_interfere',\n",
+    "    'no_employees', 'remote_work', 'tech_company', 'anonymity', 'leave', 'mental_health_consequence',\n",
+    "    'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview',\n",
+    "    'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options', 'wellness_program',\n",
+    "    'seek_help',\n",
+    "]\n",
+    "floatFeatures = []\n",
+    "\n",
+    "# Fill the gaps column by column; a column listed in none of the groups is\n",
+    "# reported and left untouched.\n",
+    "for feature in train_df:\n",
+    "    if feature in intFeatures:\n",
+    "        fill_value = defaultInt\n",
+    "    elif feature in stringFeatures:\n",
+    "        fill_value = defaultString\n",
+    "    elif feature in floatFeatures:\n",
+    "        fill_value = defaultFloat\n",
+    "    else:\n",
+    "        print('Error: Feature %s not recognized.' % feature)\n",
+    "        continue\n",
+    "    train_df[feature] = train_df[feature].fillna(fill_value)\n",
+    "\n",
+    "# Clean 'Gender': free-text answers are folded into 'male' / 'female' / 'trans'.\n",
+    "# Matching is done on the lower-cased answer, so only lower-case spellings can match.\n",
+    "\n",
+    "# Distinct raw answers, kept for inspection\n",
+    "gender = train_df['Gender'].unique()\n",
+    "\n",
+    "# Spellings accepted for each group\n",
+    "male_str = [\"male\", \"m\", \"male-ish\", \"maile\", \"mal\", \"male (cis)\", \"make\", \"male \", \"man\",\"msle\", \"mail\", \"malr\",\"cis man\", \"Cis Male\", \"cis male\"]\n",
+    "trans_str = [\"trans-female\", \"something kinda male?\", \"queer/she/they\", \"non-binary\",\"nah\", \"all\", \"enby\", \"fluid\", \"genderqueer\", \"androgyne\", \"agender\", \"male leaning androgynous\", \"guy (-ish) ^_^\", \"trans woman\", \"neuter\", \"female (trans)\", \"queer\", \"ostensibly male, unsure what that really means\"]\n",
+    "female_str = [\"cis female\", \"f\", \"female\", \"woman\",  \"femake\", \"female \",\"cis-female/femme\", \"female (cis)\", \"femail\"]\n",
+    "\n",
+    "# Single lookup table: lower-cased spelling -> canonical label\n",
+    "gender_lookup = {}\n",
+    "for label, spellings in (('male', male_str), ('female', female_str), ('trans', trans_str)):\n",
+    "    gender_lookup.update(dict.fromkeys(spellings, label))\n",
+    "\n",
+    "# Vectorised replacement; answers without a lookup entry keep their original text.\n",
+    "# The column is assigned back instead of using a chained inplace replace, which\n",
+    "# does not update the frame when pandas Copy-on-Write is enabled.\n",
+    "gender_clean = train_df['Gender'].str.lower().map(gender_lookup)\n",
+    "train_df['Gender'] = gender_clean.fillna(train_df['Gender'])\n",
+    "\n",
+    "# Drop rows whose answer is not a gender at all; .copy() makes the result an\n",
+    "# independent frame so later column assignments do not target a slice.\n",
+    "stk_list = ['A little about you', 'p']\n",
+    "train_df = train_df[~train_df['Gender'].isin(stk_list)].copy()\n",
+    "\n",
+    "# Fill missing ages with the median age. The column is assigned back because a\n",
+    "# chained fillna(..., inplace=True) does not update the frame under Copy-on-Write.\n",
+    "train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())\n",
+    "\n",
+    "# Replace implausible ages (< 18, then > 120) with the median. The median is\n",
+    "# recomputed before each step, so the second step sees the result of the first.\n",
+    "train_df.loc[train_df['Age'] < 18, 'Age'] = train_df['Age'].median()\n",
+    "train_df.loc[train_df['Age'] > 120, 'Age'] = train_df['Age'].median()\n",
+    "\n",
+    "# Ranges of Age\n",
+    "# NOTE(review): ages in (100, 120] pass the filter above but lie outside these bins,\n",
+    "# so pd.cut leaves their age_range missing - confirm this is intended.\n",
+    "train_df['age_range'] = pd.cut(train_df['Age'], [0,20,30,65,100], labels=[\"0-20\", \"21-30\", \"31-65\", \"66-100\"], include_lowest=True)\n",
+    "\n",
+    "# Missing 'work_interfere' answers carry the defaultString placeholder;\n",
+    "# relabel that placeholder as \"Don't know\".\n",
+    "train_df['work_interfere'] = train_df['work_interfere'].replace(defaultString, \"Don't know\")\n",
+    "\n",
+    "# Label-encode every column in place and record each column's original labels\n",
+    "labelDict = {}\n",
+    "for feature in train_df.columns:\n",
+    "    le = preprocessing.LabelEncoder().fit(train_df[feature])\n",
+    "    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))\n",
+    "    train_df[feature] = le.transform(train_df[feature])\n",
+    "    # The mapping's keys are the original labels, in the encoder's class order\n",
+    "    labelKey = 'label_' + feature\n",
+    "    labelValue = list(le_name_mapping)\n",
+    "    labelDict[labelKey] = labelValue\n",
+    "\n",
+    "# 'Country' is not among the model inputs below, so remove it\n",
+    "train_df = train_df.drop(columns=['Country'])\n",
+    "\n",
+    "# Min-max scale the 'Age' column\n",
+    "scaler = MinMaxScaler()\n",
+    "train_df['Age'] = scaler.fit_transform(train_df[['Age']])\n",
+    "\n",
+    "# Model inputs (X) and prediction target (y)\n",
+    "feature_cols = ['Age', 'Gender', 'family_history', 'benefits', 'care_options', 'anonymity', 'leave', 'work_interfere']\n",
+    "X = train_df[feature_cols]\n",
+    "y = train_df['treatment']\n",
+    "\n",
+    "# No hold-out split is made here: every row is used for training\n",
+    "X_train = X\n",
+    "y_train = y\n",
+    "\n",
+    "# Transform pandas dataframe to torch tensors for DL\n",
+    "\n",
+    "# torch is imported here because this cell uses it, while the notebook's other\n",
+    "# torch imports live in the later training cell; without this import the cell\n",
+    "# raises NameError on a fresh kernel.\n",
+    "import torch\n",
+    "\n",
+    "# Features as a float32 tensor\n",
+    "x_train_data = torch.from_numpy(X_train.values).float()\n",
+    "\n",
+    "# Targets as a float32 column (one row per sample), built without a Python loop\n",
+    "y_train_data = torch.tensor(y_train.values).float().reshape(-1, 1)\n",
+    "\n",
+    "len(y_train_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "interesting-marathon",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/Users/pavlito/PyDentity/projects/aries-fl/notebooks/hospital/../model.pt\n",
+      "HOSPITAL MODEL LOADED\n",
+      "HOSPITAL IS TRAINING\n",
+      "loss at epoch  0 :  tensor(0.2684)\n",
+      "loss at epoch  5000 :  tensor(0.1264)\n",
+      "loss at epoch  10000 :  tensor(0.1152)\n",
+      "loss at epoch  15000 :  tensor(0.1114)\n",
+      "loss at epoch  20000 :  tensor(0.1083)\n",
+      "loss at epoch  25000 :  tensor(0.1058)\n",
+      "loss at epoch  30000 :  tensor(0.1040)\n",
+      "loss at epoch  35000 :  tensor(0.1023)\n",
+      "loss at epoch  40000 :  tensor(0.1008)\n",
+      "loss at epoch  45000 :  tensor(0.0996)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import traceback\n",
+    "\n",
+    "# models\n",
+    "from torch import nn\n",
+    "from torch import optim\n",
+    "from torch.autograd import Variable\n",
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "# Receive model from the Researcher and train it\n",
+    "\n",
+    "model_dir = os.getcwd() + \"/../model.pt\"\n",
+    "\n",
+    "print(model_dir)\n",
+    "\n",
+    "# Pull in model\n",
+    "# NOTE(security): torch.load unpickles the file, which can execute arbitrary code;\n",
+    "# only load a model file that comes from a trusted Researcher.\n",
+    "try:\n",
+    "    model = torch.load(model_dir)\n",
+    "except Exception as e:\n",
+    "    print(\"HOSPITAL FAILED TO LOAD MODEL\")\n",
+    "    print(\"Exception Value: \",e)\n",
+    "    print(\"Traceback \",traceback.format_exc())\n",
+    "    # Stop here: carrying on would report success and then fail on the undefined `model`\n",
+    "    raise\n",
+    "else:\n",
+    "    print(\"HOSPITAL MODEL LOADED\")\n",
+    "\n",
+    "\n",
+    "# Training Logic\n",
+    "print(\"HOSPITAL IS TRAINING\")\n",
+    "\n",
+    "# Training settings\n",
+    "LEARNING_RATE = 0.1\n",
+    "N_EPOCHS = 50000\n",
+    "LOG_EVERY = 5000\n",
+    "\n",
+    "# Define Optimizer\n",
+    "opt = optim.SGD(params=model.parameters(), lr=LEARNING_RATE)\n",
+    "\n",
+    "# Differential Privacy is currently disabled. To apply it, attach a PrivacyEngine\n",
+    "# to the optimizer before training:\n",
+    "# privacy_engine = PrivacyEngine(model, batch_size=333, sample_size=1000, alphas=[10, 100],\n",
+    "#                                noise_multiplier=1.3, max_grad_norm=1.0)\n",
+    "# privacy_engine.attach(opt)\n",
+    "\n",
+    "# The loop variable is `epoch` rather than `iter`, so the builtin iter() is not\n",
+    "# shadowed for the cells that run after this one.\n",
+    "for epoch in range(N_EPOCHS):\n",
+    "\n",
+    "    # 1) erase previous gradients (if they exist)\n",
+    "    opt.zero_grad()\n",
+    "\n",
+    "    # 2) make a prediction\n",
+    "    pred = model(x_train_data)\n",
+    "\n",
+    "    # 3) calculate how much we missed: summed squared error divided by the number of samples\n",
+    "    loss = (((y_train_data - pred) ** 2).sum()) / len(x_train_data)\n",
+    "\n",
+    "    # 4) figure out which weights caused us to miss\n",
+    "    loss.backward()\n",
+    "\n",
+    "    # 5) change those weights\n",
+    "    opt.step()\n",
+    "\n",
+    "    # 6) log our progress\n",
+    "    if epoch % LOG_EVERY == 0:\n",
+    "        print(\"loss at epoch \", epoch, \": \", loss.data)\n",
+    "\n",
+    "# Save the locally trained model one directory above the working directory\n",
+    "torch.save(model, \"../trained_model.pt\")\n",
+    "\n",
+    "# Detach the Differential Privacy library from the Optimizer (only needed if it was attached)\n",
+    "# privacy_engine.detach()\n"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "laughing-demand",
+   "id": "martial-military",
    "metadata": {},
    "outputs": [],
    "source": [