diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index 4d8b931..7e79de5 100644 --- a/exercises/01_penguin_classification.ipynb +++ b/exercises/01_penguin_classification.ipynb @@ -23,20 +23,23 @@ "### Task 1: look at the data\n", "In the following code block, we import the ``load_penguins`` function from the ``palmerpenguins`` package.\n", "\n", - "- Call this function, which returns a single object, and assign it to the variable ``data``.\n", - " - Print ``data`` and recognise that ``load_penguins`` has returned a ``pandas.DataFrame``.\n", - "- Consider which features it might make sense to use in order to classify the species of the penguins.\n", - " - You can print the column titles using ``pd.DataFrame.keys()``\n", - " - You can also obtain useful information using ``pd.DataFrame.Series.describe()``" + "- Call this function, which returns a single object in the form of a ``pandas.DataFrame``, and assign it to the variable ``data``.\n", + " - Print ``data`` and recognise that ``load_penguins`` has returned the dataframe.\n", + "- Analyse which features it might make sense to use in order to classify the species of the penguins.\n", + " - You can print the column names using ``pd.DataFrame.keys()``\n", + " - You can also obtain useful statistical information on the dataset using ``pd.DataFrame.Series.describe()``" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from palmerpenguins import load_penguins" + "from palmerpenguins import load_penguins\n", + "\n", + "# Load the penguin data\n", + "penguins = load_penguins()\n" ] }, { @@ -402,7 +405,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -416,7 +419,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.4" } }, 
"nbformat": 4, diff --git a/slides/slides.qmd b/slides/slides.qmd index a8861f4..6d30874 100644 --- a/slides/slides.qmd +++ b/slides/slides.qmd @@ -1,6 +1,6 @@ --- -title: "Introduction to Machine Learning with PyTorch" -subtitle: "NCAS & ICCS Summer Schools 2023" +title: "Introduction to Neural Networks with PyTorch" +subtitle: "ICCS Summer School 2024" format: revealjs: embed-resources: true @@ -12,12 +12,12 @@ format: theme: [dark, custom.scss] render-on-save: true authors: - - name: Jack Atkinson - orcid: 0000-0001-5001-4812 + - name: Matt Archer + orcid: 0009-0002-7043-6769 affiliations: ICCS/Cambridge - - name: Jim Denholm - affiliations: Cambridge - orcid: 0000-0002-2389-3134 + - name: Surbhi Goel + affiliations: ICCS/Cambridge + orcid: 0009-0005-0237-756X revealjs-plugins: - attribution --- @@ -61,6 +61,17 @@ Helping Today: # Part 1: Neural-network basics -- and fun applications. +## Machine learning + +- Machine learns underlying patterns and relations in given data to produce an output. + +- Machine learning is accomplished by learning a mathematical function that can represent the data. + + +## Types of Machine learning + +- Learning can be supervised, unsupervised, semi-supervised, self-supervised, reinforcement etc. depending on the task in hand. + ## Stochastic gradient descent (SGD) @@ -253,6 +264,10 @@ Image source: [3Blue1Brown](https://www.3blue1brown.com/topics/neural-networks) ::: +## The Learning process summarised + +![](ModelLearning.png) + # Python and PyTorch {.smaller} - In this workshop-lecture-thing, we will implement some straightforward neural networks in PyTorch, and use them for different classification and regression problems. 
@@ -261,6 +276,40 @@ Image source: [3Blue1Brown](https://www.3blue1brown.com/topics/neural-networks) - See the PyTorch website: [https://pytorch.org/](https://pytorch.org/) + +## Getting to the Exercise + *GitHub Repository Cloning* + +- Navigate to +[https://tinyurl.com/ml-iccs-24](https://tinyurl.com/ml-iccs-24) +- Go to terminal and type in the below command +- `git clone https://github.com/Cambridge-ICCS/practical-ml-with-pytorch` + +## Using online platform +*Using Colab* + +`https://tinyurl.com/4arrjjt5` + +*Using Jupyter Notebook* + +- Download the repo from the [https://tinyurl.com/ml-iccs-24](https://tinyurl.com/ml-iccs-24) +- `cd practical-ml-with-pytorch` +- `jupyter notebook` + +## Creating a virtual environment +*Installing with venv* + +- `python3 -m venv venv` +- `source venv/bin/activate` + +*Installing with conda* + +- `conda create -n ml-workshop "python>=3.9.10"` +- `conda activate ml-workshop` +- `cd practical-ml-with-pytorch` +- `pip install . ` + + # Exercises diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index c8666eb..01277b3 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -13,6 +13,8 @@ "\n", "In this exercise, we will use the python package [``palmerpenguins``](https://github.com/mcnakhaee/palmerpenguins) to supply a toy dataset containing various features and measurements of penguins.\n", "\n", + "The penguin dataset is a set of real-life observations collected from a research centre called Palmer Station in Antarctica by Dr. Kristen Gorman. The dataset consists of 7 variables, out of which 4 are numerical and 3 are categorical which we will see just now. The task here is to classify penguins' species based on their physical characteristics and a few other variables that might help the classification. 
\n", + "\n", "We have already created a PyTorch dataset which yields data for each of the penguins, but first we should examine the dataset and see what it contains." ] }, @@ -23,16 +25,51 @@ "### Task 1: look at the data\n", "In the following code block, we import the ``load_penguins`` function from the ``palmerpenguins`` package.\n", "\n", - "- Call this function, which returns a single object, and assign it to the variable ``data``.\n", - " - Print ``data`` and recognise that ``load_penguins`` has returned a ``pandas.DataFrame``.\n", + "- Call this function, which returns a single object in the form of a ``pandas.DataFrame``, and assign it to the variable ``data``.\n", + " - Print ``data`` and recognise that ``load_penguins`` has returned the dataframe.\n", "- Consider which features it might make sense to use in order to classify the species of the penguins.\n", - " - You can print the column titles using ``pd.DataFrame.keys()``\n", - " - You can also obtain useful information using ``pd.DataFrame.Series.describe()``" + " - You can print the column names using ``pd.DataFrame.keys()``\n", + " - You can also obtain useful statistical information on the dataset using ``pd.DataFrame.Series.describe()``" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " species island bill_length_mm bill_depth_mm flipper_length_mm \\\n", + "0 Adelie Torgersen 39.1 18.7 181.0 \n", + "1 Adelie Torgersen 39.5 17.4 186.0 \n", + "2 Adelie Torgersen 40.3 18.0 195.0 \n", + "3 Adelie Torgersen NaN NaN NaN \n", + "4 Adelie Torgersen 36.7 19.3 193.0 \n", + "\n", + " body_mass_g sex year \n", + "0 3750.0 male 2007 \n", + "1 3800.0 female 2007 \n", + "2 3250.0 female 2007 \n", + "3 NaN NaN 2007 \n", + "4 3450.0 female 2007 \n" + ] + } + ], + "source": [ + "from palmerpenguins import load_penguins\n", + "\n", + "data = load_penguins()\n", + "\n", + "print(data.head())\n", + "\n", + "# 
print(data.keys())" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -57,23 +94,14 @@ "25% 2007.000000 \n", "50% 2008.000000 \n", "75% 2009.000000 \n", - "max 2009.000000 \n", - "Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',\n", - " 'flipper_length_mm', 'body_mass_g', 'sex', 'year'],\n", - " dtype='object')\n" + "max 2009.000000 \n" ] } ], "source": [ - "from palmerpenguins import load_penguins\n", - "\n", - "data = load_penguins()\n", - "\n", "# Note: ``pd.DataFrame.describe`` is a useful function for giving an overview\n", "# of what a ``pd.DataFrame`` contains.\n", - "print(data.describe())\n", - "\n", - "print(data.keys())" + "print(data.describe())" ] }, { @@ -97,7 +125,7 @@ "\n", "#### Let's reject\n", "- ``\"island\"``\n", - " - While island is likely to be predictive, it seems potentially misleading to use this feature. One island could be heavily dominated by one species of penguin, while other species abide there in much smaller numbers. Such a situation could result in a model giving too much weight to this feature, and confounding the results.\n", + " - While island is likely to be predictive, it seems potentially misleading to use this feature. One island could be heavily dominated by one species of penguin, while other species abide there in much smaller numbers. Such a situation could result in a model giving too much weight to this feature, and confounding the results. \n", "- ``\"year\"``\n", " - This feature could also be important: then behaviour of certain species may be changing in response to time-dependent environmental factors such as melting ice. It does however seem like the least biologically-relevant feature, and the most likely source of bias, so we reject it." ] @@ -108,34 +136,108 @@ "source": [ "### Task 2: creating a ``torch.utils.data.Dataset``\n", "\n", - "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. 
To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", + "To be able to use PyTorch functionalities, we need to make the dataset compatible with PyTorch. We do it using PyTorch's Dataset class called ``torch.utils.data.Dataset``. \n", + "\n", + "To make a custom dataset, create a new class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods, such as ``__len__`` and ``__getitem__``) and supply data.\n", "\n", "Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n", "\n", - "- Open the file ``src/ml_workshop/_penguins.py``.\n", + "- Open the above-mentioned file.\n", "- Let's examine, and discuss, each of the methods together.\n", " - ``__len__``\n", " - What does the ``__len__`` method do?\n", - " - The ``__len__`` method is a so-called \"magic method\", which tells python to do if the ``len`` function is called on the object containing it.\n", + " - The ``__len__`` method is a so-called \"magic method\" in Python that defines what happens when the ``len`` function is called on an object.\n", " - ``__getitem__``\n", " - What does the ``__getitem__`` method do?\n", " - The ``__getitem__`` method is another magic method which tells python what to do if we try and index the object containing it (i.e. ``my_object[idx]``).\n", "- Review and discuss the class arguments.\n", - " - ``input_keys``— A sequence of strings telling the data set which objects to return as inputs to the model.\n", - " - ``target_keys``— Same as ``input_keys`` but specifying the targets.\n", + " - ``input_keys``— A sequence of strings telling the data set which objects to return as inputs to the model. 
These are basically the input column names.\n", + " - ``target_keys``— Same as ``input_keys`` but specifying the targets columns.\n", " - ``train``— A boolean variable determining if the model returns the training or validation split (``True`` for training).\n", - " - ``x_tfms``— A ``Compose`` object with functions which will convert the raw input to a tensor. This argument is _optional_.\n", + " - ``x_tfms``— A ``Compose`` object with functions which will convert the raw input to a tensor. This argument is _optional_. Recall that Pytorch deals with `torch.Tensors` only.\n", " - ``y_tfms``— A ``Compose`` object with functions which will convert the raw target to a tensor. This argument is _optional_." ] }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional, List, Dict, Tuple, Any\n", + "\n", + "# import pytorch functions necessary for transformations:\n", + "from torch import tensor, float32, eye\n", + "\n", + "from torch.utils.data import Dataset\n", + "from torchvision.transforms import Compose\n", + "\n", + "from pandas import DataFrame\n", + "\n", + "from palmerpenguins import load_penguins\n", + "\n", + "\n", + "class PenguinDataset(Dataset):\n", + " def __init__(\n", + " self,\n", + " input_keys: List[str],\n", + " target_keys: List[str],\n", + " train: bool,\n", + " ):\n", + " \"\"\"Build ``PenguinDataset``.\"\"\"\n", + " self.input_keys = input_keys\n", + " self.target_keys = target_keys\n", + "\n", + " data = load_penguins()\n", + " data = (\n", + " data.loc[~data.isna().any(axis=1)]\n", + " .sort_values(by=sorted(data.keys()))\n", + " .reset_index(drop=True)\n", + " )\n", + " # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n", + " data.sex = (data.sex == \"male\").astype(float)\n", + " self.full_df = data\n", + "\n", + " valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n", + " n=10,\n", + " random_state=123,\n", + " )\n", 
+ " # The training items are simply the items *not* in the valid split\n", + " train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n", + "\n", + " self.split = {\"train\": train_df, \"valid\": valid_df}[\n", + " \"train\" if train is True else \"valid\"\n", + " ]\n", + "\n", + " def __len__(self) -> int:\n", + " return len(self.split)\n", + "\n", + " def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n", + " # get the row index (idx) from the dataframe and\n", + " # select relevant column features (provided as input_keys)\n", + " feats = tuple(self.split.iloc[idx][self.input_keys])\n", + "\n", + " # this gives a 'species' i.e. one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n", + " tgts = tuple(self.split.iloc[idx][self.target_keys])\n", + "\n", + " # Exercise #1: convert the feats (Series) to PyTorch Tensors\n", + " feats = tensor(feats, dtype=float32)\n", + "\n", + " # Exercise #2: convert target to a 'one-hot' vector.\n", + " target_names = sorted(self.full_df.species.unique())\n", + " tgts = eye(len(target_names))[target_names.index(tgts[0])]\n", + "\n", + " return feats, tgts" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Task 3: obtaining training and validation datasets\n", + "### Task 3: Obtaining features and targets from data\n", "\n", "- Instantiate the penguin dataloader.\n", - " - Make sure you supply the correct column titles for the features and the targets.\n", + " - Make sure you pass the correct column names for the input features and the targets.\n", "- Iterate over the dataset\n", " - Hint:\n", " ```python\n", @@ -146,38 +248,38 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - 
"['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n", - "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] ('Gentoo',)\n" + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 
'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n", + "['bill_length_mm', 'bill_depth_mm', 'body_mass_g', 'flipper_length_mm', 'sex'] tensor([0., 0., 1.])\n" ] } ], "source": [ - "from ml_workshop import PenguinDataset\n", + "# from ml_workshop import 
PenguinDataset\n", "\n", "features = [\n", " \"bill_length_mm\",\n", @@ -187,6 +289,7 @@ " \"sex\",\n", "]\n", "\n", + "# Collect the unique species names and sort them alphabetically\n", "target_names = sorted(data.species.unique())\n", "\n", "data_set = PenguinDataset(\n", @@ -212,21 +315,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Task 4: Applying transforms to the data\n", + "### Task 4: Transforming Input Data for Neural Networks \n", + "\n", + "The purpose of transforming the data before passing it to the model is to apply appropriate preprocessing to the input data. The preprocessing can include tasks such as normalization, reshaping, extrapolation etc.\n", "\n", - "A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data.\n", + "A common way of transforming inputs to neural networks is to apply transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects (preprocessing tasks) and applies them sequentially to the incoming data.\n", "\n", - "These transforms can be very useful for mapping between file paths and tensors of images, etc.\n", + "These transforms are very useful for converting file paths to tensors of images and for performing other necessary preprocessing tasks.\n", "\n", - "- Note: here we create a training and validation set.\n", - " - We allow the model to learn directly from the training set—i.e. we fit the function to these data.\n", - " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. 
Normally, people use the validation performance to determine when to stop the training process.\n", + "- Note: Here we create a training and validation set.\n", + " - We allow the model to learn from the training set, i.e. we fit the function to these data.\n", + " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process (early stopping).\n", "- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -298,8 +403,8 @@ " input_keys=features,\n", " target_keys=[\"species\"],\n", " train=True,\n", - " x_tfms=get_input_transforms(),\n", - " y_tfms=get_target_tfms(),\n", + " # x_tfms=get_input_transforms(),\n", + " # y_tfms=get_target_tfms(),\n", ")\n", "\n", "\n", @@ -307,8 +412,8 @@ " input_keys=features,\n", " target_keys=[\"species\"],\n", " train=False,\n", - " x_tfms=get_input_transforms(),\n", - " y_tfms=get_target_tfms(),\n", + " # x_tfms=get_input_transforms(),\n", + " # y_tfms=get_target_tfms(),\n", ")\n", "\n", "\n", @@ -323,19 +428,19 @@ "### Task 5: Creating ``DataLoaders``—and why\n", "\n", "- Once we have created a ``Dataset`` object, we wrap it in a ``DataLoader``.\n", - " - The ``DataLoader`` object allows us to put our inputs and targets in mini-batches, which makes for more efficient training.\n", + " - The ``DataLoader`` object allows us to put our inputs and targets in mini-batches, which makes for more efficient training.\n", " - Note: rather than supplying one input-target pair to the model at a time, we supply \"mini-batches\" of these data 
at once (typically a small power of 2, like 16 or 32).\n", " - The number of items we supply at once is called the batch size.\n", - " - The ``DataLoader`` can also randomly shuffle the data each epoch (when training).\n", - " - It allows us to load different mini-batches in parallel, which can be very useful for larger datasets and images that can't all fit in memory at once.\n", + " - The ``DataLoader`` can also randomly shuffle and re-batch the data each epoch (when training). It prevents the model from learning any order-specific patterns in the data.\n", + " - It also allows us to load the mini-batches in parallel, which can be very useful for larger datasets and images that can't all fit in memory at once.\n", "\n", "\n", - "Note: we are going to use batch normalisation layers in our network, which don't work if the batch size is one. This can happen on the last batch, if we don't choose a batch size that evenly divides the number of items in the data set. To avoid this, we can set the ``drop_last`` argument to ``True``. The last batch, which will be of size ``len(data_set) % batch_size`` gets dropped, and the data are reshuffled. This is only relevant during the training process - validation will use population statistics." + "Note: we are going to use batch normalisation in our network, which doesn't work if the batch size is one. This can happen on the last batch, if we don't choose a batch size that evenly divides the number of items in the data set. To avoid this, we can set the ``drop_last`` argument to ``True``. The last batch, which will be of size ``len(data_set) % batch_size`` gets dropped, and the data are reshuffled. This is only relevant during the training process - validation will use population statistics." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -380,13 +485,14 @@ "Here we will create our neural network in PyTorch, and have a general discussion on clean and messy ways of going about it.\n", "\n", "- First, we will create quite an ugly network to highlight how to make a neural network in PyTorch on a very basic level.\n", + "- The class will have an ``__init__`` method, which defines the layers and activation functions, and a ``forward`` method, which defines how the input flows through those layers and activations to produce an output.\n", "- We will then discuss a trick for making the print-out nicer.\n", "- Finally, we will discuss how the best approach would be to write a class where various parameters (e.g. number of layers, dropout probabilities, etc.) are passed as arguments." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -473,14 +579,14 @@ "source": [ "### Task 7: Selecting a loss function\n", "\n", - "- Binary cross-entropy is about the most common loss function for classification.\n", + "- Binary cross-entropy is one of the most common loss functions for classification.\n", " - Details on this loss function are available in the [PyTorch docs](https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html).\n", "- Let's instantiate it together." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -495,14 +601,18 @@ "source": [ "### Task 8: Selecting an optimiser\n", "\n", + "The optimiser updates the model's weights and biases on the basis of the gradient of the loss function w.r.t. each weight. The aim of the weight-update process is to move the weights towards values that minimise the loss, subject to some stopping criterion.\n", + "\n", + "Note: the gradients are computed during backpropagation. 
\n", + "\n", "While we talked about stochastic gradient descent in the slides, most people use the so-called [Adam optimiser](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html).\n", "\n", - "You can think of it as a more complex and improved implementation of SGD." + "You can think of it as a more complex and improved implementation of SGD.\n" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -520,17 +630,21 @@ "\n", "- Before we jump in and write these loops, we must first choose an activation function to apply to the model's outputs.\n", " - Here we are going to use the softmax activation function: see [the PyTorch docs](https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html).\n", - " - For those of you who've studied physics, you may be remininded of the partition function in thermodynamics.\n", + " - For those of you who've studied physics, you may be reminded of the partition function in thermodynamics.\n", " - This activation function is good for classifcation when the result is one of ``A or B or C``.\n", " - It's bad if you even want to assign two classification to one images—say a photo of a dog _and_ a cat.\n", " - It turns the raw outputs, or logits, into \"psuedo probabilities\", and we take our prediction to be the most probable class.\n", "\n", + "\n", + "- In the training loop, for each epoch, all the mini-batches are passed sequentially through the model to make predictions, compute the loss, and update the model parameters. This is repeated until we achieve satisfactory performance.\n", + "- The validation loop works similarly, but the purpose is to validate the model training's performance without any weight updates. Hence, we are not calculating any gradient during validation. This is ensured by the ``no_grad`` decorator, which disables gradient tracking in the function. 
Using ``no_grad`` is an optional practice.\n", + "- ``batch_level_accuracy`` is also calculated to track the model performance. We also don't need gradients in the ``batch_level_accuracy()`` function.\n", "- We will write the training loop together, then you can go ahead and write the (simpler) validation loop." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -668,6 +782,28 @@ " return (preds.argmax(dim=1) == targets.argmax(dim=1)).float().mean()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### How this works step by step:\n", + "1. Load the dataset\n", + "2. Select Features\n", + "3. Build a PyTorch Dataset\n", + "4. Transform the dataset\n", + "5. Build a DataLoader\n", + "6. Build a neural net\n", + "7. Define a loss function\n", + "8. Define an optimiser\n", + "9. Build Train/Val loops\n", + "10. Training the net\n", + "11. Plotting\n", + "\n", + "\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -687,62 +823,86 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Epoch 0 time: 0.146 seconds.\n", - "Epoch 1 time: 0.096 seconds.\n", - "Epoch 2 time: 0.093 seconds.\n", - "Epoch 3 time: 0.092 seconds.\n", - "Epoch 4 time: 0.093 seconds.\n", - "Epoch 5 time: 0.093 seconds.\n", - "Epoch 6 time: 0.092 seconds.\n", - "Epoch 7 time: 0.093 seconds.\n", - "Epoch 8 time: 0.093 seconds.\n", - "Epoch 9 time: 0.094 seconds.\n", - "Epoch 10 time: 0.097 seconds.\n", - "Epoch 11 time: 0.097 seconds.\n", - "Epoch 12 time: 0.094 seconds.\n", - "Epoch 13 time: 0.094 seconds.\n", - "Epoch 14 time: 0.093 seconds.\n", - "Epoch 15 time: 0.092 seconds.\n", - "Epoch 16 time: 0.096 seconds.\n", - "Epoch 17 time: 0.098 seconds.\n", - "Epoch 18 time: 0.096 seconds.\n", - "Epoch 19 time: 0.093 seconds.\n", + "Epoch 0: Time: 0.136 seconds \n", + " Train Loss: 0.4514, 
Val Loss: 0.3982, Train Accuracy: 0.6912000179290771, Val Accuracy: 0.7968999743461609\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Time: 0.097 seconds \n", + " Train Loss: 0.337, Val Loss: 0.2835, Train Accuracy: 0.8603000044822693, Val Accuracy: 0.9688000082969666\n", + "\n", + "Epoch 2: Time: 0.085 seconds \n", + " Train Loss: 0.2711, Val Loss: 0.2226, Train Accuracy: 0.9154000282287598, Val Accuracy: 0.9688000082969666\n", + "\n", + "Epoch 3: Time: 0.086 seconds \n", + " Train Loss: 0.2212, Val Loss: 0.184, Train Accuracy: 0.9521999955177307, Val Accuracy: 0.9688000082969666\n", + "\n", + "Epoch 4: Time: 0.089 seconds \n", + " Train Loss: 0.182, Val Loss: 0.1509, Train Accuracy: 0.9484999775886536, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 5: Time: 0.082 seconds \n", + " Train Loss: 0.1631, Val Loss: 0.1332, Train Accuracy: 0.9484999775886536, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 6: Time: 0.088 seconds \n", + " Train Loss: 0.1332, Val Loss: 0.109, Train Accuracy: 0.9815999865531921, Val Accuracy: 0.9843999743461609\n", "\n", + "Epoch 7: Time: 0.099 seconds \n", + " Train Loss: 0.1212, Val Loss: 0.0965, Train Accuracy: 0.9706000089645386, Val Accuracy: 0.9843999743461609\n", "\n", - " loss_train accuracy_train loss_valid accuracy_valid\n", - "0 0.759151 0.279412 0.711391 0.375000\n", - "1 0.531332 0.584559 0.468364 0.609375\n", - "2 0.393045 0.779412 0.365031 0.796875\n", - "3 0.304282 0.915441 0.293924 0.890625\n", - "4 0.261206 0.937500 0.252864 0.937500\n", - "5 0.221856 0.937500 0.210456 0.937500\n", - "6 0.190321 0.963235 0.171965 0.968750\n", - "7 0.151137 0.966912 0.161003 0.968750\n", - "8 0.134465 0.974265 0.139327 0.968750\n", - "9 0.127652 0.963235 0.123636 0.968750\n", - "10 0.133551 0.959559 0.103402 0.968750\n", - "11 0.116820 0.985294 0.095507 0.968750\n", - "12 0.114006 0.970588 0.084369 0.984375\n", - "13 0.114615 0.955882 0.079413 0.984375\n", - "14 0.076280 0.985294 
0.074852 0.984375\n", - "15 0.088352 0.981618 0.068665 0.984375\n", - "16 0.111566 0.955882 0.064788 0.984375\n", - "17 0.083331 0.966912 0.060488 0.984375\n", - "18 0.084988 0.977941 0.057387 0.984375\n", - "19 0.065010 0.985294 0.055872 0.984375\n" + "Epoch 8: Time: 0.082 seconds \n", + " Train Loss: 0.1104, Val Loss: 0.0896, Train Accuracy: 0.9706000089645386, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 9: Time: 0.084 seconds \n", + " Train Loss: 0.1056, Val Loss: 0.0759, Train Accuracy: 0.9743000268936157, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 10: Time: 0.085 seconds \n", + " Train Loss: 0.0869, Val Loss: 0.0746, Train Accuracy: 0.9743000268936157, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 11: Time: 0.090 seconds \n", + " Train Loss: 0.0782, Val Loss: 0.0701, Train Accuracy: 0.9815999865531921, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 12: Time: 0.095 seconds \n", + " Train Loss: 0.0937, Val Loss: 0.0619, Train Accuracy: 0.9668999910354614, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 13: Time: 0.088 seconds \n", + " Train Loss: 0.0661, Val Loss: 0.0637, Train Accuracy: 0.9890000224113464, Val Accuracy: 0.9688000082969666\n", + "\n", + "Epoch 14: Time: 0.084 seconds \n", + " Train Loss: 0.0778, Val Loss: 0.0607, Train Accuracy: 0.9706000089645386, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 15: Time: 0.087 seconds \n", + " Train Loss: 0.0698, Val Loss: 0.058, Train Accuracy: 0.9815999865531921, Val Accuracy: 0.9688000082969666\n", + "\n", + "Epoch 16: Time: 0.098 seconds \n", + " Train Loss: 0.0782, Val Loss: 0.0493, Train Accuracy: 0.9631999731063843, Val Accuracy: 1.0\n", + "\n", + "Epoch 17: Time: 0.090 seconds \n", + " Train Loss: 0.0564, Val Loss: 0.0503, Train Accuracy: 0.9779000282287598, Val Accuracy: 0.9843999743461609\n", + "\n", + "Epoch 18: Time: 0.090 seconds \n", + " Train Loss: 0.0529, Val Loss: 0.0479, Train Accuracy: 0.9743000268936157, Val Accuracy: 0.9843999743461609\n", + 
"\n", + "Epoch 19: Time: 0.095 seconds \n", + " Train Loss: 0.0483, Val Loss: 0.047, Train Accuracy: 0.9890000224113464, Val Accuracy: 0.9843999743461609\n", + "\n", + "\n", + "\n" ] } ], "source": [ "from time import perf_counter\n", - "\n", "from pandas import DataFrame\n", "\n", "epochs = 20\n", @@ -752,22 +912,238 @@ "for epoch in range(epochs):\n", " start_time = perf_counter()\n", "\n", - " train_metrics.append(train_one_epoch(model, train_loader, optimiser, loss_func))\n", + " train_loss = train_one_epoch(model, train_loader, optimiser, loss_func)\n", + " train_metrics.append(train_loss)\n", "\n", - " valid_metrics.append(validate_one_epoch(model, valid_loader, loss_func))\n", + " valid_loss = validate_one_epoch(model, valid_loader, loss_func)\n", + " valid_metrics.append(valid_loss)\n", "\n", " stop_time = perf_counter()\n", "\n", - " print(f\"Epoch {epoch} time: {stop_time - start_time:.3f} seconds.\")\n", - "\n", + " print(f\"Epoch {epoch}: Time: {stop_time - start_time:.3f} seconds \\n Train Loss: {round(train_loss['loss'],4)}, Val Loss: {round(valid_loss['loss'],4)}, Train Accuracy: {round(train_loss['accuracy'],4)}, Val Accuracy: {round(valid_loss['accuracy'],4)}\\n\")\n", + " \n", "print(\"\\n\")\n", "\n", "train_metrics = DataFrame(train_metrics)\n", "valid_metrics = DataFrame(valid_metrics)\n", - "\n", "metrics = train_metrics.join(valid_metrics, lsuffix=\"_train\", rsuffix=\"_valid\")\n", + "metrics.index = range(0,epochs)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " loss_train accuracy_train loss_valid accuracy_valid\n", + "0 0.451392 0.691176 0.398212 0.796875\n", + "1 0.337014 0.860294 0.283476 0.968750\n", + "2 0.271069 0.915441 0.222645 0.968750\n", + "3 0.221187 0.952206 0.183997 0.968750\n", + "4 0.181975 0.948529 0.150913 0.984375\n" + ] + } + ], + "source": [ + "print(metrics.head())" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "### Task 11: Visualise some results\n", "\n", - "print(metrics)" + "Let's do this part together—though feel free to make a start on your own if you have completed the previous exercises." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Loss plot\n", + "plt.figure(figsize=(6, 4))\n", + "plt.plot(metrics.index, metrics['loss_train'], label='Training Loss')\n", + "plt.plot(metrics.index, metrics['loss_valid'], label='Validation Loss')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss')\n", + "plt.title('Loss vs. Epochs')\n", + "plt.xticks(range(epochs)) \n", + "plt.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Accuracy plot\n", + "plt.figure(figsize=(6, 4))\n", + "plt.plot(metrics.index, metrics['accuracy_train'], label='Training Accuracy')\n", + "plt.plot(metrics.index, metrics['accuracy_valid'], label='Validation Accuracy')\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Accuracy')\n", + "plt.title('Accuracy vs. Epochs')\n", + "plt.xticks(range(epochs)) \n", + "plt.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus: Run the net on 'new' inputs\n", + "\n", + "We have built and trained a net, and evaluated and visualised its performance. However, how do we now utilise it going forward?\n", + "\n", + "Here we construct some 'new' input data and use our trained net to infer the species. Whilst this is relatively straightforward there is still some work required to transform the outputs from the net to a meaningful result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw input:\n", + "tensor([[4.2900e+01, 1.3100e+01, 5.0000e+03, 2.1500e+02, 0.0000e+00],\n", + " [3.3600e+01, 1.1300e+01, 2.0000e+03, 2.1100e+02, 1.0000e+00]])\n", + "\n", + "Raw output:\n", + "tensor([[1.1300e-03, 8.3503e-04, 9.9804e-01],\n", + " [7.4916e-01, 1.5521e-03, 2.4928e-01]])\n", + "\n", + "Predicted species:\n", + "['Gentoo', 'Adelie']\n", + "\n" + ] + } + ], + "source": [ + "from torch import no_grad\n", + "\n", + "# Construct a tensor of inputs to run the model over\n", + "demo_input = tensor(\n", + " [\n", + " [42.9, 13.1, 5000.0, 215.0, 0.0],\n", + " [33.6, 11.3, 2000.0, 211.0, 1.0],\n", + " ]\n", + ")\n", + "print(f\"Raw input:\\n{demo_input}\\n\")\n", + "\n", + "# Place model in eval mode and run over inputs with no_grad\n", + "model.eval()\n", + "with no_grad():\n", + " demo_output = model(demo_input).softmax(dim=1)\n", + "\n", + "# Print the raw output from the net\n", + "print(f\"Raw output:\\n{demo_output}\\n\")\n", + "\n", + "# Transform the raw output back to human-readable format\n", + "print(f\"Predicted species:\\n{[target_names[val.argmax()] for val in demo_output]}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0 time: 0.119 seconds Train Loss: 0.0571 Val Loss: 0.0467\n", + "Epoch 1 time: 0.093 seconds Train Loss: 0.0549 Val Loss: 0.0406\n", + "Epoch 2 time: 0.092 seconds Train Loss: 0.0544 Val Loss: 0.042\n", + "Epoch 3 time: 0.098 seconds Train Loss: 0.0632 Val Loss: 0.04\n", + "Epoch 4 time: 0.094 seconds Train Loss: 0.0536 Val Loss: 0.0374\n", + "Epoch 5 time: 0.084 seconds Train Loss: 0.0718 Val Loss: 0.0367\n", + "Epoch 6 time: 0.083 seconds Train Loss: 0.0422 Val Loss: 0.0394\n", + "Epoch 7 time: 0.090 seconds Train Loss: 0.0461 Val Loss: 0.0347\n", + 
"Epoch 8 time: 0.095 seconds Train Loss: 0.0415 Val Loss: 0.0341\n", + "Epoch 9 time: 0.092 seconds Train Loss: 0.0421 Val Loss: 0.0342\n", + "Epoch 10 time: 0.091 seconds Train Loss: 0.0294 Val Loss: 0.0359\n", + "Epoch 11 time: 0.095 seconds Train Loss: 0.0417 Val Loss: 0.0335\n", + "Epoch 12 time: 0.087 seconds Train Loss: 0.0599 Val Loss: 0.0332\n", + "Epoch 13 time: 0.085 seconds Train Loss: 0.0321 Val Loss: 0.0333\n", + "Epoch 14 time: 0.083 seconds Train Loss: 0.0489 Val Loss: 0.031\n", + "Epoch 15 time: 0.089 seconds Train Loss: 0.0362 Val Loss: 0.0284\n", + "Epoch 16 time: 0.088 seconds Train Loss: 0.0431 Val Loss: 0.0278\n", + "Epoch 17 time: 0.088 seconds Train Loss: 0.0291 Val Loss: 0.0304\n", + "Epoch 18 time: 0.094 seconds Train Loss: 0.0354 Val Loss: 0.0339\n", + "Epoch 19 time: 0.083 seconds Train Loss: 0.0452 Val Loss: 0.0286\n", + "\n", + "\n" + ] + } + ], + "source": [ + "from time import perf_counter\n", + "\n", + "from pandas import DataFrame\n", + "\n", + "epochs = 20\n", + "\n", + "train_metrics, valid_metrics = [], []\n", + "\n", + "for epoch in range(epochs):\n", + " start_time = perf_counter()\n", + "\n", + " train_loss = train_one_epoch(model, train_loader, optimiser, loss_func)\n", + " train_metrics.append(train_loss)\n", + "\n", + " valid_loss = validate_one_epoch(model, valid_loader, loss_func)\n", + " valid_metrics.append(valid_loss)\n", + "\n", + " stop_time = perf_counter()\n", + "\n", + " print(f\"Epoch {epoch} time: {stop_time - start_time:.3f} seconds Train Loss: {round(train_loss['loss'],4)} Val Loss: {round(valid_loss['loss'],4)}\")\n", + "\n", + "print(\"\\n\")\n", + "\n", + "train_metrics = DataFrame(train_metrics)\n", + "valid_metrics = DataFrame(valid_metrics)\n", + "metrics = train_metrics.join(valid_metrics, lsuffix=\"_train\", rsuffix=\"_valid\")\n" ] }, { @@ -781,12 +1157,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 58, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + 
"image/png": "", "text/plain": [ "
" ] @@ -822,7 +1198,8 @@ " axis.set_xlim(left=1, right=epochs)\n", " axis.set_xlabel(\"Epoch\", fontsize=15)\n", "\n", - "fig.tight_layout()" + "fig.tight_layout()\n", + "\n" ] }, { @@ -838,7 +1215,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 59, "metadata": {}, "outputs": [ { @@ -850,8 +1227,8 @@ " [3.3600e+01, 1.1300e+01, 2.0000e+03, 2.1100e+02, 1.0000e+00]])\n", "\n", "Raw output:\n", - "tensor([[0.0035, 0.0021, 0.9943],\n", - " [0.9547, 0.0013, 0.0440]])\n", + "tensor([[9.8420e-05, 6.0667e-05, 9.9984e-01],\n", + " [6.3560e-01, 5.4130e-04, 3.6386e-01]])\n", "\n", "Predicted species:\n", "['Gentoo', 'Adelie']\n", @@ -900,7 +1277,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.4" } }, "nbformat": 4,