Skip to content

Commit 871032d

Browse files
authored
Initial commit
0 parents  commit 871032d

15 files changed

+866
-0
lines changed

.flake8

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#########################
2+
# Flake8 Configuration #
3+
# (.flake8) #
4+
#########################
5+
[flake8]
6+
ignore =
7+
# pickle
8+
S301
9+
S403
10+
S404
11+
S603
12+
# Line break before binary operator (flake8 is wrong)
13+
W503
14+
# Ignore the spaces black puts before columns.
15+
E203
16+
# allow path extensions for testing.
17+
E402
18+
DAR101
19+
DAR201
20+
# flake and pylance disagree on linebreaks in strings.
21+
N400
22+
# asserts are ok in test.
23+
S101
24+
exclude =
25+
.tox,
26+
.git,
27+
__pycache__,
28+
docs/conf.py,
29+
build,
30+
dist,
31+
*.pyc,
32+
*.bib,
33+
*.egg-info,
34+
.cache,
35+
.eggs,
36+
data,
37+
src/jaxwt/__init__.py
38+
max-line-length = 120
39+
max-complexity = 20
40+
import-order-style = pycharm
41+
application-import-names =
42+
jaxwt
43+
tests

.github/workflows/test.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
name: Tests
2+
3+
on: [ push, pull_request ]
4+
5+
jobs:
6+
tests:
7+
name: Tests
8+
runs-on: ${{ matrix.os }}
9+
strategy:
10+
matrix:
11+
os: [ ubuntu-latest ]
12+
python-version: [3.11.0]
13+
steps:
14+
- uses: actions/checkout@v2
15+
- name: Set up Python ${{ matrix.python-version }}
16+
uses: actions/setup-python@v2
17+
with:
18+
python-version: ${{ matrix.python-version }}
19+
- name: Install dependencies
20+
run: pip install nox
21+
- name: Test with pytest
22+
run:
23+
nox -s test
24+
lint:
25+
name: Lint
26+
runs-on: ubuntu-latest
27+
strategy:
28+
matrix:
29+
python-version: [3.11.0]
30+
steps:
31+
- uses: actions/checkout@v2
32+
- name: Set up Python ${{ matrix.python-version }}
33+
uses: actions/setup-python@v2
34+
with:
35+
python-version: ${{ matrix.python-version }}
36+
- name: Install dependencies
37+
run: pip install nox
38+
- name: Run flake8
39+
run: nox -s lint
40+
- name: Run mypy
41+
run: nox -s typing

.gitignore

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
.vscode/
2+
.pytest_cache/
3+
4+
# Byte-compiled / optimized / DLL files
5+
__pycache__/
6+
*.py[cod]
7+
*$py.class
8+
9+
# C extensions
10+
*.so
11+
12+
# Distribution / packaging
13+
.Python
14+
build/
15+
develop-eggs/
16+
dist/
17+
downloads/
18+
eggs/
19+
.eggs/
20+
lib/
21+
lib64/
22+
parts/
23+
sdist/
24+
var/
25+
wheels/
26+
share/python-wheels/
27+
*.egg-info/
28+
.installed.cfg
29+
*.egg
30+
MANIFEST
31+
32+
# PyInstaller
33+
# Usually these files are written by a python script from a template
34+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
35+
*.manifest
36+
*.spec
37+
38+
# Installer logs
39+
pip-log.txt
40+
pip-delete-this-directory.txt
41+
42+
# Unit test / coverage reports
43+
htmlcov/
44+
.tox/
45+
.nox/
46+
.coverage
47+
.coverage.*
48+
.cache
49+
nosetests.xml
50+
coverage.xml
51+
*.cover
52+
*.py,cover
53+
.hypothesis/
54+
.pytest_cache/
55+
cover/
56+
57+
# Translations
58+
*.mo
59+
*.pot
60+
61+
# Django stuff:
62+
*.log
63+
local_settings.py
64+
db.sqlite3
65+
db.sqlite3-journal
66+
67+
# Flask stuff:
68+
instance/
69+
.webassets-cache
70+
71+
# Scrapy stuff:
72+
.scrapy
73+
74+
# Sphinx documentation
75+
docs/_build/
76+
77+
# PyBuilder
78+
.pybuilder/
79+
target/
80+
81+
# Jupyter Notebook
82+
.ipynb_checkpoints
83+
84+
# IPython
85+
profile_default/
86+
ipython_config.py
87+
88+
# pyenv
89+
# For a library or package, you might want to ignore these files since the code is
90+
# intended to run in multiple environments; otherwise, check them in:
91+
# .python-version
92+
93+
# pipenv
94+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
96+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
97+
# install all needed dependencies.
98+
#Pipfile.lock
99+
100+
# poetry
101+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102+
# This is especially recommended for binary packages to ensure reproducibility, and is more
103+
# commonly ignored for libraries.
104+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105+
#poetry.lock
106+
107+
# pdm
108+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109+
#pdm.lock
110+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111+
# in version control.
112+
# https://pdm.fming.dev/#use-with-ide
113+
.pdm.toml
114+
115+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116+
__pypackages__/
117+
118+
# Celery stuff
119+
celerybeat-schedule
120+
celerybeat.pid
121+
122+
# SageMath parsed files
123+
*.sage.py
124+
125+
# Environments
126+
.env
127+
.venv
128+
env/
129+
venv/
130+
ENV/
131+
env.bak/
132+
venv.bak/
133+
134+
# Spyder project settings
135+
.spyderproject
136+
.spyproject
137+
138+
# Rope project settings
139+
.ropeproject
140+
141+
# mkdocs documentation
142+
/site
143+
144+
# mypy
145+
.mypy_cache/
146+
.dmypy.json
147+
dmypy.json
148+
149+
# Pyre type checker
150+
.pyre/
151+
152+
# pytype static type analyzer
153+
.pytype/
154+
155+
# Cython debug symbols
156+
cython_debug/
157+
158+
# PyCharm
159+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161+
# and can be added to the global gitignore or merged into this file. For a more nuclear
162+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
163+
#.idea/

README.md

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Support Vector Machines Exercise
2+
3+
Today we will experiment with a very powerful algorithm, called *Support Vector Machine (SVM)*. SVMs are widely used for classification, but can also solve regression problems, as we will demonstrate in our optional task for time series prediction.
4+
### Task 1: Linear Classification with SVMs
5+
6+
In this exercise, your task is to implement a linear SVM classifier for the Iris dataset, which contains data for different iris species that we want to classify.
7+
8+
To simplify this task, we will again use the popular python library for machine learning called [Scikit-Learn](https://scikit-learn.org/stable/index.html). It implements all kinds of machine learning algorithms and tools including SVMs.
9+
10+
1. Navigate to the `__main__` function of `src/ex1_linear_svm.py` and load the iris dataset from `sklearn.datasets`.
11+
2. Get access to the data, the labels and the class names. In the lecture, you saw how an SVM can be used for the binary classification problem. Find out how `sklearn` implements multi-class classification.
12+
3. Split the dataset into training and test data using ``sklearn.model_selection.train_test_split``. The test set should contain 25% of the data. Use `random_state=29` in the split function to generate reproducible results.
13+
1. Implement a function `train_test_svc` to create a Linear SVM classifier from the ``sklearn.svm.LinearSVC`` class and fit it to the training data.
14+
2. In the same function, test the classifier on the test set and evaluate its performance by computing the accuracy. ``sklearn.metrics`` provides functions to evaluate this metric. Express the accuracy as a percentage and round to one decimal place. Your function `train_test_svc` should return the classifier and the accuracy value.
15+
4. Print the accuracy.
16+
17+
5. Plot the confusion matrix using `sklearn.metrics.ConfusionMatrixDisplay.from_estimator`. Use `display_labels=iris.target_names` for better visualization.
18+
19+
### Task 2: Classification with soft-margin SVMs
20+
21+
In the lecture, you have seen that linear hard-margin SVMs fail to classify the data if it is not linearly separable inside the input space. For this reason, we use different kernels that transform the data into a different space, where it is easier to perform the separation. Additionally, some points may be on the wrong side of the hyperplane due to noise. To handle this, we often use *soft-margin* SVMs. Different from the hard-margin SVM, soft-margin SVM allows for certain data points to be on "the wrong side" of the hyperplane based on their distance to said plane and introduces a hyperparameter $C$ to control the effect of the regularization.
22+
23+
We will now apply this algorithm for face recognition. We will use the [Labeled Faces in the Wild Dataset](http://vis-www.cs.umass.edu/lfw/).
24+
25+
1. Starting in the `__main__` function of `src/ex2_soft_margin_svm.py` load the dataset from ``sklearn.datasets.fetch_lfw_people``. This can take a while when running for the first time because it has to download the dataset. For this exercise, we only want classes with at least 70 images per person. To improve the runtime, you can also resize the images. Use a resize factor below 0.5.
26+
2. Gather information about the dataset: Print the number of samples, the number of image features (pixels) and the number of classes.
27+
28+
3. Use the provided function `plot_image_matrix` to plot the first 12 images with their corresponding labels as titles.
29+
30+
4. Split the data 80:20 into training and test data. Use `random_state=42` in the split function.
31+
5. Use the `StandardScaler` from `sklearn.preprocessing` on the train set and scale both the train and the test set.
32+
33+
A lot of machine learning approaches are configurable. This means that there are parameters that are not learned by the algorithm itself but rather chosen by the developer. These *hyperparameters* have to be chosen in a way to maximize performance. In this case, we have two new parameters we want to evaluate:
34+
* The regularization constant $C$
35+
* and the choice of the kernel function.
36+
37+
6. Now we need to find the best values for our hyperparameters. Implement the hyperparameter search in the function `cv_svm` following these steps:
38+
1. Define a dictionary of parameters that you want to cross-validate. (Hint: Reasonable values for $C$ range from 0.01 to 1000, while for kernels it is usually sufficient to test `linear`, `rbf` and `poly`.)
39+
2. Initialize your model using the `sklearn.svm.SVC` class. Use the ``sklearn.model_selection.GridSearchCV`` class to find optimal hyperparameters for this task.
40+
41+
7. Print the parameters of the best estimator found with the function `cv_svm`.
42+
43+
8. Calculate and print the accuracy of the best performing model.
44+
45+
9. Plot the output for some images from the test set using the function `plot_image_matrix`. Plot the predictions and the true labels of images as titles.
46+
47+
Another way to evaluate the performance of a model is the [ROC-Curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic). It is obtained by plotting the true positive rate (TPR) against the false positive rate (FPR) for different threshold settings.
48+
49+
10. (Optional) Calculate the ROC curve and the area under the curve (AUC) for each class of the test set. Appropriate functions can be found inside ``sklearn.metrics``. Plot the results.
50+
(Hint: You can obtain the scores of each prediction by calling the ``decision_function`` of your classifier.)
51+
52+
### Task 3 (Optional): Time-series prediction with SVM
53+
54+
You can also use SVMs for regression. In this exercise, we will take a brief look at time-series predictions. The goal is to infer new values from a set of old observations. For this we will look at the number of Covid-19 cases.
55+
56+
0. Open `src/ex3_time_series.py`, move to the `__main__` function and have a look at the code. Inspect the dataset closely and make sure you understand what information the columns depict.
57+
1. In the code we generate two arrays: `raw_data` and `raw_data_short`. Plot both curves with the `plot_curve` function. Do you notice any change in behavior in these curves? Is there a point where the rate of change increases? The data that lies before this point won't be considered anymore.
58+
59+
2. With the number of covid cases for the last week (7 days), we want to predict the expected number of cases for the next 5 days. Set the number of days you want to forecast and the number of days that will be taken into account for the forecast.
60+
61+
3. Build the dataset for training and testing:
62+
* For this, split the data in the following way:
63+
```python
64+
sequence = [10,14,15,19,20,25,26] # Number of cases
65+
X = [[10,14],
66+
[14,15],
67+
[15,19],
68+
[19,20],
69+
[20,25]]
70+
Y = [15, 19, 20, 25, 26]
71+
```
72+
In this example it means that we use the first 2 days (``[10,14]``) to predict the third day (``[15]``) and the second and third day to predict the fourth and so on. Instead of 2 days, we use 7 days for the prediction.
73+
4. SVMs are not scale invariant, so it is important to normalize the input data. Normalize the data to its maximum value, such that it lies between [0,1]. (Hint: `numpy.amax`). Note, if you normalize the train data, you need to normalize the test data as well!
74+
75+
Now we need to train an SVM regressor and find the best values for the hyperparameters. For this task, we will choose a Gaussian `rbf` kernel and evaluate the following parameters:
76+
* The regularization constant $C$,
77+
* $epsilon$ in the epsilon-SVR model. It specifies the epsilon-tube, within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value,
78+
* and the $gamma$ parameter of the `rbf` kernel.
79+
80+
5. Implement the hyperparameter search in the function `cv_svr` following these steps:
81+
1. Define a dictionary of parameters that you want to cross-validate. (Hint: Reasonable values for $epsilon$ range from 0.1 to 0.001 and for $gamma$ you can try the values `auto` and `scale`.)
82+
2. Initialize your model using the `sklearn.svm.SVR` class. Use the grid search to find optimal hyperparameters.
83+
84+
6. Print the parameters of the best estimator found with the function `cv_svr`.
85+
7. After that go to the ``recursive_forecast()`` function where the new predictions are recursively used to generate predictions even further in the future. Implement the TODOs.
86+
8. Use the function `recursive_forecast` to make predictions for the next 5 days. Don't forget to denormalize your predictions. Use `numpy.round` to round the predictions after denormalization.
87+
9. Plot the predicted results with `plot_prediction`.

noxfile.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""This module implements our CI function calls."""
2+
import nox
3+
4+
5+
@nox.session(name="test")
def run_test(session):
    """Install the requirements and pytest, then run the test suite."""
    requirement_args = ("-r", "requirements.txt")
    session.install(*requirement_args)
    session.install("pytest")
    session.run("pytest")
11+
12+
13+
@nox.session(name="lint")
def lint(session):
    """Run flake8 and its plugins on the sources, the tests and this file."""
    flake8_plugins = (
        "flake8-black",
        "flake8-docstrings",
        "flake8-bugbear",
        "flake8-broken-line",
        "pep8-naming",
        "pydocstyle",
        "darglint",
    )
    lint_targets = ("src", "tests", "noxfile.py")

    # flake8 itself is installed first, the plugins in a second pip call.
    session.install("flake8")
    session.install(*flake8_plugins)
    session.run("flake8", *lint_targets)
27+
28+
29+
@nox.session(name="typing")
def mypy(session):
    """Run mypy on the ``src`` folder."""
    mypy_flags = [
        "--install-types",
        "--non-interactive",
        "--ignore-missing-imports",
        "--no-strict-optional",
        "--no-warn-return-any",
        "--implicit-reexport",
        "--allow-untyped-calls",
    ]

    session.install("-r", "requirements.txt")
    session.install("mypy")
    # The flags precede the path to check, exactly as on the command line.
    session.run("mypy", *mypy_flags, "src")
45+
46+
47+
@nox.session(name="format")
def format(session):
    """Sort the imports with isort and reformat the code with black."""
    format_targets = ("src", "tests", "noxfile.py")

    for package in ("black", "isort"):
        session.install(package)

    # isort is run before black.
    for tool in ("isort", "black"):
        session.run(tool, *format_targets)

pytest.ini

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[pytest]
2+
markers =
3+
slow: this test is slow and should only run locally.
4+
pythonpath = .

0 commit comments

Comments
 (0)