Commit aa4778d

add github workflow and config to dvc pipeline
1 parent f3498a2 commit aa4778d

5 files changed (+70, -62 lines)

.github/workflows/run_pipeline.yaml
Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+name: Run pipeline
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - config/**
+      - src/**
+      - data/*
+      - .github/workflows/run_pipeline.yaml
+jobs:
+  run_pipeline:
+    name: Run pipeline
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the current repository
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install dependencies
+        run: uv sync
+
+      - name: Pull data from DVC
+        run: uv run dvc pull
+
+      - name: Run the pipeline with DVC
+        run: uv run dvc repro
+
+      - name: Push the outcomes to DVC remote storage
+        run: uv run dvc push
+
+      - name: Commit changes in dvc.lock
+        uses: stefanzweifel/git-auto-commit-action@v4
+        with:
+          commit_message: Commit changes in dvc.lock
+          branch: main
+          file_pattern: dvc.lock

config/process/process_1.yaml
Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ keep_columns:
   - family_size
 
 remove_outliers_threshold:
-  age: 84
+  age: 83
   Income: 600000
 
 family_size:
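
This change only lowers the age outlier cap from 84 to 83. src/process_data.py itself is not touched by the commit, so the following is just a hedged sketch of how a remove_outliers_threshold block like this is commonly consumed; the remove_outliers helper and its call are hypothetical, only the column names and caps come from the config:

# Hypothetical sketch: src/process_data.py is not shown in this diff.
import pandas as pd

def remove_outliers(df: pd.DataFrame, thresholds: dict) -> pd.DataFrame:
    # Keep rows at or below each configured cap, e.g. {"age": 83, "Income": 600000}
    for column, cap in thresholds.items():
        df = df[df[column] <= cap]
    return df

# Toy data: only the first row survives both caps
df = pd.DataFrame({"age": [25, 83, 90], "Income": [40000, 700000, 50000]})
print(remove_outliers(df, {"age": 83, "Income": 600000}))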

dvc.lock
Lines changed: 15 additions & 52 deletions

@@ -3,6 +3,11 @@ stages:
   process_data:
     cmd: python src/process_data.py
     deps:
+    - path: config
+      hash: md5
+      md5: a2380b625b5f5f10411a2f8dd4d7174f.dir
+      size: 1483
+      nfiles: 4
     - path: data/raw
       md5: 10c3f643286f509fa7f6b4675d9efbad.dir
       size: 222379
@@ -11,36 +16,6 @@ stages:
       hash: md5
       md5: 5920b8b9838a6fdb8afdda6c82e35986
       size: 2654
-    params:
-      config/process/process_1.yaml:
-        family_size:
-          Married: 2
-          Together: 2
-          Absurd: 1
-          Widow: 1
-          YOLO: 1
-          Divorced: 1
-          Single: 1
-          Alone: 1
-        keep_columns:
-        - Income
-        - Recency
-        - NumWebVisitsMonth
-        - AcceptedCmp3
-        - AcceptedCmp4
-        - AcceptedCmp5
-        - AcceptedCmp1
-        - AcceptedCmp2
-        - Complain
-        - Response
-        - age
-        - total_purchases
-        - enrollment_years
-        - family_size
-        name: process_1
-        remove_outliers_threshold:
-          age: 84
-          Income: 600000
     outs:
     - path: data/intermediate
       hash: md5
@@ -50,39 +25,27 @@ stages:
   train:
     cmd: python src/segment.py
     deps:
+    - path: config
+      hash: md5
+      md5: a2380b625b5f5f10411a2f8dd4d7174f.dir
+      size: 1483
+      nfiles: 4
     - path: data/intermediate
       hash: md5
      md5: 69c6a4e21a7e575450a4ce26f70f394f.dir
       size: 624234
       nfiles: 1
     - path: src/segment.py
       hash: md5
-      md5: b0f72dee173f4a36c4e9849fa3b0545c
-      size: 2245
-    params:
-      config/main.yaml:
-        defaults:
-        - process: process_1
-        - _self_
-        final:
-          dir: data/final
-          name: segmented.csv
-          path: ${final.dir}/${final.name}
-        intermediate:
-          dir: data/intermediate
-          name: scale_features.csv
-          path: ${intermediate.dir}/${intermediate.name}
-        model:
-          path: model/cluster.pkl
-        raw_data:
-          path: data/raw/marketing_campaign.csv
+      md5: 631a294f601a1275a7b9bf3f87d49709
+      size: 2953
     outs:
     - path: data/final
       hash: md5
-      md5: fcdc1dd0b9a2a1877736c356b9602f6a.dir
+      md5: 53d9c18e9c74863889ef30db81b8ae26.dir
       size: 610251
       nfiles: 1
     - path: model/cluster.pkl
       hash: md5
-      md5: 8fd544c7627269bc5cbee2243e6cee58
-      size: 9701
+      md5: e515bf89630b6a41b857b702130ddbe6
+      size: 9772

dvc.yaml
Lines changed: 2 additions & 4 deletions

@@ -1,20 +1,18 @@
 stages:
   process_data:
     cmd: python src/process_data.py
-    params:
-      - config/process/process_1.yaml:
     deps:
       - data/raw
       - src/process_data.py
+      - config
     outs:
       - data/intermediate
   train:
     cmd: python src/segment.py
-    params:
-      - config/main.yaml:
     deps:
       - data/intermediate
       - src/segment.py
+      - config
     outs:
       - data/final
       - model/cluster.pkl
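
Both stages now depend on the whole config directory instead of tracking individual params files, so any edit under config/ (such as the process_1.yaml change above) invalidates the stages and makes dvc repro rerun them. The DictConfig annotation in src/segment.py and the defaults list formerly recorded in dvc.lock suggest the scripts compose config/main.yaml with Hydra; a minimal sketch under that assumption, with the relative config path and the "process" key inferred rather than confirmed by this diff:

# Sketch only: assumes Hydra composition of config/main.yaml with a "process" group.
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf

def load_config() -> DictConfig:
    # config_path is relative to this file; "../config" is an assumption
    with initialize(version_base=None, config_path="../config"):
        return compose(config_name="main")

config = load_config()
print(OmegaConf.to_yaml(config))  # composed config, including the process_1 values
print(config.process.remove_outliers_threshold.age)  # 83 after this commit (assumed key path)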

src/segment.py
Lines changed: 7 additions & 5 deletions

@@ -22,18 +22,20 @@ def read_process_data(config: DictConfig):
 
 
 def get_pca_model(data: pd.DataFrame) -> PCA:
-    pca = PCA(n_components=3)
+    pca = PCA(n_components=4)
     pca.fit(data)
     return pca
 
 
 def reduce_dimension(df: pd.DataFrame, pca: PCA) -> pd.DataFrame:
-    return pd.DataFrame(pca.transform(df), columns=["col1", "col2", "col3"])
+    n_components = pca.n_components_
+    columns = [f"col{i+1}" for i in range(n_components)]
+    return pd.DataFrame(pca.transform(df), columns=columns)
 
 
-def get_3d_projection(pca_df: pd.DataFrame) -> dict:
-    """A 3D Projection Of Data In The Reduced Dimensionality Space"""
-    return {"x": pca_df["col1"], "y": pca_df["col2"], "z": pca_df["col3"]}
+def get_projection(pca_df: pd.DataFrame) -> dict:
+    """Get projection of data in the reduced dimensionality space"""
+    return {f"dim{i+1}": pca_df[col] for i, col in enumerate(pca_df.columns)}
 
 
 def get_best_k_cluster(pca_df: pd.DataFrame) -> pd.DataFrame:
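
Taken together, the src/segment.py changes generalize the dimensionality-reduction helpers: the PCA now keeps 4 components, reduce_dimension names its columns from the fitted component count instead of hard-coding three, and get_projection replaces the 3D-only get_3d_projection. A self-contained sketch of the updated helpers on synthetic data (the real pipeline feeds scaled features pulled from data/intermediate):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

def get_pca_model(data: pd.DataFrame) -> PCA:
    # n_components=4 matches the value set in this commit
    pca = PCA(n_components=4)
    pca.fit(data)
    return pca

def reduce_dimension(df: pd.DataFrame, pca: PCA) -> pd.DataFrame:
    # Column names follow the fitted component count instead of a hard-coded three
    columns = [f"col{i+1}" for i in range(pca.n_components_)]
    return pd.DataFrame(pca.transform(df), columns=columns)

def get_projection(pca_df: pd.DataFrame) -> dict:
    # One Series per reduced dimension, keyed dim1..dimN
    return {f"dim{i+1}": pca_df[col] for i, col in enumerate(pca_df.columns)}

# Illustrative usage on random data
data = pd.DataFrame(np.random.rand(100, 10), columns=[f"f{i}" for i in range(10)])
pca = get_pca_model(data)
reduced = reduce_dimension(data, pca)
projection = get_projection(reduced)
print(reduced.shape)     # (100, 4)
print(list(projection))  # ['dim1', 'dim2', 'dim3', 'dim4']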
