Commit aa4778d

add github workflow and config to dvc pipeline
1 parent f3498a2 commit aa4778d

5 files changed (+70, -62 lines)

.github/workflows/run_pipeline.yaml
Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+name: Run pipeline
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - config/**
+      - src/**
+      - data/*
+      - .github/workflows/run_pipeline.yaml
+jobs:
+  run_pipeline:
+    name: Run pipeline
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the current repository
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install dependencies
+        run: uv sync
+
+      - name: Pull data from DVC
+        run: uv run dvc pull
+
+      - name: Run the pipeline with DVC
+        run: uv run dvc repro
+
+      - name: Push the outcomes to DVC remote storage
+        run: uv run dvc push
+
+      - name: Commit changes in dvc.lock
+        uses: stefanzweifel/git-auto-commit-action@v4
+        with:
+          commit_message: Commit changes in dvc.lock
+          branch: main
+          file_pattern: dvc.lock

config/process/process_1.yaml
Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ keep_columns:
   - family_size
 
 remove_outliers_threshold:
-  age: 84
+  age: 83
   Income: 600000
 
 family_size:
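
This change only lowers the age outlier cap from 84 to 83. src/process_data.py itself is not touched by the commit, so the following is just a hedged sketch of how a remove_outliers_threshold block like this is commonly consumed; the remove_outliers helper and its call are hypothetical, only the column names and caps come from the config:

# Hypothetical sketch: src/process_data.py is not shown in this diff.
import pandas as pd

def remove_outliers(df: pd.DataFrame, thresholds: dict) -> pd.DataFrame:
    # Keep rows at or below each configured cap, e.g. {"age": 83, "Income": 600000}
    for column, cap in thresholds.items():
        df = df[df[column] <= cap]
    return df

# Toy data: only the first row survives both caps
df = pd.DataFrame({"age": [25, 83, 90], "Income": [40000, 700000, 50000]})
print(remove_outliers(df, {"age": 83, "Income": 600000}))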

dvc.lock
Lines changed: 15 additions & 52 deletions

@@ -3,6 +3,11 @@ stages:
   process_data:
     cmd: python src/process_data.py
     deps:
+    - path: config
+      hash: md5
+      md5: a2380b625b5f5f10411a2f8dd4d7174f.dir
+      size: 1483
+      nfiles: 4
     - path: data/raw
       md5: 10c3f643286f509fa7f6b4675d9efbad.dir
       size: 222379
@@ -11,36 +16,6 @@ stages:
       hash: md5
       md5: 5920b8b9838a6fdb8afdda6c82e35986
       size: 2654
-    params:
-      config/process/process_1.yaml:
-        family_size:
-          Married: 2
-          Together: 2
-          Absurd: 1
-          Widow: 1
-          YOLO: 1
-          Divorced: 1
-          Single: 1
-          Alone: 1
-        keep_columns:
-        - Income
-        - Recency
-        - NumWebVisitsMonth
-        - AcceptedCmp3
-        - AcceptedCmp4
-        - AcceptedCmp5
-        - AcceptedCmp1
-        - AcceptedCmp2
-        - Complain
-        - Response
-        - age
-        - total_purchases
-        - enrollment_years
-        - family_size
-        name: process_1
-        remove_outliers_threshold:
-          age: 84
-          Income: 600000
     outs:
     - path: data/intermediate
       hash: md5
@@ -50,39 +25,27 @@ stages:
   train:
     cmd: python src/segment.py
     deps:
+    - path: config
+      hash: md5
+      md5: a2380b625b5f5f10411a2f8dd4d7174f.dir
+      size: 1483
+      nfiles: 4
     - path: data/intermediate
       hash: md5
      md5: 69c6a4e21a7e575450a4ce26f70f394f.dir
       size: 624234
       nfiles: 1
     - path: src/segment.py
       hash: md5
-      md5: b0f72dee173f4a36c4e9849fa3b0545c
-      size: 2245
-    params:
-      config/main.yaml:
-        defaults:
-        - process: process_1
-        - _self_
-        final:
-          dir: data/final
-          name: segmented.csv
-          path: ${final.dir}/${final.name}
-        intermediate:
-          dir: data/intermediate
-          name: scale_features.csv
-          path: ${intermediate.dir}/${intermediate.name}
-        model:
-          path: model/cluster.pkl
-        raw_data:
-          path: data/raw/marketing_campaign.csv
+      md5: 631a294f601a1275a7b9bf3f87d49709
+      size: 2953
     outs:
     - path: data/final
       hash: md5
-      md5: fcdc1dd0b9a2a1877736c356b9602f6a.dir
+      md5: 53d9c18e9c74863889ef30db81b8ae26.dir
       size: 610251
       nfiles: 1
     - path: model/cluster.pkl
       hash: md5
-      md5: 8fd544c7627269bc5cbee2243e6cee58
-      size: 9701
+      md5: e515bf89630b6a41b857b702130ddbe6
+      size: 9772

dvc.yaml
Lines changed: 2 additions & 4 deletions

@@ -1,20 +1,18 @@
 stages:
   process_data:
     cmd: python src/process_data.py
-    params:
-      - config/process/process_1.yaml:
     deps:
       - data/raw
       - src/process_data.py
+      - config
     outs:
       - data/intermediate
   train:
     cmd: python src/segment.py
-    params:
-      - config/main.yaml:
     deps:
       - data/intermediate
       - src/segment.py
+      - config
     outs:
       - data/final
       - model/cluster.pkl
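
Both stages now depend on the whole config directory instead of tracking individual params files, so any edit under config/ (such as the process_1.yaml change above) invalidates the stages and makes dvc repro rerun them. The DictConfig annotation in src/segment.py and the defaults list formerly recorded in dvc.lock suggest the scripts compose config/main.yaml with Hydra; a minimal sketch under that assumption, with the relative config path and the "process" key inferred rather than confirmed by this diff:

# Sketch only: assumes Hydra composition of config/main.yaml with a "process" group.
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf

def load_config() -> DictConfig:
    # config_path is relative to this file; "../config" is an assumption
    with initialize(version_base=None, config_path="../config"):
        return compose(config_name="main")

config = load_config()
print(OmegaConf.to_yaml(config))  # composed config, including the process_1 values
print(config.process.remove_outliers_threshold.age)  # 83 after this commit (assumed key path)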

src/segment.py
Lines changed: 7 additions & 5 deletions

@@ -22,18 +22,20 @@ def read_process_data(config: DictConfig):
 
 
 def get_pca_model(data: pd.DataFrame) -> PCA:
-    pca = PCA(n_components=3)
+    pca = PCA(n_components=4)
     pca.fit(data)
     return pca
 
 
 def reduce_dimension(df: pd.DataFrame, pca: PCA) -> pd.DataFrame:
-    return pd.DataFrame(pca.transform(df), columns=["col1", "col2", "col3"])
+    n_components = pca.n_components_
+    columns = [f"col{i+1}" for i in range(n_components)]
+    return pd.DataFrame(pca.transform(df), columns=columns)
 
 
-def get_3d_projection(pca_df: pd.DataFrame) -> dict:
-    """A 3D Projection Of Data In The Reduced Dimensionality Space"""
-    return {"x": pca_df["col1"], "y": pca_df["col2"], "z": pca_df["col3"]}
+def get_projection(pca_df: pd.DataFrame) -> dict:
+    """Get projection of data in the reduced dimensionality space"""
+    return {f"dim{i+1}": pca_df[col] for i, col in enumerate(pca_df.columns)}
 
 
 def get_best_k_cluster(pca_df: pd.DataFrame) -> pd.DataFrame:
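
Taken together, the src/segment.py changes generalize the dimensionality-reduction helpers: the PCA now keeps 4 components, reduce_dimension names its columns from the fitted component count instead of hard-coding three, and get_projection replaces the 3D-only get_3d_projection. A self-contained sketch of the updated helpers on synthetic data (the real pipeline feeds scaled features pulled from data/intermediate):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

def get_pca_model(data: pd.DataFrame) -> PCA:
    # n_components=4 matches the value set in this commit
    pca = PCA(n_components=4)
    pca.fit(data)
    return pca

def reduce_dimension(df: pd.DataFrame, pca: PCA) -> pd.DataFrame:
    # Column names follow the fitted component count instead of a hard-coded three
    columns = [f"col{i+1}" for i in range(pca.n_components_)]
    return pd.DataFrame(pca.transform(df), columns=columns)

def get_projection(pca_df: pd.DataFrame) -> dict:
    # One Series per reduced dimension, keyed dim1..dimN
    return {f"dim{i+1}": pca_df[col] for i, col in enumerate(pca_df.columns)}

# Illustrative usage on random data
data = pd.DataFrame(np.random.rand(100, 10), columns=[f"f{i}" for i in range(10)])
pca = get_pca_model(data)
reduced = reduce_dimension(data, pca)
projection = get_projection(reduced)
print(reduced.shape)     # (100, 4)
print(list(projection))  # ['dim1', 'dim2', 'dim3', 'dim4']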
