|
10 | 10 | from yellowbrick.cluster import KElbowVisualizer
|
11 | 11 | import hydra
|
12 | 12 | from pathlib import Path
|
| 13 | +import mlflow |
| 14 | +from sklearn.metrics import silhouette_score |
| 15 | +from mlflow.models import infer_signature |
13 | 16 |
|
14 | 17 | warnings.simplefilter(action="ignore", category=DeprecationWarning)
|
15 | 18 |
|
@@ -65,15 +68,30 @@ def save_data_and_model(data: pd.DataFrame, model: KMeans, config: DictConfig):
|
65 | 68 |
|
@hydra.main(config_path="../config", config_name="main", version_base="1.2")
def segment(config: DictConfig) -> None:
    """Run the segmentation pipeline and record the run in MLflow.

    Reads and processes the data, reduces its dimensionality, picks a
    cluster count, fits the clustering model, labels every row, saves the
    labelled data and the model locally, and then logs the parameters, the
    silhouette score, the model and the processed-data file to an MLflow run.

    Args:
        config: Hydra configuration. It is passed through to
            ``read_process_data`` and ``save_data_and_model``, and
            ``config.final.path`` is the file uploaded as the
            ``processed_data`` artifact.
    """
    # Data processing
    data = read_process_data(config)
    pca = get_pca_model(data)
    pca_df = reduce_dimension(data, pca)
    k_best = get_best_k_cluster(pca_df)
    model = get_clusters_model(pca_df, k_best)
    pred = predict(model, pca_df)
    data = insert_clusters_to_df(data, pred)
    silhouette_avg = silhouette_score(pca_df, pred)

    # Save data and model locally
    save_data_and_model(data, model, config)

    # Log the hyperparameters the fitted objects actually carry rather than
    # duplicating literals here, so the tracked run cannot drift from the
    # helpers that build the models. The fallbacks keep the previously
    # logged values if an attribute is missing.
    # NOTE(review): assumes `pca` is a fitted scikit-learn PCA exposing
    # `n_components_` - confirm against get_pca_model.
    run_params = {
        "n_components": getattr(pca, "n_components_", 3),
        "random_state": getattr(model, "random_state", 42),
        "best_k": k_best,
    }

    with mlflow.start_run():
        mlflow.log_params(run_params)
        mlflow.log_metric("silhouette_score", silhouette_avg)

        # The signature and input example describe the reduced frame,
        # because that is what the model was fitted and evaluated on.
        signature = infer_signature(pca_df, pred)
        mlflow.sklearn.log_model(
            model, "kmeans_model", signature=signature, input_example=pca_df.head()
        )

        # Runs after save_data_and_model on purpose: presumably that call is
        # what writes config.final.path - confirm.
        mlflow.log_artifact(config.final.path, "processed_data")
| 94 | + |
77 | 95 |
|
# Script entry point: `segment` is called without arguments because the
# `hydra.main` decorator supplies `config`.
if __name__ == "__main__":
    segment()
|
0 commit comments