This repository was archived by the owner on Jun 20, 2023. It is now read-only.

Commit 761efb5

Merge pull request #6 from Code-Plus-CUMI/CSFelix-patch-1
🎉 Added
2 parents 6140fa7 + 8d5e203 commit 761efb5

File tree

3 files changed: +192 -0 lines changed

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
"""

*******************************
** Clustering - Introduction **
*******************************

Clustering is a Machine Learning technique that groups the data
into clusters following patterns the algorithm has learned.

These patterns are not explicit, which means that once a model
has clustered the data, it is the Data Scientist's task to figure
out what the patterns are and why they were chosen.

There are many clustering algorithms, but the two main ones are
K-Means and Hierarchical:

- K-Means Clustering: partitions the data around K centroids,
  assigning each point to its nearest centroid;

- Hierarchical Clustering: builds the clusters by successively
  merging (or splitting) groups, so there is a hierarchy /
  importance relationship between the groups.
"""
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
"""

************************
** K-Means - Clusters **
************************

K-Means defines CENTROIDS, and its goal is to find the best
position for each centroid and its territory (TESSELLATION).

-*-*-*-*-

When creating this algorithm, you have to pay attention to three
parameters:

/ n_clusters: number of Clusters (K)

/ max_iter: maximum number of iterations of each run

/ n_init: number of runs with different initial centroids; the run
  whose centroids give the least total distance between each point
  and its centroid (the optimal clustering) is kept.

-*-*-*-*-

Besides, since K-Means clustering is sensitive to scale, it can
be a good idea to RESCALE or NORMALIZE data with extreme values.
Our features are already roughly on the same scale, so we'll
leave them as-is.
"""


# ---- Importing Libraries and Preparing DataSet ----
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

df = pd.read_csv('filepath')
X = df.loc[:, ['Latitude', 'Longitude', 'MedInc']]


# ---- WCSS and Elbow Method ----
#
# They help to figure out how many clusters work best
# for the process
#
wcss = []

# testing out with 1 to 10 clusters
for i in range(1, 11):

    # n_clusters >> number of clusters to be identified
    # max_iter >> maximum number of iterations for each run (n_init)
    # n_init >> number of runs (centroid initializations)
    kmeans = KMeans(n_clusters=i,
                    init='k-means++',
                    max_iter=300,
                    n_init=10,
                    random_state=0)

    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# plotting the results (we often choose the number
# of clusters where the WCSS starts to level off -
# the elbow method)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
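
# An alternative sketch (not in the original file) for picking K: the
# silhouette score from sklearn.metrics, where higher values mean
# better-separated clusters; it reuses the X DataFrame and the KMeans
# import defined above.
from sklearn.metrics import silhouette_score

sil_scores = {}
for k in range(2, 11):  # silhouette needs at least 2 clusters
    labels = KMeans(n_clusters=k, init='k-means++', max_iter=300,
                    n_init=10, random_state=0).fit_predict(X)
    sil_scores[k] = silhouette_score(X, labels)

print(max(sil_scores, key=sil_scores.get))  # K with the highest score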



# ---- Using K-Means ----
#
# Consider that 4 clusters was the best amount and that we will
# repeat the K-Means run (centroid movement) 10 times (n_init)
kmeans = KMeans(n_clusters=4,
                init='k-means++',
                max_iter=300,
                n_init=10,
                random_state=0)

X['Cluster'] = kmeans.fit_predict(X)
X['Cluster'] = X['Cluster'].astype('category')
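
# A quick sketch (not in the original file): inspecting the learned centroid
# coordinates can help figure out what pattern each cluster captures; this
# assumes the kmeans object fitted above on Latitude, Longitude and MedInc.
centers = pd.DataFrame(kmeans.cluster_centers_,
                       columns=['Latitude', 'Longitude', 'MedInc'])
print(centers)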



# ---- Plotting the Results ----
sns.relplot(
    x="Longitude", y="Latitude", hue="Cluster", data=X, height=6,
);



# OBS.: Comparing the Target - box-plots show the distribution of
# the target within each cluster. If the clustering is informative,
# these distributions should, for the most part, separate across
# MedHouseVal (Target), which is indeed what we see.
X["MedHouseVal"] = df["MedHouseVal"]
sns.catplot(x="MedHouseVal", y="Cluster", data=X, kind="boxen", height=6);
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
"""

*****************************
** Hierarchical Clustering **
*****************************

Hierarchical clustering groups the data, and each cluster has a
position in a hierarchy that relates it to the other clusters.

-*-*-*-*-

When creating this algorithm, you have to pay attention to three
parameters:

/ n_clusters: number of Clusters (K)

/ linkage: how the distance between clusters is computed in the
  hierarchy ('ward', 'complete', 'average', 'single')

/ affinity: distance metric used by the algorithm (Euclidean is
  the most common).

-*-*-*-*-

Besides, since Hierarchical clustering is sensitive to scale, it can
be a good idea to RESCALE or NORMALIZE data with extreme values.
Our features are already roughly on the same scale, so we'll
leave them as-is.
"""

# ---- Importing Libraries and Preparing DataSet ----
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
import pandas as pd
import matplotlib.pyplot as plt

# same features used in the K-Means script (path kept as a placeholder)
df = pd.read_csv('filepath')
df = df.loc[:, ['Latitude', 'Longitude', 'MedInc']]



# ---- Dendrograms and Linkage Visualization ----
#
# With this visualization it is possible to know how many
# clusters (n_clusters parameter) will best fit the algorithm
#
linkage_data = linkage(df, method='ward', metric='euclidean')
dendrogram(linkage_data)
plt.show()
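
# A short sketch (not in the original file) of turning the dendrogram into
# flat cluster labels with SciPy's fcluster, once a number of clusters has
# been chosen from the plot; it reuses the linkage_data computed above.
from scipy.cluster.hierarchy import fcluster

# cut the tree so that at most 2 clusters remain
scipy_labels = fcluster(linkage_data, t=2, criterion='maxclust')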

# ---- Applying Hierarchical ----
hierarchical_cluster = AgglomerativeClustering(
    n_clusters=2
    , affinity='euclidean'  # called `metric` in newer scikit-learn versions
    , linkage='ward'
)

labels = hierarchical_cluster.fit_predict(df)



# ---- Plotting the Result ----
# colouring the spatial features by cluster label
plt.scatter(df['Longitude'], df['Latitude'], c=labels)
plt.show()
