Adding class abstractions and their corresponding tests #66

Open · wants to merge 5 commits into master
Changes from all commits
149 changes: 118 additions & 31 deletions src/github_analysis/cluster.py
@@ -1,45 +1,132 @@
 import pandas as pd
-from sklearn.cluster import KMeans
-import pickle
 import logging
+import pickle
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.cluster import DBSCAN, MiniBatchKMeans

-logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", filename="log.log", level=logging.INFO)
-
-def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
-                           output_file='./results/clusters.pickle'):
-    """ Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
-    Parameters
-    ----------
-    embedding_input_file: file where every row is a project and every col a feature.
-    k_for_clustering: how many groups to cluster into.
-    random_state: random state for clustering algo.
-    output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
-
-    Returns
-    -------
-    a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
-    """
-    embeddings = pd.read_csv(embedding_input_file, index_col=0)
-
-    # Run k-means algo TODO: spend more time on this algo: tune hyperparams, consider algo that better handles high dim, etc.
-    kmeans = KMeans(n_clusters=k_for_clustering, random_state=random_state).fit(embeddings.values)
-
-    # Make dict where key is cluster # and value are projects in that clusters
-    clusters = {}
-    for n, label in enumerate(kmeans.labels_):
-        if label in clusters:
-            clusters[label].append(embeddings.index[n])
-        else:
-            clusters[label] = [embeddings.index[n]]
-
-    if output_file is not None:
-        with open(output_file, 'wb') as output:
-            pickle.dump(clusters, output)
-        logging.info('Cluster file outputted!')
-
-    return clusters
-
-if __name__ == '__main__':
-    get_embedding_clusters()
-    logging.info('Cluster file outputted.')
+
+class Cluster():
+    def __init__(self):
+        """ Initializes the Cluster class with empty data, algorithm and fit-state attributes.
+
+        Returns
+        -------
+        None
+        """
+        self.raw_data = None
+        self.data = None
+        self.algorithm = None
+        self.transformed_data = None
+        self.fitted = None
+
+    def open_embeddings(self, input_file):
+        """ Reads an embeddings CSV where every row is a project and every column a feature. """
+        self.raw_data = pd.read_csv(input_file, index_col=0)
+
+    def scale_data(self, min_max=True):
+        """ Scales the data in all columns to the same scale.
+
+        Parameters
+        ----------
+        min_max: bool
+            If True uses the MinMaxScaler, if False uses the StandardScaler
+
+        Returns
+        -------
+        None
+        """
+        data = self.raw_data
+
+        if min_max:
+            scaled_data = MinMaxScaler().fit_transform(data)
+        else:
+            scaled_data = StandardScaler().fit_transform(data)
+
+        self.data = scaled_data
+
+    def set_algorithm(self, name, **kwargs):
+        """ Sets the clustering algorithm to use.
+
+        Parameters
+        ----------
+        name: str
+            Name of the algorithm to use: 'k_means' or 'dbscan'
+        **kwargs
+            Named arguments specific to the algorithm to use
+
+        Returns
+        -------
+        None
+        """
+        name = name.lower()
+        if name == 'k_means':
+            self.algorithm = MiniBatchKMeans(**kwargs)
+        elif name == 'dbscan':
+            self.algorithm = DBSCAN(**kwargs)
+
+    def fit_algorithm(self):
+        """ Scales the raw data and fits the chosen algorithm to it.
+
+        Returns
+        -------
+        None
+        """
+        self.scale_data()
+        self.algorithm.fit(self.data)
+        self.fitted = True
+
+    def get_labels(self):
+        """ Gets the cluster labels.
+
+        Returns
+        -------
+        ndarray
+            Array of cluster labels
+        """
+        self.labels = self.algorithm.labels_
+        return self.labels
+
+    def get_inertia(self):
+        """ Gets the inertia of the clusters.
+
+        Returns
+        -------
+        float
+            The inertia, if the fitted algorithm exposes an inertia_ attribute
+        """
+        try:
+            self.inertia = self.algorithm.inertia_
+            return self.inertia
+        except AttributeError:
+            print('No inertia attribute in this algorithm')
+
+    def save_file(self, output_file):
+        """ Pickles a dict mapping each cluster label to the project ids in that cluster. """
+        # Use the index of the raw embeddings so project ids are preserved.
+        ids = self.raw_data.index
+        clusters = {}
+        for n, label in enumerate(self.get_labels()):
+            if label in clusters:
+                clusters[label].append(ids[n])
+            else:
+                clusters[label] = [ids[n]]
+
+        with open(output_file, 'wb') as output:
+            pickle.dump(clusters, output)
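A minimal usage sketch of the new Cluster class (not part of the diff): the embeddings path and the hyperparameters are illustrative assumptions, and the import path assumes the package layout under src/github_analysis.

from github_analysis.cluster import Cluster

clusterer = Cluster()
clusterer.open_embeddings('./results/embeddings.csv')      # rows are projects, columns are features (illustrative path)
clusterer.set_algorithm('k_means', n_clusters=10, random_state=1)
clusterer.fit_algorithm()                                   # min-max scales, then fits MiniBatchKMeans
labels = clusterer.get_labels()
print(clusterer.get_inertia())                              # only meaningful for the k_means case
clusterer.save_file('./results/clusters.pickle')            # pickled {cluster label: [project ids]}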
5 changes: 5 additions & 0 deletions src/github_analysis/data_pull.py
@@ -0,0 +1,5 @@
import pandas as pd

# Pull the seed commit data from the project's Google Cloud Storage bucket.
csv_commits = pd.read_csv('https://storage.cloud.google.com/rstudio_bucket/2019_seed_commits.csv?_ga=2.112003524.-1920784121.1551992733')

# Cache it locally as a feather file for faster reloads (requires pyarrow or feather-format).
csv_commits.to_feather('../artifacts/commits.feather')
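If the ../artifacts folder is not guaranteed to exist, a small hedged variant of the same script creates it first; the directory and file names simply follow the script above.

import pathlib

import pandas as pd

out_dir = pathlib.Path('../artifacts')
out_dir.mkdir(parents=True, exist_ok=True)   # make sure the output folder exists before writing

csv_commits = pd.read_csv('https://storage.cloud.google.com/rstudio_bucket/2019_seed_commits.csv?_ga=2.112003524.-1920784121.1551992733')
csv_commits.to_feather(str(out_dir / 'commits.feather'))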
107 changes: 107 additions & 0 deletions src/github_analysis/dim_reduce.py
@@ -0,0 +1,107 @@
from sklearn.decomposition import PCA
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


class ReduceDim():
    def __init__(self, n_dimensions):
        """ Initializes the ReduceDim class.

        Parameters
        ----------
        n_dimensions: int
            Number of dimensions we want to reduce to

        Returns
        -------
        None
        """
        self.dimensions = n_dimensions
        self.raw_data = None
        self.data = None
        self.algorithm = None
        self.transformed_data = None

    def open_embeddings(self, input_file):
        """ Reads an embeddings CSV where every row is a project and every column a feature. """
        self.raw_data = pd.read_csv(input_file, index_col=0)

    def scale_data(self, min_max=True):
        """ Scales the data in all columns to the same scale.

        Parameters
        ----------
        min_max: bool
            If True uses the MinMaxScaler, if False uses the StandardScaler

        Returns
        -------
        None
        """
        data = self.raw_data

        if min_max:
            scaled_data = MinMaxScaler().fit_transform(data)
        else:
            scaled_data = StandardScaler().fit_transform(data)

        self.data = scaled_data

    def set_algorithm(self, name, **kwargs):
        """ Sets the dimensionality reduction algorithm to use.

        Parameters
        ----------
        name: str
            Name of the algorithm to use: 'pca', 't_sne', 'isomap',
            'locally_linear', 'mds' or 'spectral'
        **kwargs
            Named arguments specific to the algorithm to use

        Returns
        -------
        None
        """
        name = name.lower()

        if name == 'pca':
            self.algorithm = PCA(n_components=self.dimensions, **kwargs)
        elif name == 't_sne':
            self.algorithm = TSNE(n_components=self.dimensions, **kwargs)
        elif name == 'isomap':
            self.algorithm = Isomap(n_components=self.dimensions, **kwargs)
        elif name == 'locally_linear':
            self.algorithm = LocallyLinearEmbedding(n_components=self.dimensions, **kwargs)
        elif name == 'mds':
            self.algorithm = MDS(n_components=self.dimensions, **kwargs)
        elif name == 'spectral':
            self.algorithm = SpectralEmbedding(n_components=self.dimensions, **kwargs)

    def fit_transform(self):
        """ Scales the raw data, then fits the algorithm and transforms the data.

        Returns
        -------
        pd.DataFrame
            Dimensionality-reduced data, indexed like the raw data
        """
        self.scale_data()

        self.transformed_data = self.algorithm.fit_transform(self.data)
        # Note: the column names assume n_dimensions == 2.
        self.transformed_data = pd.DataFrame(self.transformed_data, columns=['x', 'y'])
        self.transformed_data.index = self.raw_data.index
        return self.transformed_data

    def plot_tsne(self, file_name):
        """ Saves a scatter plot of the two reduced dimensions. """
        fig, ax = plt.subplots()
        ax.scatter(self.transformed_data.x, self.transformed_data.y)
        ax.set_title('Embedding Clusters (t-SNE Transformed)')
        plt.savefig(file_name)

    def save_reduced_data(self, output_file):
        self.transformed_data.to_csv(output_file)
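A short usage sketch for ReduceDim (not part of the diff): the file paths, the t-SNE perplexity and the import path are illustrative assumptions.

from github_analysis.dim_reduce import ReduceDim

reducer = ReduceDim(n_dimensions=2)
reducer.open_embeddings('./results/embeddings.csv')
reducer.set_algorithm('t_sne', perplexity=30)         # any of the names handled in set_algorithm
coords = reducer.fit_transform()                      # DataFrame with columns x and y, indexed by project id
reducer.plot_tsne('./results/embedding_clusters.png')
reducer.save_reduced_data('./results/reduced_embeddings.csv')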