Commit 82005ef

update function documentation

1 parent 66b259a commit 82005ef

File tree

7 files changed: +184 -146 lines changed

src/github_analysis/cluster.py

Lines changed: 12 additions & 8 deletions
@@ -7,14 +7,18 @@
 
 def get_embedding_clusters(embedding_input_file='./results/embeddings.csv', k_for_clustering=10, random_state=None,
                            output_file='./results/clusters.pickle'):
-    """
-    Given a file with embeddings (or other features) cluster similar rows together using kmeans.
-
-    :param embedding_input_file: file where every row is a project and every col a feature
-    :param k_for_clustering: how many groups to cluster
-    :param random_state: random state for clustering algo
-    :param output_file: string with the filename to output the results to as a pickle. If this param is set to None no file will be outputted.
-    :return: a dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
+    """ Given a file with embeddings (or other features), cluster similar rows together using kmeans.
+
+    Parameters
+    ----------
+    embedding_input_file: file where every row is a project and every column a feature.
+    k_for_clustering: how many groups to cluster into.
+    random_state: random state for the clustering algorithm.
+    output_file: string with the filename to write the results to as a pickle. If this param is set to None, no file is written.
+
+    Returns
+    -------
+    A dictionary where the keys are the cluster labels and the values are lists of GitHub projectIds that fall in that cluster.
     """
     embeddings = pd.read_csv(embedding_input_file, index_col=0)
 
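For orientation, a minimal sketch of the behaviour the new docstring describes, assuming scikit-learn's KMeans; the function body itself is not shown in this diff, so details may differ:

    import collections
    import pickle

    import pandas as pd
    from sklearn.cluster import KMeans

    # Rows are projects, columns are embedding features, as the docstring states.
    embeddings = pd.read_csv('./results/embeddings.csv', index_col=0)
    kmeans = KMeans(n_clusters=10, random_state=None).fit(embeddings)

    # Build {cluster_label: [projectId, ...]}, matching the documented return value.
    clusters = collections.defaultdict(list)
    for project_id, label in zip(embeddings.index, kmeans.labels_):
        clusters[label].append(project_id)

    with open('./results/clusters.pickle', 'wb') as f:
        pickle.dump(dict(clusters), f)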

src/github_analysis/data_layer.py

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,9 @@ def getUniqueProjectNamesFromDf(df):
     return df.project_name.unique()
 
 class data_layer:
+    """ Read the feather file of commit history data and group commits per project ID.
+    Provides the option of filtering out projects below a minimum number of commits.
+    """
    def __init__(self, data_path, min_number_commits=None):
        self.data_path = data_path
        self.commits_df = pd.read_feather(data_path)
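An illustrative pandas-only sketch of what the class docstring promises; the class's own methods are not part of this diff, so the path and variable names here are hypothetical:

    import pandas as pd

    commits_df = pd.read_feather('./data/commits.feather')  # hypothetical path

    # Optionally drop projects below a minimum commit count (min_number_commits=10, say).
    counts = commits_df.groupby('project_name').size()
    keep = counts[counts >= 10].index
    filtered = commits_df[commits_df.project_name.isin(keep)]

    # Group the remaining commits per project, as the docstring describes.
    commits_by_project = {name: group for name, group in filtered.groupby('project_name')}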

src/github_analysis/freq_graph.py

Lines changed: 24 additions & 57 deletions
@@ -10,11 +10,18 @@
 
 
 def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./results/motifs_by_cluster.pickle', output_file='./results/clustering_output.pdf'):
-    """
-    :param input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
-    are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-    (isomorphic) motifs occur in the graph.
-    :param output_file: string thats a path of a pdf file to output the graphs to
+    """ Visualize the motif clustering result and write the visualization to a pdf file.
+
+    Parameters
+    ----------
+    input_file_motif_clusters: string with a filepath to a pickled dictionary where the keys are cluster names and the values
+                               are dictionaries where the keys are motifs (nx subgraph) of length k and the values are how
+                               many times similar (isomorphic) motifs occur in the graph.
+    output_file: string that is the path of the pdf file to output the graphs to.
+
+    Returns
+    -------
+    Visualization of the motif clustering, by cluster, saved to a pdf file.
     """
     with open(input_file_motif_clusters, 'rb') as pickle_in:
         motif_clusters = pickle.load(pickle_in)
@@ -30,15 +37,18 @@ def generate_motif_visualisations_by_cluster(input_file_motif_clusters='./results/motifs_by_cluster.pickle', output_file='./results/clustering_output.pdf'):
 
 
 def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
-    """
-    Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs and
-    their associated frequencies.
-
-    :param motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
-    (isomorphic) motifs occur in the graph.
-    :param plot_title: string thats the tile of your plot.
-    :return: fig that is a bar chart of the most common motifs and how often they occurred
-
+    """ Given a collection of motifs and their frequency in a graph, output a file with a bar chart showing the motifs
+    and their associated frequencies.
+
+    Parameters
+    ----------
+    motifs: dictionary where the keys are motifs (nx subgraph) of length k and the values are how many times similar
+            (isomorphic) motifs occur in the graph.
+    plot_title: string that is the title of your plot.
+
+    Returns
+    -------
+    A bar chart figure of the most common motifs and how often they occurred.
     """
     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
 

@@ -97,48 +107,5 @@ def visualize_motif_samples_bar_graph(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
                 '{}% of Sampled Motifs are a Single Chain'.format(round(100 * single_chain_occurences / number_of_samples, 3)))
     return fig
 
-    # plt.savefig(output_file, pad_inches=2)
-    # plt.close()
-
-
-# def visualize_motif_samples(motifs, output_file):
-#     """
-#     Given a sample of motifs, output a file with their graphs and how often they occurred.
-#
-#     :param motifs: a dictionary where the keys are motifs (nx subgraph) of length k and the keys are how many times similar
-#     (isomorphic) motifs occur in the graph.
-#     :param output_file: string thats apath of a pdf file to output the graphs to
-#     :return: a pdf file with name output_file with the graphs and how often they occured
-#     """
-#     motif_count = sum(motifs.values())
-#     motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
-#     with PdfPages(output_file) as pdf:
-#         for motif in motifs_sorted:
-#             fig = plt.figure()
-#             nx.draw_kamada_kawai(motif[0], node_size=25, arrowsize=5)
-#             fig.suptitle('{} Occurrences ({}%)'.format(motif[1], round(100 * motif[1] / motif_count, 3)))
-#             pdf.savefig(fig)
-#             plt.close()
-#
-
 if __name__ == '__main__':
     main()
-
-# try:
-#     makedirs('results/clustering_{}'.format(output_folder_suffix)) # make output folder
-# except FileExistsError:
-#     print('About to overwrite existing output folder and files...')
-#     #TODO: Have user have to type 'y' or something continue, then also delete all files in folder so theres not like one cluster left over from before.
-
-
-
-
-# cluster_visual = visualize_motif_samples_bar_graph(motifs, 'Cluster ' + str(cluster), number_of_samples)
-# pdf.savefig(cluster_visual,pad_inches=2)
-# #visualize_motif_samples(motifs, './results/clustering_{}/cluster_{}.pdf'.format(output_folder_suffix,cluster))
-#
-
-
-# # Sort keys in cluster dictionary so they are outputted
-# sorted_cluster_keys = list(clusters.keys())
-# sorted_cluster_keys.sort()
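The core of visualize_motif_samples_bar_graph, per its docstring, is ranking motifs by frequency and charting the top few. A simplified sketch; the helper name and plotting details are illustrative, not the function's actual body:

    import matplotlib.pyplot as plt

    def motif_frequency_bar(motifs, plot_title='Motif Frequency in Dataset', motifs_to_show=8):
        # Rank motifs by how often they occurred, most frequent first.
        motifs_sorted = sorted(motifs.items(), key=lambda kv: kv[1], reverse=True)
        top = motifs_sorted[:motifs_to_show]
        fig, ax = plt.subplots()
        ax.bar(range(len(top)), [count for _, count in top])
        ax.set_xticks(range(len(top)))
        ax.set_xticklabels(['motif {}'.format(i + 1) for i in range(len(top))])
        ax.set_ylabel('Occurrences')
        ax.set_title(plot_title)
        return fig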

src/github_analysis/graph2vec.py

Lines changed: 37 additions & 24 deletions
@@ -57,12 +57,17 @@ def extract_features(self, projectGraphs):
         return document_collections
 
     def feature_extractor(self, graph, rounds, name):
-        """
-        Function to extract WL features from a graph.
-        :param graph: The nx graph.
-        :param rounds: Number of WL iterations.
-        :param name: ProjectId to output
-        :return doc: Document collection object.
+        """ Function to extract WL features from a graph.
+
+        Parameters
+        ----------
+        graph: The nx graph.
+        rounds: Number of WL iterations.
+        name: ProjectId to output.
+
+        Returns
+        -------
+        doc: Document collection object.
         """
         features = nx.degree(graph)
         features = {int(k):v for k,v, in features}
@@ -73,9 +78,11 @@ def feature_extractor(self, graph, rounds, name):
         return doc
 
     def get_embeddings(self, n_graphs):
-        """
-        Function to get embeddings from the model.
-        :param n_graphs: The number of graphs used to train the model.
+        """ Function to get embeddings from the model.
+
+        Parameters
+        ----------
+        n_graphs: The number of graphs used to train the model.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before getting embeddings")
@@ -91,11 +98,13 @@ def get_embeddings(self, n_graphs):
         return out
 
     def save_embeddings(self, n_graphs, output_path='./results/embeddings.csv', projectGraphsIndex=None):
-        """
-        Function to save the embedding.
-        :param output_path: Path to the embedding csv.
-        :param n_graphs: The number of graphs used to train the model.
-        :param dimensions: The embedding dimension parameter.
+        """ Function to save the embedding.
+
+        Parameters
+        ----------
+        output_path: Path to the embedding csv.
+        n_graphs: The number of graphs used to train the model.
+        dimensions: The embedding dimension parameter.
         """
         if not self.fitted:
             print("Model has not been fit, run Graph2Vec.fit() before saving embeddings")
@@ -113,11 +122,13 @@ class WeisfeilerLehmanMachine:
     Weisfeiler Lehman feature extractor class.
     """
     def __init__(self, graph, features, iterations):
-        """
-        Initialization method which executes feature extraction.
-        :param graph: The Nx graph object.
-        :param features: Feature hash table.
-        :param iterations: Number of WL iterations.
+        """ Initialization method which executes feature extraction.
+
+        Parameters
+        ----------
+        graph: The Nx graph object.
+        features: Feature hash table.
+        iterations: Number of WL iterations.
         """
         self.iterations = iterations
         self.graph = graph
@@ -127,9 +138,11 @@ def __init__(self, graph, features, iterations):
         self.do_recursions()
 
     def do_a_recursion(self):
-        """
-        The method does a single WL recursion.
-        :return new_features: The hash table with extracted WL features.
+        """ The method does a single WL recursion.
+
+        Returns
+        -------
+        new_features: The hash table with extracted WL features.
         """
         new_features = {}
         for node in self.nodes:
@@ -144,8 +157,8 @@ def do_a_recursion(self):
         return new_features
 
     def do_recursions(self):
-        """
-        The method does a series of WL recursions.
+        """ The method does a series of WL recursions.
+
         """
         for iteration in range(self.iterations):
             self.features = self.do_a_recursion()
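For readers unfamiliar with Weisfeiler-Lehman relabelling, a self-contained sketch of the single round that do_a_recursion documents; the md5 hashing and the helper name wl_round are assumptions for illustration, not necessarily the class's internals:

    import hashlib
    import networkx as nx

    def wl_round(graph, features):
        # Each node's new feature hashes its own label together with its
        # neighbours' sorted labels, so structurally identical neighbourhoods
        # collapse to the same label.
        new_features = {}
        for node in graph.nodes():
            neighbor_labels = sorted(str(features[n]) for n in graph.neighbors(node))
            combined = str(features[node]) + '_' + '_'.join(neighbor_labels)
            new_features[node] = hashlib.md5(combined.encode()).hexdigest()
        return new_features

    # Initial features are node degrees, as feature_extractor above sets up.
    g = nx.path_graph(4)
    feats = {int(k): v for k, v in nx.degree(g)}
    feats = wl_round(g, feats)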
