Skip to content

Commit d8b7cf5

Browse files
committed
minor updates to make_report; add generated images
1 parent b41a87f commit d8b7cf5

File tree

5 files changed

+26
-11
lines changed

5 files changed

+26
-11
lines changed

results/mcp_histograms.png

41.7 KB
Loading

results/motif_25_visual.png

168 KB
Loading

results/motif_5_visual.png

137 KB
Loading

results/motif_example.png

27.1 KB
Loading

src/github_analysis/make_report.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,9 @@ def calc_conf(a):
101101
return interval[1]-interval[0]
102102

103103
class Report:
104+
"""Class to generate images used in report and presentation."""
104105
def __init__(self, data_path='/Users/richiezitomer/Documents/RStudio-Data-Repository/clean_data/commits_by_org.feather',
105-
embedding_path='results/embeddings.csv', num_motifs_to_sample=100, motif_lengths=[5,10,25,50,100]):
106+
embedding_path='results/embeddings.csv', num_motifs_to_sample=1000, motif_lengths=[5,10,25,50,100]):
106107
self.emb = pd.read_csv(embedding_path)
107108
self.project_ids = self.emb.type.values
108109
self.proj_ids_string = ",".join(self.project_ids.astype(str))
@@ -113,7 +114,8 @@ def __init__(self, data_path='/Users/richiezitomer/Documents/RStudio-Data-Reposi
113114

114115
self.project_stats_created = False
115116

116-
def make_proj_stats_df(self, read_from_motif_files=False,do_cluster=False):
117+
def make_proj_stats_df(self):
118+
"""Method to make dataframe with stats by project."""
117119
# Load Data
118120
comm_auth_by_proj = pull_queries(COMM_AUTH_BY_PROJ.format(proj_ids=self.proj_ids_string)).set_index(
119121
'p_id') # pd.read_csv('data/author_commits_by_proj_100.csv').set_index('p_id')
@@ -140,7 +142,7 @@ def make_proj_stats_df(self, read_from_motif_files=False,do_cluster=False):
140142
self.project_stats_created = True
141143

142144
def get_multi_chain_percent_by_proj(self, k,proj_id):
143-
"""f """
145+
"""Method that gets multi-chain percentage of each project."""
144146
projects_cluster = self.commits_dl.getCommitsByProjectId(proj_id)
145147
G = git_graph(projects_cluster)
146148
roots = [n for n, d in G.in_degree() if d == 0]
@@ -190,30 +192,41 @@ def get_multi_chain_percent_by_proj(self, k,proj_id):
190192
# fig.savefig(output_path)
191193

192194
def get_most_common_motifs(self, motif_length=5):
    """Plot the most common motifs of one length and save the figure.

    Motifs are sampled with ``mf.get_motifs`` for ``self.project_ids`` and
    ranked by their count, highest first. The top motifs are drawn one per
    panel: up to 9 in a 3x3 grid when ``motif_length`` is 5, otherwise up
    to 8 in a 4x2 grid. Each panel title shows the rank, the count as a
    rounded percentage of ``self.num_motifs_to_sample``, and the raw count.

    Args:
        motif_length: Length of the motifs to sample and plot.

    Returns:
        The figure object created by ``plt.subplots``. It is also written
        to ``results/motif_<motif_length>_visual.png``.
    """
    # NOTE(review): get_motifs presumably returns a mapping of graph -> count;
    # it is only used here via iteration, .get and item lookup.
    motifs = mf.get_motifs(self.project_ids, motif_length, self.num_motifs_to_sample, self.commits_dl)

    # Length-5 motifs get nine panels; every other length gets eight.
    short_motifs = motif_length == 5
    n_rows, n_cols = (3, 3) if short_motifs else (4, 2)
    fig, axs = plt.subplots(n_rows, n_cols)
    fig.set_size_inches(18.5, 10.5)
    panels = axs.flatten()

    # Keep only as many motifs as there are panels, most frequent first.
    ranked = sorted(motifs, key=motifs.get, reverse=True)[:len(panels)]
    for n, (key, ax) in enumerate(zip(ranked, panels)):
        if short_motifs:
            nx.draw_kamada_kawai(key, node_size=300, width=1.5, arrowsize=50, ax=ax)
        elif n == 0:
            # The top-ranked longer motif keeps its own layout and default arrows.
            nx.draw_kamada_kawai(key, node_size=100, width=1, ax=ax)
        else:
            nx.draw_spring(key, node_size=100, width=.8, arrowsize=20, ax=ax)

        count = motifs[key]
        percent = round(100 * (count / self.num_motifs_to_sample))
        ax.set_title('{}. {}% (n={})'.format(str(n + 1), str(percent), str(count)), fontsize=20)

    # Panels without a motif would otherwise show an empty framed axis.
    for ax in panels[len(ranked):]:
        ax.set_axis_off()

    fig.suptitle('Most Common Motifs Length {} Occurrence Rate and Count'.format(motif_length), fontsize=25)
    fig.savefig('results/motif_{}_visual.png'.format(motif_length))
    return fig
215227

216228
def get_motif_example(self, motif_length=25):
229+
"""Method that gets an example motif of motif_length."""
217230
motifs = mf.get_motifs(self.project_ids, motif_length, self.num_motifs_to_sample, self.commits_dl)
218231
second_most_common_motif = sorted(motifs, key=motifs.get, reverse=True)[1]
219232

@@ -225,6 +238,7 @@ def get_motif_example(self, motif_length=25):
225238
return fig
226239

227240
def get_mcp_hist(self):
241+
"""Method that makes a histogram of different motif lengths by project."""
228242
if not self.project_stats_created:
229243
self.make_proj_stats_df()
230244
df = self.project_stats[['mcp_5', 'mcp_10', 'mcp_25', 'mcp_50', 'mcp_100']]
@@ -238,6 +252,7 @@ def get_mcp_hist(self):
238252
return fig
239253

240254
def get_gh_feature_comparison(self):
255+
"""Method that gets relative GH features of high- and low-complexity projects."""
241256
if not self.project_stats_created:
242257
self.make_proj_stats_df()
243258
self.project_stats['complexity'] = self.project_stats.mcp_25.apply(complexity_tag)

0 commit comments

Comments
 (0)