3
3
import plotly .express as px
4
4
import plotly .graph_objects as go
5
5
import humanize
6
+ from wordcloud import WordCloud
7
+ import matplotlib .pyplot as plt
8
+
6
9
7
10
# TODO add viz comment
8
11
# Viz 1 - Number of tracked reports
9
12
def number_of_tracked_reports(df):
    """Return the number of tracked reports in ``df``.

    A report is identified by a distinct ``(year, mnc)`` pair, so the result
    is the number of groups obtained when grouping on those two columns.

    Args:
        df: DataFrame containing at least the ``year`` and ``mnc`` columns.

    Returns:
        int: number of distinct ``(year, mnc)`` groups.
    """
    # len() of a GroupBy object is its number of groups.
    return len(df.groupby(["year", "mnc"])["mnc"])
12
15
16
+
13
17
# TODO add viz comment
14
18
def number_of_tracked_reports_company(df_selected_company):
    """Return the number of tracked reports for the selected company.

    The count is the number of distinct ``year`` values present in the frame.

    Args:
        df_selected_company: DataFrame containing at least a ``year`` column
            (presumably already filtered to a single company - the filtering
            is not done here).

    Returns:
        int: number of distinct ``year`` groups.
    """
    # len() of a GroupBy object is its number of groups.
    return len(df_selected_company.groupby(["year"])["year"])
19
23
24
+
20
25
# TODO add viz comment
21
26
def number_of_tracked_reports_sector(df_selected_sector):
    """Return the number of tracked reports for the selected sector.

    A report is identified by a distinct ``(year, mnc)`` pair.

    Args:
        df_selected_sector: DataFrame containing at least the ``year`` and
            ``mnc`` columns (presumably already filtered to a single sector -
            the filtering is not done here).

    Returns:
        int: number of distinct ``(year, mnc)`` groups.
    """
    # len() of a GroupBy object is its number of groups.
    return len(df_selected_sector.groupby(["year", "mnc"])["year"])
26
31
32
+
27
33
# TODO add viz comment
28
def number_of_tracked_reports_country(df_selected_country):
    """Return the number of tracked reports for the selected country.

    A report is identified by a distinct ``(year, mnc)`` pair.

    Args:
        df_selected_country: DataFrame containing at least the ``year`` and
            ``mnc`` columns (presumably already filtered to a single country -
            the filtering is not done here).

    Returns:
        int: number of distinct ``(year, mnc)`` groups.
    """
    # len() of a GroupBy object is its number of groups.
    return len(df_selected_country.groupby(["year", "mnc"])["year"])
33
39
40
+
34
41
# TODO add viz comment
35
42
# Viz 2 - Number of tracked reports over time
36
43
def number_of_tracked_reports_over_time(df):
    """Count the distinct companies reporting in each year.

    Args:
        df: DataFrame containing at least the ``year`` and ``mnc`` columns.

    Returns:
        pandas.DataFrame: one row per year with two columns, ``year`` and
        ``mnc``, where ``mnc`` holds the number of unique ``mnc`` values
        found for that year.
    """
    # reset_index() turns the grouped 'year' index back into a regular column.
    return df.groupby(["year"])["mnc"].nunique().reset_index()
39
46
47
+
40
48
# TODO add viz comment
41
49
def number_of_tracked_reports_over_time_company (df_selected_company ):
42
50
df_count_company = (
43
51
df_selected_company .groupby (["year" ])["mnc" ].nunique ().reset_index ()
44
- )
52
+ )
45
53
# df_count_all_company = df.groupby(["year"])["mnc"].nunique().reset_index()
46
54
47
55
# row[3].line_chart(df_count_all_company, x="year", y="mnc")
@@ -52,6 +60,7 @@ def number_of_tracked_reports_over_time_company(df_selected_company):
52
60
# )
53
61
return df_count_company
54
62
63
+
55
64
# TODO add viz comment
56
65
def number_of_tracked_reports_over_time_sector (df_selected_sector ):
57
66
df_count_sector = (
@@ -70,6 +79,7 @@ def number_of_tracked_reports_over_time_sector(df_selected_sector):
70
79
# )
71
80
return df_count_sector
72
81
82
+
73
83
# TODO add viz comment
74
84
def number_of_tracked_reports_over_time_country (df_selected_country ):
75
85
df_count_country = (
@@ -82,18 +92,20 @@ def number_of_tracked_reports_over_time_country(df_selected_country):
82
92
# row[5].line_chart(df_count_all_country, x="year", y="mnc", color="jur_name")
83
93
return df_count_country
84
94
95
+
85
96
# Viz 16
86
97
87
98
# company’s % pre-tax profit and profit per employee
88
99
# plot chart : x-axis = % profit, y axis = profit / employee
89
100
# size of the bubble based on % profit and a color code for
90
101
# tax havens vs others
91
102
def company_pourcentage_pretax_profit_and_profit_per_employee(df_selected_company):
    """Compute the pre-tax profit per employee for each row of the frame.

    The previous implementation indexed the frame with an empty column name
    and discarded the result; it now divides ``profit_before_tax`` by
    ``employees`` and returns the ratio.

    NOTE(review): the "% pre-tax profit" part announced by the function name
    is not computed here - TODO confirm the intended output with the Viz 16
    specification.

    Args:
        df_selected_company: DataFrame containing at least the
            ``profit_before_tax`` and ``employees`` columns.

    Returns:
        pandas.Series: ``profit_before_tax / employees``, aligned on the
        index of ``df_selected_company``. Rows with zero employees are not
        special-cased here.
    """
    profit_col_name = 'profit_before_tax'
    employee_col_name = 'employees'
    return df_selected_company[profit_col_name] / df_selected_company[employee_col_name]
96
107
108
+
97
109
# Viz 19
98
110
# what are the tax havens being used by the company
99
111
# to test but could be a table with one row per jurisdiction (filtering on TH) with
@@ -111,9 +123,7 @@ def tax_haven_used_by_company(df_selected_company):
111
123
df_selected_company_th = df_selected_company [df_selected_company ['jur_tax_haven' ] != 'not.TH' ]
112
124
df_selected_company_nth = df_selected_company [df_selected_company ['jur_tax_haven' ] == 'not.TH' ]
113
125
114
-
115
126
for col in pc_list :
116
-
117
127
df_selected_company .insert (
118
128
len (df_selected_company .columns ),
119
129
col + '_domestic_sum' ,
@@ -137,11 +147,9 @@ def tax_haven_used_by_company(df_selected_company):
137
147
df_selected_company .insert (
138
148
len (df_selected_company .columns ),
139
149
col + '_pc' ,
140
- 100 * df_selected_company [col ] / df_selected_company [col + '_sum' ])
150
+ 100 * df_selected_company [col ] / df_selected_company [col + '_sum' ])
141
151
# df_selected_company[col + '_pc'] = 100 * df_selected_company[col] / df_selected_company[col+'_sum']
142
152
143
-
144
-
145
153
df_selected_company_th = df_selected_company [df_selected_company ['jur_tax_haven' ] != 'not.TH' ]
146
154
df_selected_company_th_agg = df_selected_company_th .groupby (['mnc' , 'jur_name' ]).agg (
147
155
profit_before_tax = ('profit_before_tax' , 'sum' ),
@@ -151,16 +159,18 @@ def tax_haven_used_by_company(df_selected_company):
151
159
related_revenues_pc = ('related_revenues_pc' , 'sum' )
152
160
)
153
161
df_selected_company_th_agg = df_selected_company_th_agg .reset_index ()
154
- df_selected_company_th_agg ['profit per employee' ] = \
155
- df_selected_company_th_agg ['profit_before_tax' ]/ df_selected_company_th_agg ['employees' ]
156
- df_selected_company_th_agg ['profit per employee' ] = df_selected_company_th_agg ['profit per employee' ].replace ([np .inf , - np .inf ], None )
162
+ df_selected_company_th_agg ['profit per employee' ] = \
163
+ df_selected_company_th_agg ['profit_before_tax' ] / df_selected_company_th_agg ['employees' ]
164
+ df_selected_company_th_agg ['profit per employee' ] = df_selected_company_th_agg ['profit per employee' ].replace (
165
+ [np .inf , - np .inf ], None )
157
166
158
167
return df_selected_company , df_selected_company_th_agg
159
168
169
+
160
170
# TODO add viz comment
161
171
# complete table showing, for all jurisdictions, revenues, profits, employees and taxes, with % of total for each (color code for tax havens)
162
172
def company_table (df_selected_company ):
163
- company_upe_code = df_selected_company ['upe_code' ].unique ()[0 ]
173
+ # company_upe_code = df_selected_company['upe_code'].unique()[0]
164
174
pc_list = ['employees' , 'profit_before_tax' , 'unrelated_revenues' , 'related_revenues' , 'total_revenues' , 'tax_paid' ]
165
175
166
176
for col in pc_list :
@@ -190,12 +200,11 @@ def company_table(df_selected_company):
190
200
return df_selected_company_by_jur .reset_index ()
191
201
192
202
193
-
194
203
# Viz 4 - Breakdown of reports by sector (pie chart)
195
204
def breakdown_of_reports_by_sector (df ):
196
-
197
205
#Dataframe called df
198
- df_reports_per_sector_year = df .groupby (['sector' , 'year' ])['mnc' ].nunique ().reset_index (name = 'unique_company_count' )
206
+ df_reports_per_sector_year = df .groupby (['sector' , 'year' ])['mnc' ].nunique ().reset_index (
207
+ name = 'unique_company_count' )
199
208
200
209
# Aggregate the counts of unique companies across all years for each sector
201
210
df_reports_per_sector = df_reports_per_sector_year .groupby ('sector' )['unique_company_count' ].sum ().reset_index ()
@@ -204,27 +213,30 @@ def breakdown_of_reports_by_sector(df):
204
213
total_companies = df_reports_per_sector ['unique_company_count' ].sum ()
205
214
206
215
# Calculate the percentage of each sector's count relative to the total count and round to 2 decimals
207
- df_reports_per_sector ['percent' ] = ((df_reports_per_sector ['unique_company_count' ] / total_companies ) * 100 ).round (2 )
216
+ df_reports_per_sector ['percent' ] = ((df_reports_per_sector ['unique_company_count' ] / total_companies ) * 100 ).round (
217
+ 2 )
208
218
209
219
# Sort the DataFrame by the count of unique companies in ascending order
210
220
df_reports_per_sector = df_reports_per_sector .sort_values (by = 'unique_company_count' , ascending = True )
211
221
212
222
return df_reports_per_sector
213
223
224
+
214
225
def breakdown_of_reports_by_sector_viz(df_reports_per_sector):
    """Build the horizontal bar chart of the share of reports per sector.

    Args:
        df_reports_per_sector: DataFrame with the ``sector``, ``percent`` and
            ``unique_company_count`` columns referenced below.

    Returns:
        plotly.graph_objects.Figure: the bar chart; it is returned, not shown.
    """
    # Plotting the horizontal bar chart with Plotly Express
    fig = px.bar(
        df_reports_per_sector,
        y='sector',
        x='percent',
        orientation='h',  # Horizontal orientation
        title='Breakdown of Reports by Sector (All Years)',
        labels={'percent': 'Percentage of Companies (%)', 'sector': 'Sector'},
        text='percent',  # Show the percentage as text label
        # Add tooltip for count and rounded percentage
        hover_data={'unique_company_count': True, 'percent': ':.2f%'},
    )

    # Update layout to display the title above the chart; this title text
    # is applied after the one passed to px.bar above.
    fig.update_layout(
        title='Breakdown of Reports by Sector',
        title_x=0.5, title_y=0.9,  # Adjust position
        title_font_size=20,  # Adjust font size
    )

    # Return the horizontal bar chart
    return go.Figure(fig)
@@ -244,13 +256,14 @@ def breakdown_of_reports_by_hq_country(df):
244
256
245
257
# Calculate the percentage of each HQ country's count relative to the total count and round to 2 decimals
246
258
df_reports_per_country ['percent' ] = (
247
- (df_reports_per_country ['unique_company_count' ] / total_companies ) * 100 ).round (2 )
259
+ (df_reports_per_country ['unique_company_count' ] / total_companies ) * 100 ).round (2 )
248
260
249
261
# Sort the DataFrame by the count of unique companies in ascending order
250
262
df_reports_per_country = df_reports_per_country .sort_values (by = 'unique_company_count' , ascending = True )
251
263
252
264
return df_reports_per_country
253
265
266
+
254
267
def breakdown_of_reports_by_hq_country_viz (df_reports_per_country ):
255
268
# Plotting the horizontal bar chart with Plotly Express
256
269
fig = px .bar (df_reports_per_country , y = 'upe_name' , x = 'percent' ,
@@ -271,11 +284,12 @@ def breakdown_of_reports_by_hq_country_viz(df_reports_per_country):
271
284
# fig.show()
272
285
return go .Figure (fig )
273
286
287
+
274
288
## Viz 6 - Breakdown of reports by sector over time (bar chart)
275
289
276
290
277
291
def breakdown_of_reports_by_sector_over_time (df ):
278
- df_reports_per_sector_over_time = df
292
+ # df_reports_per_sector_over_time = df
279
293
# return df_reports_per_sector_over_time
280
294
281
295
# Step 1: Determine the top 10 sectors that released reports
@@ -285,14 +299,16 @@ def breakdown_of_reports_by_sector_over_time(df):
285
299
df ['Sectors' ] = df ['sector' ].apply (lambda x : x if x in top_10_sectors else 'Others' )
286
300
287
301
# Step 3: Group the DataFrame by 'year', 'Sectors', and count the number of unique companies for each year and sector
288
- df_reports_per_year_sector = df .groupby (['year' , 'Sectors' ])['mnc' ].nunique ().reset_index (name = 'unique_company_count' )
302
+ df_reports_per_year_sector = df .groupby (['year' , 'Sectors' ])['mnc' ].nunique ().reset_index (
303
+ name = 'unique_company_count' )
289
304
290
305
# Sort sectors alphabetically
291
306
df_reports_per_year_sector = df_reports_per_year_sector .sort_values (by = 'Sectors' , ascending = False )
292
307
293
308
return df_reports_per_year_sector , top_10_sectors
294
- def breakdown_of_reports_by_sector_over_time_viz (df_reports_per_year_sector , top_10_sectors ):
295
309
310
+
311
+ def breakdown_of_reports_by_sector_over_time_viz (df_reports_per_year_sector , top_10_sectors ):
296
312
# Define the order of sectors for the stacked bar chart and legend, reversed
297
313
chart_order = ['Others' ] + top_10_sectors [::- 1 ]
298
314
legend_order = ['Others' ] + top_10_sectors [::- 1 ]
@@ -319,10 +335,6 @@ def breakdown_of_reports_by_sector_over_time_viz(df_reports_per_year_sector, top
319
335
return go .Figure (fig )
320
336
321
337
322
-
323
-
324
-
325
-
326
338
## Viz 7 - Breakdown of reports by HQ country over time (bar chart)
327
339
# TODO add code
328
340
@@ -372,6 +384,7 @@ def compute_company_available_reports(df: pd.DataFrame, company: str) -> dict:
372
384
373
385
return data
374
386
387
+
375
388
def display_company_available_reports (
376
389
df : pd .DataFrame , company : str , hide_company : bool = True ) -> pd .DataFrame :
377
390
"""Display the number of reports tracked for a specific company and the
@@ -449,6 +462,8 @@ def compute_company_key_financials_kpis(
449
462
data = df .to_dict (orient = 'index' )
450
463
451
464
return data
465
+
466
+
452
467
def display_company_key_financials_kpis (
453
468
df : pd .DataFrame , company : str , year : int = None ) -> pd .DataFrame :
454
469
"""Display key financial KPIs for a company.
@@ -470,6 +485,7 @@ def display_company_key_financials_kpis(
470
485
471
486
return df
472
487
488
+
473
489
# Viz 14
474
490
def compute_top_jurisdictions_revenue (
475
491
df : pd .DataFrame , company : str , year : int ) -> dict :
@@ -728,8 +744,6 @@ def display_pretax_profit_and_employees_rank(
728
744
return go .Figure (fig )
729
745
730
746
731
-
732
-
733
747
# Viz 18
734
748
735
749
def compute_related_and_unrelated_revenues_breakdown (
@@ -852,7 +866,8 @@ def display_related_and_unrelated_revenues_breakdown(df: pd.DataFrame, company:
852
866
853
867
# fig.show()
854
868
return go .Figure (fig )
855
-
869
+
870
+
856
871
# Viz 21 - evolution of tax havens use over time : % profit vs % employees in TH over time
857
872
def compute_tax_havens_use_evolution (df : pd .DataFrame , company : str ) -> dict :
858
873
"""Compute the evolution of tax havens use by company over time.
@@ -946,8 +961,8 @@ def display_tax_havens_use_evolution(df: pd.DataFrame, company: str):
946
961
947
962
948
963
# Viz 24
949
- from wordcloud import WordCloud
950
- import matplotlib . pyplot as plt
964
+
965
+
951
966
952
967
def viz_24_compute_data (df ):
953
968
# Drop duplicates to ensure each MNC appears only once per year
@@ -961,6 +976,7 @@ def viz_24_compute_data(df):
961
976
962
977
return mnc_report_count
963
978
979
+
964
980
def viz_24_viz (mnc_report_count ):
965
981
# Generate the word cloud using the report counts as weights
966
982
wordcloud = WordCloud (width = 800 , height = 400 , background_color = 'white' ).generate_from_frequencies (mnc_report_count )
@@ -972,18 +988,6 @@ def viz_24_viz(mnc_report_count):
972
988
return fig
973
989
974
990
975
-
976
-
977
-
978
-
979
-
980
-
981
-
982
-
983
-
984
-
985
-
986
-
987
991
# Viz 25
988
992
989
993
# List financial columns
@@ -1090,9 +1094,6 @@ def compute_completness_score(df: pd.DataFrame, company: str, year: int) -> floa
1090
1094
return completness_score
1091
1095
1092
1096
1093
-
1094
-
1095
-
1096
1097
def compute_transparency_score (df : pd .DataFrame , company : str ) -> dict :
1097
1098
"""Compute the transparency score which is the average of component I
1098
1099
(geographic score) and component II (completness score).
@@ -1131,7 +1132,6 @@ def compute_transparency_score(df: pd.DataFrame, company: str) -> dict:
1131
1132
return data
1132
1133
1133
1134
1134
-
1135
1135
def transparency_scores_to_csv (
1136
1136
df : pd .DataFrame , csv_path : str = './' ) -> pd .DataFrame :
1137
1137
"""Compute transparency score for all companies and all years into a
@@ -1221,9 +1221,6 @@ def display_transparency_score(df: pd.DataFrame, company: str, year: int = None)
1221
1221
fig .show ()
1222
1222
1223
1223
1224
-
1225
-
1226
-
1227
1224
# Viz 26
1228
1225
1229
1226
# Functions below use the 'financial_columns' list, and same computation
@@ -1322,6 +1319,3 @@ def display_transparency_score_over_time_details(
1322
1319
})
1323
1320
1324
1321
return df
1325
-
1326
-
1327
-
0 commit comments