@@ -34,14 +34,14 @@ def _():
        "category": np.random.choice(["A", "B", "C", "D"], size=n_rows),
        "value": np.random.rand(n_rows) * 1000,
    }
-    df = pd.DataFrame(data)
-    df.head(10)
-    return data, df, n_rows, np, pd
+    pandas_df = pd.DataFrame(data)
+    pandas_df.head(10)
+    return data, n_rows, np, pandas_df, pd


@app.cell
-def _(df):
-    df.to_csv("large_file.csv", index=False)
+def _(pandas_df):
+    pandas_df.to_csv("large_file.csv", index=False)
    return


@@ -61,11 +61,11 @@ def _(mo):
def _(pd):
    import time

-    start = time.time()
+    start_read_pd = time.time()
    df_pd = pd.read_csv("large_file.csv")
-    end = time.time()
-    print(f"Pandas read_csv took {end - start:.2f} seconds")
-    return df_pd, end, start, time
+    end_read_pd = time.time()
+    print(f"Pandas read_csv took {end_read_pd - start_read_pd:.2f} seconds")
+    return df_pd, end_read_pd, start_read_pd, time


@app.cell(hide_code=True)
@@ -78,11 +78,11 @@ def _(mo):
def _(time):
    import polars as pl

-    start_1 = time.time()
-    df_pl = pl.read_csv("large_file.csv")
-    end_1 = time.time()
-    print(f"Polars read_csv took {end_1 - start_1:.2f} seconds")
-    return df_pl, end_1, pl, start_1
+    start_read_pl = time.time()
+    polars_df = pl.read_csv("large_file.csv")
+    end_read_pl = time.time()
+    print(f"Polars read_csv took {end_read_pl - start_read_pl:.2f} seconds")
+    return end_read_pl, pl, polars_df, start_read_pl


@app.cell(hide_code=True)
@@ -92,16 +92,16 @@ def _(mo):


@app.cell
-def _(df_pl, pl):
-    lazy_df = df_pl.lazy()
+def _(pl, polars_df):
+    lazy_polars_df = polars_df.lazy()
    result = (
-        lazy_df.filter(pl.col("value") > 100)
+        lazy_polars_df.filter(pl.col("value") > 100)
        .group_by("category")
        .agg(pl.col("value").mean().alias("avg_value"))
        .collect()
    )
    result.head(10)
-    return lazy_df, result
+    return lazy_polars_df, result


@app.cell(hide_code=True)
@@ -112,39 +112,109 @@ def _(mo):

@app.cell
def _(data, pd, pl):
-    df_pd_1 = pd.DataFrame(data)
-    df_pl_1 = pl.DataFrame(data)
-    return df_pd_1, df_pl_1
+    pandas_groupby_df = pd.DataFrame(data)
+    polars_groupby_df = pl.DataFrame(data)
+    return pandas_groupby_df, polars_groupby_df


@app.cell(hide_code=True)
def _(mo):
-    mo.md(r"""### Pandas""")
+    mo.md(r"""### Groupby Mean""")
    return


@app.cell
-def _(df_pd_1, time):
-    start_2 = time.time()
-    df_pd_1.groupby("category")["value"].mean()
-    end_2 = time.time()
-    print(f"Pandas groupby took {end_2 - start_2:.2f} seconds")
-    return end_2, start_2
+def _(pandas_groupby_df, time):
+    start_groupby_pd = time.time()
+    pandas_groupby_df.groupby("category")["value"].mean()
+    end_groupby_pd = time.time()
+    print(f"Pandas groupby took {end_groupby_pd - start_groupby_pd:.2f} seconds")
+    return end_groupby_pd, start_groupby_pd
+
+
+@app.cell
+def _(pl, polars_groupby_df, time):
+    start_groupby_pl = time.time()
+    polars_groupby_df.group_by("category").agg(pl.col("value").mean())
+    end_groupby_pl = time.time()
+    print(f"Polars groupby took {end_groupby_pl - start_groupby_pl:.2f} seconds")
+    return end_groupby_pl, start_groupby_pl


@app.cell(hide_code=True)
def _(mo):
-    mo.md(r"""### Polars""")
+    mo.md(r"""### Filter Rows""")
    return


@app.cell
-def _(df_pl_1, pl, time):
-    start_3 = time.time()
-    df_pl_1.group_by("category").agg(pl.col("value").mean())
-    end_3 = time.time()
-    print(f"Polars groupby took {end_3 - start_3:.2f} seconds")
-    return end_3, start_3
+def _(pandas_groupby_df, time):
+    start_filter_pd = time.time()
+    pandas_filtered_df = pandas_groupby_df[pandas_groupby_df["value"] > 500]
+    end_filter_pd = time.time()
+    print(f"Pandas filter took {end_filter_pd - start_filter_pd:.2f} seconds")
+    return end_filter_pd, pandas_filtered_df, start_filter_pd
+
+
+@app.cell
+def _(pl, polars_groupby_df, time):
+    start_filter_pl = time.time()
+    polars_filtered_df = polars_groupby_df.filter(pl.col("value") > 500)
+    end_filter_pl = time.time()
+    print(f"Polars filter took {end_filter_pl - start_filter_pl:.2f} seconds")
+    return end_filter_pl, polars_filtered_df, start_filter_pl
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""### Sort by Column""")
+    return
+
+
+@app.cell
+def _(pandas_groupby_df, time):
+    start_sort_pd = time.time()
+    pandas_sorted_df = pandas_groupby_df.sort_values("value")
+    end_sort_pd = time.time()
+    print(f"Pandas sort took {end_sort_pd - start_sort_pd:.2f} seconds")
+    return end_sort_pd, pandas_sorted_df, start_sort_pd
+
+
+@app.cell
+def _(polars_groupby_df, time):
+    start_sort_pl = time.time()
+    polars_sorted_df = polars_groupby_df.sort("value")
+    end_sort_pl = time.time()
+    print(f"Polars sort took {end_sort_pl - start_sort_pl:.2f} seconds")
+    return end_sort_pl, polars_sorted_df, start_sort_pl
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""### Join on Key""")
+    return
+
+
+@app.cell
+def _(pd, time):
+    pandas_df1 = pd.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
+    pandas_df2 = pd.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
+    start_join_pd = time.time()
+    pandas_joined_df = pd.merge(pandas_df1, pandas_df2, on="key")
+    end_join_pd = time.time()
+    print(f"Pandas join took {end_join_pd - start_join_pd:.2f} seconds")
+    return end_join_pd, pandas_df1, pandas_df2, pandas_joined_df, start_join_pd
+
+
+@app.cell
+def _(pl, time):
+    polars_df1 = pl.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
+    polars_df2 = pl.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
+    start_join_pl = time.time()
+    polars_joined_df = polars_df1.join(polars_df2, on="key", how="inner")
+    end_join_pl = time.time()
+    print(f"Polars join took {end_join_pl - start_join_pl:.2f} seconds")
+    return end_join_pl, polars_df1, polars_df2, polars_joined_df, start_join_pl


@app.cell(hide_code=True)
@@ -160,15 +230,15 @@ def _(mo):


@app.cell
-def _(df_pd_1):
-    df_pd_1[df_pd_1["value"] > 100]
-    return
+def _(pandas_groupby_df):
+    pandas_filtered_rows_df = pandas_groupby_df[pandas_groupby_df["value"] > 100]
+    return (pandas_filtered_rows_df,)


@app.cell
-def _(df_pl_1, pl):
-    df_pl_1.filter(pl.col("value") > 100)
-    return
+def _(pl, polars_groupby_df):
+    polars_filtered_rows_df = polars_groupby_df.filter(pl.col("value") > 100)
+    return (polars_filtered_rows_df,)


@app.cell(hide_code=True)
@@ -178,15 +248,15 @@ def _(mo):


@app.cell
-def _(df_pd_1):
-    df_pd_1[["category", "value"]]
-    return
+def _(pandas_groupby_df):
+    pandas_selected_columns_df = pandas_groupby_df[["category", "value"]]
+    return (pandas_selected_columns_df,)


@app.cell
-def _(df_pl_1):
-    df_pl_1.select(["category", "value"])
-    return
+def _(polars_groupby_df):
+    polars_selected_columns_df = polars_groupby_df.select(["category", "value"])
+    return (polars_selected_columns_df,)


@app.cell(hide_code=True)
@@ -196,20 +266,21 @@ def _(mo):


@app.cell
-def _(df_pd_1):
-    df_result = df_pd_1[df_pd_1["value"] > 1000]
-    df_result = df_result.groupby("category")["value"].mean().reset_index()
-    return (df_result,)
+def _(pandas_groupby_df):
+    pandas_chained_operations_df = pandas_groupby_df[pandas_groupby_df["value"] > 1000]
+    pandas_chained_operations_df = (
+        pandas_chained_operations_df.groupby("category")["value"].mean().reset_index()
+    )
+    return (pandas_chained_operations_df,)


@app.cell
-def _(df_pl_1, pl):
-    df_result_1 = (
-        df_pl_1.filter(pl.col("value") > 1000)
-        .group_by("category")
-        .agg(pl.col("value").mean().alias("avg_value"))
-    )
-    return (df_result_1,)
+def _(pl, polars_groupby_df):
+    polars_chained_operations_df = polars_groupby_df.filter(pl.col("value") > 1000)
+    polars_chained_operations_df = polars_chained_operations_df.group_by(
+        "category"
+    ).agg(pl.col("value").mean().alias("avg_value"))
+    return (polars_chained_operations_df,)


@app.cell(hide_code=True)
@@ -219,9 +290,18 @@ def _(mo):


@app.cell
-def _(df_pd_1, df_pl_1):
-    print(df_pd_1.memory_usage(deep=True).sum() / 1000000.0, "MB")
-    print(df_pl_1.estimated_size() / 1000000.0, "MB")
+def _(pandas_groupby_df, polars_groupby_df):
+    print(
+ f"Pandas DataFrame memory usage: { pandas_groupby_df .memory_usage (deep = True ).sum () / 1000000.0 :2f} MB"
+    )
+    print(
+        f"Polars DataFrame estimated size: {polars_groupby_df.estimated_size() / 1000000.0} MB"
+    )
+    return
+
+
+@app.cell
+def _():
    return

