Skip to content

Commit cdc9cb2

Browse files
adjust notebook and workflow
1 parent e3bf18f commit cdc9cb2

File tree

4 files changed

+159
-65
lines changed

4 files changed

+159
-65
lines changed

.github/workflows/publish-marimo.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,4 @@ jobs:
5454
id: deployment
5555
uses: actions/deploy-pages@v4
5656
with:
57-
artifact_name: github-pages
58-
path: marimo_notebooks/data_science_tools/polars_vs_pandas
57+
artifact_name: github-pages

.gitignore

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,5 +145,5 @@ dmypy.json
145145

146146
#hydra
147147
outputs
148-
149-
148+
marimo_notebooks
149+
*.csv

Makefile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Define variables
2+
NOTEBOOK ?= notebook.py # Default value, can be overridden
3+
OUTPUT_DIR = marimo_notebooks
4+
OUTPUT_FILE = $(OUTPUT_DIR)/$(notdir $(NOTEBOOK:.py=.html))
5+
6+
# Create the output directory if it doesn't exist
7+
$(OUTPUT_DIR):
8+
mkdir -p $(OUTPUT_DIR)
9+
10+
# Export the Marimo notebook to a standalone HTML file
11+
html: $(OUTPUT_DIR)
12+
marimo export html $(NOTEBOOK) --output $(OUTPUT_FILE)
13+
14+
# Phony targets
15+
.PHONY: html

data_science_tools/polars_vs_pandas.py

Lines changed: 141 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,14 @@ def _():
3434
"category": np.random.choice(["A", "B", "C", "D"], size=n_rows),
3535
"value": np.random.rand(n_rows) * 1000,
3636
}
37-
df = pd.DataFrame(data)
38-
df.head(10)
39-
return data, df, n_rows, np, pd
37+
pandas_df = pd.DataFrame(data)
38+
pandas_df.head(10)
39+
return data, n_rows, np, pandas_df, pd
4040

4141

4242
@app.cell
43-
def _(df):
44-
df.to_csv("large_file.csv", index=False)
43+
def _(pandas_df):
44+
pandas_df.to_csv("large_file.csv", index=False)
4545
return
4646

4747

@@ -61,11 +61,11 @@ def _(mo):
6161
def _(pd):
6262
import time
6363

64-
start = time.time()
64+
start_read_pd = time.time()
6565
df_pd = pd.read_csv("large_file.csv")
66-
end = time.time()
67-
print(f"Pandas read_csv took {end - start:.2f} seconds")
68-
return df_pd, end, start, time
66+
end_read_pd = time.time()
67+
print(f"Pandas read_csv took {end_read_pd - start_read_pd:.2f} seconds")
68+
return df_pd, end_read_pd, start_read_pd, time
6969

7070

7171
@app.cell(hide_code=True)
@@ -78,11 +78,11 @@ def _(mo):
7878
def _(time):
7979
import polars as pl
8080

81-
start_1 = time.time()
82-
df_pl = pl.read_csv("large_file.csv")
83-
end_1 = time.time()
84-
print(f"Polars read_csv took {end_1 - start_1:.2f} seconds")
85-
return df_pl, end_1, pl, start_1
81+
start_read_pl = time.time()
82+
polars_df = pl.read_csv("large_file.csv")
83+
end_read_pl = time.time()
84+
print(f"Polars read_csv took {end_read_pl - start_read_pl:.2f} seconds")
85+
return end_read_pl, pl, polars_df, start_read_pl
8686

8787

8888
@app.cell(hide_code=True)
@@ -92,16 +92,16 @@ def _(mo):
9292

9393

9494
@app.cell
95-
def _(df_pl, pl):
96-
lazy_df = df_pl.lazy()
95+
def _(pl, polars_df):
96+
lazy_polars_df = polars_df.lazy()
9797
result = (
98-
lazy_df.filter(pl.col("value") > 100)
98+
lazy_polars_df.filter(pl.col("value") > 100)
9999
.group_by("category")
100100
.agg(pl.col("value").mean().alias("avg_value"))
101101
.collect()
102102
)
103103
result.head(10)
104-
return lazy_df, result
104+
return lazy_polars_df, result
105105

106106

107107
@app.cell(hide_code=True)
@@ -112,39 +112,109 @@ def _(mo):
112112

113113
@app.cell
114114
def _(data, pd, pl):
115-
df_pd_1 = pd.DataFrame(data)
116-
df_pl_1 = pl.DataFrame(data)
117-
return df_pd_1, df_pl_1
115+
pandas_groupby_df = pd.DataFrame(data)
116+
polars_groupby_df = pl.DataFrame(data)
117+
return pandas_groupby_df, polars_groupby_df
118118

119119

120120
@app.cell(hide_code=True)
121121
def _(mo):
122-
mo.md(r"""### Pandas""")
122+
mo.md(r"""### Groupby Mean""")
123123
return
124124

125125

126126
@app.cell
127-
def _(df_pd_1, time):
128-
start_2 = time.time()
129-
df_pd_1.groupby("category")["value"].mean()
130-
end_2 = time.time()
131-
print(f"Pandas groupby took {end_2 - start_2:.2f} seconds")
132-
return end_2, start_2
127+
def _(pandas_groupby_df, time):
128+
start_groupby_pd = time.time()
129+
pandas_groupby_df.groupby("category")["value"].mean()
130+
end_groupby_pd = time.time()
131+
print(f"Pandas groupby took {end_groupby_pd - start_groupby_pd:.2f} seconds")
132+
return end_groupby_pd, start_groupby_pd
133+
134+
135+
@app.cell
136+
def _(pl, polars_groupby_df, time):
137+
start_groupby_pl = time.time()
138+
polars_groupby_df.group_by("category").agg(pl.col("value").mean())
139+
end_groupby_pl = time.time()
140+
print(f"Polars groupby took {end_groupby_pl - start_groupby_pl:.2f} seconds")
141+
return end_groupby_pl, start_groupby_pl
133142

134143

135144
@app.cell(hide_code=True)
136145
def _(mo):
137-
mo.md(r"""### Polars""")
146+
mo.md(r"""### Filter Rows""")
138147
return
139148

140149

141150
@app.cell
142-
def _(df_pl_1, pl, time):
143-
start_3 = time.time()
144-
df_pl_1.group_by("category").agg(pl.col("value").mean())
145-
end_3 = time.time()
146-
print(f"Polars groupby took {end_3 - start_3:.2f} seconds")
147-
return end_3, start_3
151+
def _(pandas_groupby_df, time):
152+
start_filter_pd = time.time()
153+
pandas_filtered_df = pandas_groupby_df[pandas_groupby_df["value"] > 500]
154+
end_filter_pd = time.time()
155+
print(f"Pandas filter took {end_filter_pd - start_filter_pd:.2f} seconds")
156+
return end_filter_pd, pandas_filtered_df, start_filter_pd
157+
158+
159+
@app.cell
160+
def _(pl, polars_groupby_df, time):
161+
start_filter_pl = time.time()
162+
polars_filtered_df = polars_groupby_df.filter(pl.col("value") > 500)
163+
end_filter_pl = time.time()
164+
print(f"Polars filter took {end_filter_pl - start_filter_pl:.2f} seconds")
165+
return end_filter_pl, polars_filtered_df, start_filter_pl
166+
167+
168+
@app.cell(hide_code=True)
169+
def _(mo):
170+
mo.md(r"""### Sort by Column""")
171+
return
172+
173+
174+
@app.cell
175+
def _(pandas_groupby_df, time):
176+
start_sort_pd = time.time()
177+
pandas_sorted_df = pandas_groupby_df.sort_values("value")
178+
end_sort_pd = time.time()
179+
print(f"Pandas sort took {end_sort_pd - start_sort_pd:.2f} seconds")
180+
return end_sort_pd, pandas_sorted_df, start_sort_pd
181+
182+
183+
@app.cell
184+
def _(polars_groupby_df, time):
185+
start_sort_pl = time.time()
186+
polars_sorted_df = polars_groupby_df.sort("value")
187+
end_sort_pl = time.time()
188+
print(f"Polars sort took {end_sort_pl - start_sort_pl:.2f} seconds")
189+
return end_sort_pl, polars_sorted_df, start_sort_pl
190+
191+
192+
@app.cell(hide_code=True)
193+
def _(mo):
194+
mo.md(r"""### Join on Key""")
195+
return
196+
197+
198+
@app.cell
199+
def _(pd, time):
200+
pandas_df1 = pd.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
201+
pandas_df2 = pd.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
202+
start_join_pd = time.time()
203+
pandas_joined_df = pd.merge(pandas_df1, pandas_df2, on="key")
204+
end_join_pd = time.time()
205+
print(f"Pandas join took {end_join_pd - start_join_pd:.2f} seconds")
206+
return end_join_pd, pandas_df1, pandas_df2, pandas_joined_df, start_join_pd
207+
208+
209+
@app.cell
210+
def _(pl, time):
211+
polars_df1 = pl.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
212+
polars_df2 = pl.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
213+
start_join_pl = time.time()
214+
polars_joined_df = polars_df1.join(polars_df2, on="key", how="inner")
215+
end_join_pl = time.time()
216+
print(f"Polars join took {end_join_pl - start_join_pl:.2f} seconds")
217+
return end_join_pl, polars_df1, polars_df2, polars_joined_df, start_join_pl
148218

149219

150220
@app.cell(hide_code=True)
@@ -160,15 +230,15 @@ def _(mo):
160230

161231

162232
@app.cell
163-
def _(df_pd_1):
164-
df_pd_1[df_pd_1["value"] > 100]
165-
return
233+
def _(pandas_groupby_df):
234+
pandas_filtered_rows_df = pandas_groupby_df[pandas_groupby_df["value"] > 100]
235+
return (pandas_filtered_rows_df,)
166236

167237

168238
@app.cell
169-
def _(df_pl_1, pl):
170-
df_pl_1.filter(pl.col("value") > 100)
171-
return
239+
def _(pl, polars_groupby_df):
240+
polars_filtered_rows_df = polars_groupby_df.filter(pl.col("value") > 100)
241+
return (polars_filtered_rows_df,)
172242

173243

174244
@app.cell(hide_code=True)
@@ -178,15 +248,15 @@ def _(mo):
178248

179249

180250
@app.cell
181-
def _(df_pd_1):
182-
df_pd_1[["category", "value"]]
183-
return
251+
def _(pandas_groupby_df):
252+
pandas_selected_columns_df = pandas_groupby_df[["category", "value"]]
253+
return (pandas_selected_columns_df,)
184254

185255

186256
@app.cell
187-
def _(df_pl_1):
188-
df_pl_1.select(["category", "value"])
189-
return
257+
def _(polars_groupby_df):
258+
polars_selected_columns_df = polars_groupby_df.select(["category", "value"])
259+
return (polars_selected_columns_df,)
190260

191261

192262
@app.cell(hide_code=True)
@@ -196,20 +266,21 @@ def _(mo):
196266

197267

198268
@app.cell
199-
def _(df_pd_1):
200-
df_result = df_pd_1[df_pd_1["value"] > 1000]
201-
df_result = df_result.groupby("category")["value"].mean().reset_index()
202-
return (df_result,)
269+
def _(pandas_groupby_df):
270+
pandas_chained_operations_df = pandas_groupby_df[pandas_groupby_df["value"] > 1000]
271+
pandas_chained_operations_df = (
272+
pandas_chained_operations_df.groupby("category")["value"].mean().reset_index()
273+
)
274+
return (pandas_chained_operations_df,)
203275

204276

205277
@app.cell
206-
def _(df_pl_1, pl):
207-
df_result_1 = (
208-
df_pl_1.filter(pl.col("value") > 1000)
209-
.group_by("category")
210-
.agg(pl.col("value").mean().alias("avg_value"))
211-
)
212-
return (df_result_1,)
278+
def _(pl, polars_groupby_df):
279+
polars_chained_operations_df = polars_groupby_df.filter(pl.col("value") > 1000)
280+
polars_chained_operations_df = polars_chained_operations_df.group_by(
281+
"category"
282+
).agg(pl.col("value").mean().alias("avg_value"))
283+
return (polars_chained_operations_df,)
213284

214285

215286
@app.cell(hide_code=True)
@@ -219,9 +290,18 @@ def _(mo):
219290

220291

221292
@app.cell
222-
def _(df_pd_1, df_pl_1):
223-
print(df_pd_1.memory_usage(deep=True).sum() / 1000000.0, "MB")
224-
print(df_pl_1.estimated_size() / 1000000.0, "MB")
293+
def _(pandas_groupby_df, polars_groupby_df):
294+
print(
295+
f"Pandas DataFrame memory usage: {pandas_groupby_df.memory_usage(deep=True).sum() / 1000000.0:.2f} MB"
296+
)
297+
print(
298+
f"Polars DataFrame estimated size: {polars_groupby_df.estimated_size() / 1000000.0} MB"
299+
)
300+
return
301+
302+
303+
@app.cell
304+
def _():
225305
return
226306

227307

0 commit comments

Comments
 (0)