adjust notebook and workflow

khuyentran1401 · khuyentran1401 · commit cdc9cb2b004b · 2025-04-18T11:32:19.000-07:00
diff --git a/.github/workflows/publish-marimo.yml b/.github/workflows/publish-marimo.yml
@@ -54,5 +54,4 @@ jobs:
           id: deployment
           uses: actions/deploy-pages@v4
           with:
-              artifact_name: github-pages
-              path: marimo_notebooks/data_science_tools/polars_vs_pandas
+              artifact_name: github-pages
diff --git a/.gitignore b/.gitignore
@@ -145,5 +145,5 @@ dmypy.json
 
 #hydra
 outputs
-
-
+marimo_notebooks
+*.csv
diff --git a/Makefile b/Makefile
@@ -0,0 +1,15 @@
+# Define variables
+NOTEBOOK ?= notebook.py  # Default value, can be overridden
+OUTPUT_DIR = marimo_notebooks
+OUTPUT_FILE = $(OUTPUT_DIR)/$(notdir $(NOTEBOOK:.py=.html))
+
+# Create the output directory if it doesn't exist
+$(OUTPUT_DIR):
+	mkdir -p $(OUTPUT_DIR)
+
+# Export the Marimo notebook to HTML
+html: $(OUTPUT_DIR)
+	marimo export html $(NOTEBOOK) --output $(OUTPUT_FILE)
+
+# Phony targets
+.PHONY: html
diff --git a/data_science_tools/polars_vs_pandas.py b/data_science_tools/polars_vs_pandas.py
@@ -34,14 +34,14 @@ def _():
         "category": np.random.choice(["A", "B", "C", "D"], size=n_rows),
         "value": np.random.rand(n_rows) * 1000,
     }
-    df = pd.DataFrame(data)
-    df.head(10)
-    return data, df, n_rows, np, pd
+    pandas_df = pd.DataFrame(data)
+    pandas_df.head(10)
+    return data, n_rows, np, pandas_df, pd
 
 
 @app.cell
-def _(df):
-    df.to_csv("large_file.csv", index=False)
+def _(pandas_df):
+    pandas_df.to_csv("large_file.csv", index=False)
     return
 
 
@@ -61,11 +61,11 @@ def _(mo):
 def _(pd):
     import time
 
-    start = time.time()
+    start_read_pd = time.time()
     df_pd = pd.read_csv("large_file.csv")
-    end = time.time()
-    print(f"Pandas read_csv took {end - start:.2f} seconds")
-    return df_pd, end, start, time
+    end_read_pd = time.time()
+    print(f"Pandas read_csv took {end_read_pd - start_read_pd:.2f} seconds")
+    return df_pd, end_read_pd, start_read_pd, time
 
 
 @app.cell(hide_code=True)
@@ -78,11 +78,11 @@ def _(mo):
 def _(time):
     import polars as pl
 
-    start_1 = time.time()
-    df_pl = pl.read_csv("large_file.csv")
-    end_1 = time.time()
-    print(f"Polars read_csv took {end_1 - start_1:.2f} seconds")
-    return df_pl, end_1, pl, start_1
+    start_read_pl = time.time()
+    polars_df = pl.read_csv("large_file.csv")
+    end_read_pl = time.time()
+    print(f"Polars read_csv took {end_read_pl - start_read_pl:.2f} seconds")
+    return end_read_pl, pl, polars_df, start_read_pl
 
 
 @app.cell(hide_code=True)
@@ -92,16 +92,16 @@ def _(mo):
 
 
 @app.cell
-def _(df_pl, pl):
-    lazy_df = df_pl.lazy()
+def _(pl, polars_df):
+    lazy_polars_df = polars_df.lazy()
     result = (
-        lazy_df.filter(pl.col("value") > 100)
+        lazy_polars_df.filter(pl.col("value") > 100)
         .group_by("category")
         .agg(pl.col("value").mean().alias("avg_value"))
         .collect()
     )
     result.head(10)
-    return lazy_df, result
+    return lazy_polars_df, result
 
 
 @app.cell(hide_code=True)
@@ -112,39 +112,109 @@ def _(mo):
 
 @app.cell
 def _(data, pd, pl):
-    df_pd_1 = pd.DataFrame(data)
-    df_pl_1 = pl.DataFrame(data)
-    return df_pd_1, df_pl_1
+    pandas_groupby_df = pd.DataFrame(data)
+    polars_groupby_df = pl.DataFrame(data)
+    return pandas_groupby_df, polars_groupby_df
 
 
 @app.cell(hide_code=True)
 def _(mo):
-    mo.md(r"""### Pandas""")
+    mo.md(r"""### Groupby Mean""")
     return
 
 
 @app.cell
-def _(df_pd_1, time):
-    start_2 = time.time()
-    df_pd_1.groupby("category")["value"].mean()
-    end_2 = time.time()
-    print(f"Pandas groupby took {end_2 - start_2:.2f} seconds")
-    return end_2, start_2
+def _(pandas_groupby_df, time):
+    start_groupby_pd = time.time()
+    pandas_groupby_df.groupby("category")["value"].mean()
+    end_groupby_pd = time.time()
+    print(f"Pandas groupby took {end_groupby_pd - start_groupby_pd:.2f} seconds")
+    return end_groupby_pd, start_groupby_pd
+
+
+@app.cell
+def _(pl, polars_groupby_df, time):
+    start_groupby_pl = time.time()
+    polars_groupby_df.group_by("category").agg(pl.col("value").mean())
+    end_groupby_pl = time.time()
+    print(f"Polars groupby took {end_groupby_pl - start_groupby_pl:.2f} seconds")
+    return end_groupby_pl, start_groupby_pl
 
 
 @app.cell(hide_code=True)
 def _(mo):
-    mo.md(r"""### Polars""")
+    mo.md(r"""### Filter Rows""")
     return
 
 
 @app.cell
-def _(df_pl_1, pl, time):
-    start_3 = time.time()
-    df_pl_1.group_by("category").agg(pl.col("value").mean())
-    end_3 = time.time()
-    print(f"Polars groupby took {end_3 - start_3:.2f} seconds")
-    return end_3, start_3
+def _(pandas_groupby_df, time):
+    start_filter_pd = time.time()
+    pandas_filtered_df = pandas_groupby_df[pandas_groupby_df["value"] > 500]
+    end_filter_pd = time.time()
+    print(f"Pandas filter took {end_filter_pd - start_filter_pd:.2f} seconds")
+    return end_filter_pd, pandas_filtered_df, start_filter_pd
+
+
+@app.cell
+def _(pl, polars_groupby_df, time):
+    start_filter_pl = time.time()
+    polars_filtered_df = polars_groupby_df.filter(pl.col("value") > 500)
+    end_filter_pl = time.time()
+    print(f"Polars filter took {end_filter_pl - start_filter_pl:.2f} seconds")
+    return end_filter_pl, polars_filtered_df, start_filter_pl
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""### Sort by Column""")
+    return
+
+
+@app.cell
+def _(pandas_groupby_df, time):
+    start_sort_pd = time.time()
+    pandas_sorted_df = pandas_groupby_df.sort_values("value")
+    end_sort_pd = time.time()
+    print(f"Pandas sort took {end_sort_pd - start_sort_pd:.2f} seconds")
+    return end_sort_pd, pandas_sorted_df, start_sort_pd
+
+
+@app.cell
+def _(polars_groupby_df, time):
+    start_sort_pl = time.time()
+    polars_sorted_df = polars_groupby_df.sort("value")
+    end_sort_pl = time.time()
+    print(f"Polars sort took {end_sort_pl - start_sort_pl:.2f} seconds")
+    return end_sort_pl, polars_sorted_df, start_sort_pl
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""### Join on Key""")
+    return
+
+
+@app.cell
+def _(pd, time):
+    pandas_df1 = pd.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
+    pandas_df2 = pd.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
+    start_join_pd = time.time()
+    pandas_joined_df = pd.merge(pandas_df1, pandas_df2, on="key")
+    end_join_pd = time.time()
+    print(f"Pandas join took {end_join_pd - start_join_pd:.2f} seconds")
+    return end_join_pd, pandas_df1, pandas_df2, pandas_joined_df, start_join_pd
+
+
+@app.cell
+def _(pl, time):
+    polars_df1 = pl.DataFrame({"key": range(5_000_000), "val1": range(5_000_000)})
+    polars_df2 = pl.DataFrame({"key": range(5_000_000), "val2": range(5_000_000)})
+    start_join_pl = time.time()
+    polars_joined_df = polars_df1.join(polars_df2, on="key", how="inner")
+    end_join_pl = time.time()
+    print(f"Polars join took {end_join_pl - start_join_pl:.2f} seconds")
+    return end_join_pl, polars_df1, polars_df2, polars_joined_df, start_join_pl
 
 
 @app.cell(hide_code=True)
@@ -160,15 +230,15 @@ def _(mo):
 
 
 @app.cell
-def _(df_pd_1):
-    df_pd_1[df_pd_1["value"] > 100]
-    return
+def _(pandas_groupby_df):
+    pandas_filtered_rows_df = pandas_groupby_df[pandas_groupby_df["value"] > 100]
+    return (pandas_filtered_rows_df,)
 
 
 @app.cell
-def _(df_pl_1, pl):
-    df_pl_1.filter(pl.col("value") > 100)
-    return
+def _(pl, polars_groupby_df):
+    polars_filtered_rows_df = polars_groupby_df.filter(pl.col("value") > 100)
+    return (polars_filtered_rows_df,)
 
 
 @app.cell(hide_code=True)
@@ -178,15 +248,15 @@ def _(mo):
 
 
 @app.cell
-def _(df_pd_1):
-    df_pd_1[["category", "value"]]
-    return
+def _(pandas_groupby_df):
+    pandas_selected_columns_df = pandas_groupby_df[["category", "value"]]
+    return (pandas_selected_columns_df,)
 
 
 @app.cell
-def _(df_pl_1):
-    df_pl_1.select(["category", "value"])
-    return
+def _(polars_groupby_df):
+    polars_selected_columns_df = polars_groupby_df.select(["category", "value"])
+    return (polars_selected_columns_df,)
 
 
 @app.cell(hide_code=True)
@@ -196,20 +266,21 @@ def _(mo):
 
 
 @app.cell
-def _(df_pd_1):
-    df_result = df_pd_1[df_pd_1["value"] > 1000]
-    df_result = df_result.groupby("category")["value"].mean().reset_index()
-    return (df_result,)
+def _(pandas_groupby_df):
+    pandas_chained_operations_df = pandas_groupby_df[pandas_groupby_df["value"] > 1000]
+    pandas_chained_operations_df = (
+        pandas_chained_operations_df.groupby("category")["value"].mean().reset_index()
+    )
+    return (pandas_chained_operations_df,)
 
 
 @app.cell
-def _(df_pl_1, pl):
-    df_result_1 = (
-        df_pl_1.filter(pl.col("value") > 1000)
-        .group_by("category")
-        .agg(pl.col("value").mean().alias("avg_value"))
-    )
-    return (df_result_1,)
+def _(pl, polars_groupby_df):
+    polars_chained_operations_df = polars_groupby_df.filter(pl.col("value") > 1000)
+    polars_chained_operations_df = polars_chained_operations_df.group_by(
+        "category"
+    ).agg(pl.col("value").mean().alias("avg_value"))
+    return (polars_chained_operations_df,)
 
 
 @app.cell(hide_code=True)
@@ -219,9 +290,18 @@ def _(mo):
 
 
 @app.cell
-def _(df_pd_1, df_pl_1):
-    print(df_pd_1.memory_usage(deep=True).sum() / 1000000.0, "MB")
-    print(df_pl_1.estimated_size() / 1000000.0, "MB")
+def _(pandas_groupby_df, polars_groupby_df):
+    print(
+        f"Pandas DataFrame memory usage: {pandas_groupby_df.memory_usage(deep=True).sum() / 1000000.0:2f} MB"
+    )
+    print(
+        f"Polars DataFrame estimated size: {polars_groupby_df.estimated_size() / 1000000.0} MB"
+    )
+    return
+
+
+@app.cell
+def _():
     return
 
 

-Original file line number
+Diff line change
 #hydra
 outputs
+-
+-
 +marimo_notebooks
 +*.csv