
Commit 6026517

Merge pull request #22 from khuyentran1401/add-contribution-guide

Add contribution guide

2 parents 4fbc862 + 5cf3f5e

File tree

8 files changed: +442 -476 lines changed

.github/workflows/publish-marimo.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -33,6 +33,7 @@ jobs:
           uv run marimo export html llm/pydantic_ai_examples.py -o build/llm/pydantic_ai_examples.html --sandbox
           uv run marimo export html data_science_tools/pandas_api_on_spark.py -o build/data_science_tools/pandas_api_on_spark.html --sandbox
           uv run marimo export html data_science_tools/pyspark_parametrize.py -o build/data_science_tools/pyspark_parametrize.html --sandbox
+          uv run marimo export html data_science_tools/narwhals.py -o build/data_science_tools/narwhals.html --sandbox
       - name: Upload Pages Artifact
         uses: actions/upload-pages-artifact@v3
         with:
```
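The new step can be sanity-checked before pushing by running the same export locally. This assumes uv is installed and reuses the command exactly as it appears in the workflow:

```bash
# Export the new notebook to static HTML, as the CI step does
uv run marimo export html data_science_tools/narwhals.py -o build/data_science_tools/narwhals.html --sandbox
```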

.pre-commit-config.yaml

Lines changed: 9 additions & 13 deletions
```diff
@@ -1,14 +1,10 @@
 repos:
-  - repo: https://github.com/ambv/black
-    rev: 20.8b1
-    hooks:
-      - id: black
-        additional_dependencies: ['click==8.0.4']
-  - repo: https://github.com/pycqa/flake8
-    rev: 3.8.4
-    hooks:
-      - id: flake8
-  - repo: https://github.com/timothycrosley/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.11.6
+    hooks:
+      - id: ruff
+        args: [--fix]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.15.0
+    hooks:
+      - id: mypy
```
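After switching from black/flake8/isort to ruff and mypy, it is worth exercising the new hooks once across the whole repo. A minimal local check, assuming pre-commit is available in the project environment (it is added to pyproject.toml below):

```bash
# Install the git hook scripts into .git/hooks
uv run pre-commit install

# Run every configured hook against all files once
uv run pre-commit run --all-files
```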

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,7 +9,7 @@ Collection of useful data science topics along with articles and videos.
 ## The Data Scientist's Toolkit: 100+ Essential Tools for Modern Analytics
 
 To receive a condensed overview of these tools and additional resources, sign up for [CodeCut's free PDF guide](https://codecut.ai/data-scientist-toolkit/?utm_source=github&utm_medium=data_science_repo&utm_campaign=free_pdf). This comprehensive 264-page document covers over 100 essential data science tools, providing you with a valuable reference for your work.
-
+
 ## How to Download the Code in This Repository to Your Local Machine
 
 To download the code in this repo, you can simply use git clone
```

contribution.md

Lines changed: 65 additions & 0 deletions
````markdown
# Contribution Guidelines

## Environment Setup

### Install uv

[uv](https://github.com/astral-sh/uv) is a fast Python package installer and resolver.

```bash
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh

# Verify installation
uv --version
```

### Install Dependencies

```bash
# Install dependencies from pyproject.toml
uv sync
```

### Install Pre-commit Hooks

We use pre-commit to ensure code quality and consistency.

```bash
# Install pre-commit hooks
uv run pre-commit install
```

## Working with Marimo Notebooks

### Creating a New Notebook

Create a new notebook using marimo:

```bash
uv run marimo edit notebook.py --sandbox
```

### Publishing Notebooks

Add the following workflow to `.github/workflows/publish-marimo.yml`:

```yaml
...
jobs:
  publish:
    runs-on: ubuntu-latest
    steps:
      ...
      - name: Export notebook
        run: |
          uv run marimo export html notebook.py -o build/notebook.html --sandbox
      ...
```

## Pull Request Process

1. Fork the repository
2. Create a new branch for your feature
3. Make your changes
4. Submit a pull request with a clear description of changes
````
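In command form, steps 1 to 3 of the pull request process above might look like the following. This is a sketch only: the fork URL and branch name are placeholders, and the fork itself is created through the GitHub UI or CLI.

```bash
# 1. Fork the repository on GitHub, then clone your fork (placeholder URL)
git clone https://github.com/<your-username>/<this-repo>.git
cd <this-repo>

# 2. Create a new branch for your feature (placeholder branch name)
git checkout -b my-feature

# 3. Make your changes, then commit and push them to your fork
git add .
git commit -m "Describe your change"
git push -u origin my-feature
```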

data_science_tools/narwhals.py

Lines changed: 231 additions & 0 deletions
```python
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "duckdb==1.2.2",
#     "marimo",
#     "narwhals==1.39.0",
#     "pandas==2.2.3",
#     "polars==1.29.0",
#     "pyarrow==20.0.0",
#     "pyspark==3.5.5",
#     "sqlframe==3.32.1",
# ]
# ///

import marimo

__generated_with = "0.13.6"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo

    return (mo,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    # Dataframe-agnostic data science

    Let's define a dataframe-agnostic function to calculate monthly average prices. It needs to support pandas, Polars, PySpark, DuckDB, PyArrow, Dask, and cuDF, without doing any conversion between libraries.

    ## Bad solution: just convert to pandas

    This kind of works, but:

    - It doesn't return to the user the same class they started with.
    - It kills lazy execution.
    - It kills GPU acceleration.
    - It forces pandas as a required dependency.
    """
    )
    return


@app.function
def monthly_aggregate_bad(user_df):
    if hasattr(user_df, "to_pandas"):
        df = user_df.to_pandas()
    elif hasattr(user_df, "toPandas"):
        df = user_df.toPandas()
    elif hasattr(user_df, "_to_pandas"):
        df = user_df._to_pandas()
    else:
        # Assume the input is already a pandas DataFrame
        df = user_df
    return df.resample("MS", on="date")[["price"]].mean()


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Unmaintainable solution: different branches for each library

    This works, but is unfeasibly difficult to test and maintain, especially when also factoring in API changes between different versions of the same library (e.g. pandas `1.*` vs pandas `2.*`).
    """
    )
    return


@app.cell
def _():
    import duckdb
    import pandas as pd
    import polars as pl
    import pyspark
    import pyspark.sql.functions as F

    def monthly_aggregate_unmaintainable(user_df):
        if isinstance(user_df, pd.DataFrame):
            result = user_df.resample("MS", on="date")[["price"]].mean()
        elif isinstance(user_df, pl.DataFrame):
            result = (
                user_df.group_by(pl.col("date").dt.truncate("1mo"))
                .agg(pl.col("price").mean())
                .sort("date")
            )
        elif isinstance(user_df, pyspark.sql.dataframe.DataFrame):
            result = (
                user_df.groupBy(F.date_trunc("month", F.col("date")))
                .agg(F.mean("price"))
                .orderBy("date")
            )
        elif isinstance(user_df, duckdb.DuckDBPyRelation):
            result = user_df.aggregate(
                [
                    duckdb.FunctionExpression(
                        "time_bucket",
                        duckdb.ConstantExpression("1 month"),
                        duckdb.FunctionExpression("date"),
                    ).alias("date"),
                    duckdb.FunctionExpression("mean", "price").alias("price"),
                ],
            ).sort("date")
        # TODO: more branches for PyArrow, Dask, etc... :sob:
        return result

    return duckdb, pd, pl


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Best solution: Narwhals as a unified dataframe interface

    - Preserves lazy execution and GPU acceleration.
    - Users get back what they started with.
    - Easy to write and maintain.
    - Strong and complete static typing.
    """
    )
    return


@app.cell
def _():
    import narwhals as nw
    from narwhals.typing import IntoFrameT

    def monthly_aggregate(user_df: IntoFrameT) -> IntoFrameT:
        return (
            nw.from_native(user_df)
            .group_by(nw.col("date").dt.truncate("1mo"))
            .agg(nw.col("price").mean())
            .sort("date")
            .to_native()
        )

    return (monthly_aggregate,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""## Demo: let's verify that it works!""")
    return


@app.cell
def _():
    from datetime import datetime

    data = {
        "date": [datetime(2020, 1, 1), datetime(2020, 1, 8), datetime(2020, 2, 3)],
        "price": [1, 4, 3],
    }
    return (data,)


@app.cell
def _(data, monthly_aggregate, pd):
    # pandas
    df_pd = pd.DataFrame(data)
    monthly_aggregate(df_pd)
    return (df_pd,)


@app.cell
def _(data, monthly_aggregate, pl):
    # Polars
    df_pl = pl.DataFrame(data)
    monthly_aggregate(df_pl)
    return


@app.cell
def _(duckdb, monthly_aggregate):
    # DuckDB
    rel = duckdb.sql(
        """
        from values (timestamp '2020-01-01', 1),
                    (timestamp '2020-01-08', 4),
                    (timestamp '2020-02-03', 3)
             df(date, price)
        select *
        """
    )
    monthly_aggregate(rel)
    return


@app.cell
def _(data, monthly_aggregate):
    # PyArrow
    import pyarrow as pa

    tbl = pa.table(data)
    monthly_aggregate(tbl)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Bonus - can we generate SQL?

    Narwhals comes with an extra bonus feature: by combining it with [SQLFrame](https://github.com/eakmanrq/sqlframe), we can easily transpile the Polars API to any major SQL dialect. For example, to translate to the Databricks SQL dialect, we can do:
    """
    )
    return


@app.cell
def _(df_pd, monthly_aggregate):
    from sqlframe.duckdb import DuckDBSession

    sqlframe = DuckDBSession()
    sqlframe_df = sqlframe.createDataFrame(df_pd)
    sqlframe_result = monthly_aggregate(sqlframe_df)
    print(sqlframe_result.sql(dialect="databricks"))
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()
```
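One claim in the notebook, that the Narwhals version preserves lazy execution, is easy to verify outside marimo. A minimal sketch, assuming the same narwhals and polars versions pinned in the notebook header:

```python
from datetime import datetime

import narwhals as nw
import polars as pl
from narwhals.typing import IntoFrameT


def monthly_aggregate(user_df: IntoFrameT) -> IntoFrameT:
    # Same dataframe-agnostic function as in the notebook above
    return (
        nw.from_native(user_df)
        .group_by(nw.col("date").dt.truncate("1mo"))
        .agg(nw.col("price").mean())
        .sort("date")
        .to_native()
    )


data = {
    "date": [datetime(2020, 1, 1), datetime(2020, 1, 8), datetime(2020, 2, 3)],
    "price": [1, 4, 3],
}

# Passing a Polars LazyFrame returns a Polars LazyFrame: nothing runs yet
lazy_result = monthly_aggregate(pl.LazyFrame(data))
print(type(lazy_result))  # <class 'polars.lazyframe.frame.LazyFrame'>

# Computation happens only at collect()
print(lazy_result.collect())
```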

pyproject.toml

Lines changed: 3 additions & 9 deletions
```diff
@@ -5,15 +5,9 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
-    "loguru>=0.7.3",
-    "marimo==0.13.6",
-    "narwhals==1.36.0",
-    "nbformat>=5.10.4",
-    "pandas>=2.2.3",
-    "pyspark[sql]>=3.5.5",
+    "marimo>=0.13.7",
+    "pre-commit>=4.2.0",
 ]
 
 [dependency-groups]
-dev = [
-    "pytest>=8.3.5",
-]
+dev = []
```
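Locally, a dependency change like this takes effect after re-syncing the environment, using the same command from the contribution guide:

```bash
# Re-resolve and install the trimmed dependency set
uv sync
```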

requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.
