Skip to content

Commit ede80e3

Browse files
add PySpark parameterized-query notebook
1 parent 09b14b0 commit ede80e3

File tree

6 files changed

+377
-59
lines changed

6 files changed

+377
-59
lines changed

.github/workflows/publish-marimo.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
uses: astral-sh/setup-uv@v5
2424

2525
- name: Install dependencies
26-
run: uv add marimo
26+
run: uv add "marimo==0.13.2" "narwhals==1.36.0"
2727

2828
- name: Export Marimo Notebook
2929
env:
@@ -32,7 +32,7 @@ jobs:
3232
uv run marimo export html data_science_tools/polars_vs_pandas.py -o build/data_science_tools/polars_vs_pandas.html --sandbox
3333
uv run marimo export html llm/pydantic_ai_examples.py -o build/llm/pydantic_ai_examples.html --sandbox
3434
uv run marimo export html data_science_tools/pandas_api_on_spark.py -o build/data_science_tools/pandas_api_on_spark.html --sandbox
35-
35+
uv run marimo export html data_science_tools/pyspark_parametrize.py -o build/data_science_tools/pyspark_parametrize.html --sandbox
3636
- name: Upload Pages Artifact
3737
uses: actions/upload-pages-artifact@v3
3838
with:

data_science_tools/pandas_api_on_spark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import marimo
1616

17-
__generated_with = "0.13.0"
17+
__generated_with = "0.13.4"
1818
app = marimo.App(width="medium")
1919

2020

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "marimo",
#     "pandas==2.2.3",
#     "pyspark==3.5.5",
# ]
# ///

# Marimo notebook demonstrating parameterized SQL queries in PySpark
# (f-string vs. {placeholder} formatting vs. named parameter markers).
import marimo

# Marimo version the notebook was generated with (managed by marimo itself).
__generated_with = "0.13.2"
app = marimo.App(width="medium")
14+
15+
16+
@app.cell(hide_code=True)
def _():
    # Expose marimo's runtime API as `mo` for the markdown cells below.
    import marimo as mo

    return (mo,)
21+
22+
23+
@app.cell(hide_code=True)
def _(mo):
    # Section header: notebook setup.
    mo.md(r"""## Set Up""")
    return
27+
28+
29+
@app.cell
def _():
    from datetime import date

    import pandas as pd
    from pyspark.sql import SparkSession

    # Reuse an active Spark session if one exists; otherwise start a local one.
    spark = SparkSession.builder.getOrCreate()
    return date, pd, spark
38+
39+
40+
@app.cell
def _(date, pd, spark):
    # Build a small pandas frame of item prices, then promote it to a
    # Spark DataFrame used by every query cell below.
    records = {
        "item_id": [1, 2, 3, 4],
        "price": [4, 2, 5, 1],
        "transaction_date": [
            date(2025, 1, 15),
            date(2025, 2, 1),
            date(2025, 3, 10),
            date(2025, 4, 22),
        ],
    }
    item_price = spark.createDataFrame(pd.DataFrame(records))
    item_price.show()
    return (item_price,)
59+
60+
61+
@app.cell(hide_code=True)
def _(mo):
    # Section header: the baseline f-string approach.
    mo.md(r"""## Traditional Query Approach""")
    return
65+
66+
67+
@app.cell
def _(item_price, spark):
    # Register the DataFrame as a temp view so plain SQL can name it.
    item_price.createOrReplaceTempView("item_price_view")
    transaction_date_str = "2025-02-15"

    # NOTE(review): f-string interpolation splices the value straight into
    # the SQL text. Harmless for this trusted literal, but injection-prone
    # for untrusted input — which is the motivation for the parameterized
    # variants demonstrated in the cells below.
    query_with_fstring = f"""SELECT *
    FROM item_price_view
    WHERE transaction_date > '{transaction_date_str}'
    """

    spark.sql(query_with_fstring).show()
    return (transaction_date_str,)
79+
80+
81+
@app.cell(hide_code=True)
def _(mo):
    # Section header: spark.sql keyword-argument templating.
    mo.md(r"""## Parameterized Queries with PySpark Custom String Formatting""")
    return
85+
86+
87+
@app.cell
def _(item_price, spark, transaction_date_str):
    # PySpark's own templating: {item_price} binds to a DataFrame and
    # {transaction_date} to a Python value, via spark.sql keyword arguments —
    # no temp view and no manual quoting required.
    parametrized_query = """SELECT *
    FROM {item_price}
    WHERE transaction_date > {transaction_date}
    """

    result = spark.sql(
        parametrized_query,
        item_price=item_price,
        transaction_date=transaction_date_str,
    )
    result.show()
    return
98+
99+
100+
@app.cell(hide_code=True)
def _(mo):
    # Section header: named parameter markers bound through args=.
    mo.md(r"""## Parameterized Queries with Parameter Markers""")
    return
104+
105+
106+
@app.cell
def _(date, item_price, spark):
    # Named markers (:transaction_date) are bound through the args mapping,
    # so Spark handles typing/quoting of the value — here a datetime.date.
    transaction_date = date(2025, 2, 15)

    query_with_markers = """SELECT *
    FROM {item_price}
    WHERE transaction_date > :transaction_date
    """

    spark.sql(
        query_with_markers,
        item_price=item_price,
        args={"transaction_date": transaction_date},
    ).show()
    return (query_with_markers,)
121+
122+
123+
@app.cell(hide_code=True)
def _(mo):
    # Section header: reusing one marker query with different bindings.
    mo.md(r"""## Make SQL Easier to Reuse""")
    return
127+
128+
129+
@app.cell
def _(date, item_price, query_with_markers, spark):
    # Same query text as before — only the bound date changes.
    transaction_date_1 = date(2025, 3, 9)
    bindings_1 = {"transaction_date": transaction_date_1}

    spark.sql(query_with_markers, item_price=item_price, args=bindings_1).show()
    return
139+
140+
141+
@app.cell
def _(date, item_price, query_with_markers, spark):
    # Second reuse of the identical query, bound to yet another date.
    transaction_date_2 = date(2025, 3, 15)
    bindings_2 = {"transaction_date": transaction_date_2}

    spark.sql(query_with_markers, item_price=item_price, args=bindings_2).show()
    return
151+
152+
153+
@app.cell(hide_code=True)
def _(mo):
    # Section header: parameterized queries make functions unit-testable.
    mo.md(r"""## Easier Unit Testing with Parameterized Queries""")
    return
157+
158+
159+
@app.cell
def _(spark):
    def filter_by_price_threshold(df, amount):
        # Both the relation ({df}) and the value (:amount) are parameterized,
        # so any DataFrame/threshold pair can be passed in — including test
        # fixtures, as the cell below demonstrates.
        query = "SELECT * from {df} where price > :amount"
        return spark.sql(query, df=df, args={"amount": amount})

    return (filter_by_price_threshold,)
167+
168+
169+
@app.cell
def test_query_return_correct_number_of_rows(filter_by_price_threshold, spark):
    # Arrange: a small in-memory DataFrame with known prices.
    rows = [
        ("Product 1", 10.0, 5),
        ("Product 2", 15.0, 3),
        ("Product 3", 8.0, 2),
    ]
    df = spark.createDataFrame(rows, ["name", "price", "quantity"])

    # Act + assert: only one product is priced strictly above 10,
    # and two are priced strictly above 8.
    assert filter_by_price_threshold(df, 10).count() == 1
    assert filter_by_price_threshold(df, 8).count() == 2
    return
185+
186+
187+
@app.cell(hide_code=True)
def imports():
    # pytest must be importable for marimo's test runner to collect and
    # execute the test_* cell above; the name itself is unused here.
    import pytest

    return
192+
193+
194+
if __name__ == "__main__":
    # Run the notebook app when executed as a script.
    app.run()

llm/pydantic_ai_examples.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@
1313
# "pydantic==2.11.3",
1414
# "pydantic-ai==0.1.4",
1515
# "pydantic-ai-slim[duckduckgo]==0.1.4",
16+
# "requests==2.32.3",
1617
# ]
1718
# ///
1819

1920
import marimo
2021

21-
__generated_with = "0.13.0"
22+
__generated_with = "0.13.4"
2223
app = marimo.App(width="medium")
2324

2425

@@ -45,13 +46,13 @@ def _():
4546
api_key=os.environ.get("OPENAI_API_KEY"),
4647
)
4748

48-
response = client.responses.create(
49+
openai_response = client.responses.create(
4950
model="gpt-4o-mini-2024-07-18",
5051
instructions="Extract name, years of experience, and primary skill from the job applicant description.",
5152
input="Khuyen Tran is a data scientist with 5 years of experience, skilled in Python and machine learning.",
5253
)
5354

54-
print(response.output_text)
55+
print(openai_response.output_text)
5556
return
5657

5758

@@ -144,7 +145,6 @@ def _(Agent, UnemploymentDataSource):
144145
search_agent = Agent(
145146
"gpt-4o-mini-2024-07-18",
146147
tools=[duckduckgo_search_tool()],
147-
system_prompt="Search DuckDuckGo and return links or resources that match the query.",
148148
output_type=UnemploymentDataSource,
149149
)
150150

pyproject.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,14 @@ readme = "README.md"
66
requires-python = ">=3.11"
77
dependencies = [
88
"loguru>=0.7.3",
9-
"marimo>=0.13.0",
9+
"marimo==0.13.2",
10+
"narwhals==1.36.0",
1011
"nbformat>=5.10.4",
1112
"pandas>=2.2.3",
13+
"pyspark[sql]>=3.5.5",
14+
]
15+
16+
[dependency-groups]
17+
dev = [
18+
"pytest>=8.3.5",
1219
]

0 commit comments

Comments
 (0)