|
2289 | 2289 | "[Link to Delta Lake](https://github.com/delta-io/delta)."
|
2290 | 2290 | ]
|
2291 | 2291 | },
|
| 2292 | + { |
| 2293 | + "cell_type": "markdown", |
| 2294 | + "id": "6a202591", |
| 2295 | + "metadata": {}, |
| 2296 | + "source": [ |
| 2297 | + "### From Complex SQL to Simple Merges: Delta Lake's Upsert Solution" |
| 2298 | + ] |
| 2299 | + }, |
| 2300 | + { |
| 2301 | + "cell_type": "code", |
| 2302 | + "execution_count": null, |
| 2303 | + "id": "e655b5fa", |
| 2304 | + "metadata": { |
| 2305 | + "tags": [ |
| 2306 | + "hide-cell" |
| 2307 | + ] |
| 2308 | + }, |
| 2309 | + "outputs": [], |
| 2310 | + "source": [ |
| 2311 | + "!pip install delta-spark" |
| 2312 | + ] |
| 2313 | + }, |
| 2314 | + { |
| 2315 | + "cell_type": "code", |
| 2316 | + "execution_count": null, |
| 2317 | + "id": "32ae71e5", |
| 2318 | + "metadata": { |
| 2319 | + "tags": [ |
| 2320 | + "remove-cell" |
| 2321 | + ] |
| 2322 | + }, |
| 2323 | + "outputs": [], |
| 2324 | + "source": [ |
| 2325 | + "import pyspark\n", |
| 2326 | + "from delta import configure_spark_with_delta_pip, DeltaTable\n", |
| 2327 | + "\n", |
| 2328 | + "# Configure Spark to use Delta\n", |
| 2329 | + "builder = (\n", |
| 2330 | + " pyspark.sql.SparkSession.builder.appName(\"MyApp\")\n", |
| 2331 | + " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n", |
| 2332 | + " .config(\n", |
| 2333 | + " \"spark.sql.catalog.spark_catalog\",\n", |
| 2334 | + " \"org.apache.spark.sql.delta.catalog.DeltaCatalog\",\n", |
| 2335 | + " )\n", |
| 2336 | + ")\n", |
| 2337 | + "\n", |
| 2338 | + "spark = configure_spark_with_delta_pip(builder).getOrCreate()" |
| 2339 | + ] |
| 2340 | + }, |
| 2341 | + { |
| 2342 | + "cell_type": "markdown", |
| 2343 | + "id": "775dcae5", |
| 2344 | + "metadata": {}, |
| 2345 | + "source": [ |
| 2346 | + "Traditionally, implementing upsert (update or insert) logic requires separate UPDATE and INSERT statements or complex SQL. This approach can be error-prone and inefficient, especially for large datasets. \n", |
| 2347 | + "\n", |
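| + "For contrast, here's a minimal sketch of that two-step pattern with plain Spark DataFrames. It's illustrative only: `customers` and `updates` are the DataFrames created below, and `customers_parquet` is a hypothetical path holding a plain Parquet copy of the table.\n", |
| + "\n", |
| + "```python\n", |
| + "from pyspark.sql import functions as F\n", |
| + "\n", |
| + "# Step 1: keep the target rows that have no incoming update\n", |
| + "unchanged = customers.join(updates, on=\"customer_id\", how=\"left_anti\")\n", |
| + "\n", |
| + "# Step 2: stamp the incoming rows, then rewrite the entire table\n", |
| + "incoming = updates.withColumn(\"last_updated\", F.current_timestamp().cast(\"string\"))\n", |
| + "unchanged.unionByName(incoming).write.format(\"parquet\").mode(\"overwrite\").save(\n", |
| + "    \"customers_parquet\"  # hypothetical output path\n", |
| + ")\n", |
| + "```\n", |
| + "\n", |
| + "Every batch costs a join plus a full rewrite of the table, and a failure mid-overwrite can leave the data corrupted.\n", |
| + "\n", |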
| 2348 | + "Delta Lake's merge operation solves this problem by allowing you to specify different actions for matching and non-matching records in a single, declarative statement.\n", |
| 2349 | + "\n", |
| 2350 | + "Here's an example that demonstrates the power and simplicity of Delta Lake's merge operation.\n", |
| 2351 | + "\n", |
| 2352 | + "First, let's set up our initial data:" |
| 2355 | + ] |
| 2356 | + }, |
| 2357 | + { |
| 2358 | + "cell_type": "code", |
| 2359 | + "execution_count": 12, |
| 2360 | + "id": "ff393032", |
| 2361 | + "metadata": {}, |
| 2362 | + "outputs": [ |
| 2363 | + { |
| 2364 | + "name": "stdout", |
| 2365 | + "output_type": "stream", |
| 2366 | + "text": [ |
| 2367 | + "Initial Customers:\n", |
| 2368 | + "+-----------+-----------+----------------+-------------------+\n", |
| 2369 | + "|customer_id| name| email| last_updated|\n", |
| 2370 | + "+-----------+-----------+----------------+-------------------+\n", |
| 2371 | + "| 1| John Doe|john@example.com|2023-01-01 10:00:00|\n", |
| 2372 | + "| 2| Jane Smith|jane@example.com|2023-01-02 11:00:00|\n", |
| 2373 | + "| 3|Bob Johnson| bob@example.com|2023-01-03 12:00:00|\n", |
| 2374 | + "+-----------+-----------+----------------+-------------------+\n", |
| 2375 | + "\n", |
| 2376 | + "Updates:\n", |
| 2377 | + "+-----------+-----------+--------------------+\n", |
| 2378 | + "|customer_id| name| email|\n", |
| 2379 | + "+-----------+-----------+--------------------+\n", |
| 2380 | + "| 2| Jane Doe|jane.doe@example.com|\n", |
| 2381 | + "| 3|Bob Johnson| bob@example.com|\n", |
| 2382 | + "| 4|Alice Brown| alice@example.com|\n", |
| 2383 | + "+-----------+-----------+--------------------+\n", |
| 2384 | + "\n" |
| 2385 | + ] |
| 2386 | + } |
| 2387 | + ], |
| 2388 | + "source": [ |
| 2389 | + "# Create sample data for 'customers' DataFrame\n", |
| 2390 | + "customers_data = [\n", |
| 2391 | + " (1, \"John Doe\", \"john@example.com\", \"2023-01-01 10:00:00\"),\n", |
| 2392 | + " (2, \"Jane Smith\", \"jane@example.com\", \"2023-01-02 11:00:00\"),\n", |
| 2393 | + " (3, \"Bob Johnson\", \"bob@example.com\", \"2023-01-03 12:00:00\"),\n", |
| 2394 | + "]\n", |
| 2395 | + "customers = spark.createDataFrame(\n", |
| 2396 | + " customers_data, [\"customer_id\", \"name\", \"email\", \"last_updated\"]\n", |
| 2397 | + ")\n", |
| 2398 | + "\n", |
| 2399 | + "# Create sample data for 'updates' DataFrame\n", |
| 2400 | + "updates_data = [\n", |
| 2401 | + " (2, \"Jane Doe\", \"jane.doe@example.com\"), # Existing customer with updates\n", |
| 2402 | + " (3, \"Bob Johnson\", \"bob@example.com\"), # Existing customer without changes\n", |
| 2403 | + " (4, \"Alice Brown\", \"alice@example.com\"), # New customer\n", |
| 2404 | + "]\n", |
| 2405 | + "updates = spark.createDataFrame(updates_data, [\"customer_id\", \"name\", \"email\"])\n", |
| 2406 | + "\n", |
| 2407 | + "# Show the initial data\n", |
| 2408 | + "print(\"Initial Customers:\")\n", |
| 2409 | + "customers.show()\n", |
| 2410 | + "print(\"Updates:\")\n", |
| 2411 | + "updates.show()" |
| 2412 | + ] |
| 2413 | + }, |
| 2414 | + { |
| 2415 | + "cell_type": "markdown", |
| 2416 | + "id": "acb9e489", |
| 2417 | + "metadata": {}, |
| 2418 | + "source": [ |
| 2419 | + "Next, we create a Delta table from our initial customer data:" |
| 2420 | + ] |
| 2421 | + }, |
| 2422 | + { |
| 2423 | + "cell_type": "code", |
| 2424 | + "execution_count": 13, |
| 2425 | + "id": "0041f1d4", |
| 2426 | + "metadata": {}, |
| 2427 | + "outputs": [ |
| 2428 | + { |
| 2429 | + "name": "stdout", |
| 2430 | + "output_type": "stream", |
| 2431 | + "text": [ |
| 2432 | + "Customers Delta Table created successfully\n" |
| 2433 | + ] |
| 2434 | + } |
| 2449 | + ], |
| 2450 | + "source": [ |
| 2451 | + "# Define the path where you want to save the Delta table\n", |
| 2452 | + "delta_table_path = \"customers_delta\"\n", |
| 2453 | + "\n", |
| 2454 | + "# Write the DataFrame as a Delta table\n", |
| 2455 | + "customers.write.format(\"delta\").mode(\"overwrite\").save(delta_table_path)\n", |
| 2456 | + "\n", |
| 2457 | + "# Create a DeltaTable object\n", |
| 2458 | + "customers_delta = DeltaTable.forPath(spark, delta_table_path)\n", |
| 2459 | + "\n", |
| 2460 | + "print(\"Customers Delta Table created successfully\")" |
| 2461 | + ] |
| 2462 | + }, |
| 2463 | + { |
| 2464 | + "cell_type": "markdown", |
| 2465 | + "id": "560b2a9d", |
| 2466 | + "metadata": {}, |
| 2467 | + "source": [ |
| 2468 | + "Now for the key part: a single merge operation that handles both updates and inserts:" |
| 2469 | + ] |
| 2470 | + }, |
| 2471 | + { |
| 2472 | + "cell_type": "code", |
| 2473 | + "execution_count": 14, |
| 2474 | + "id": "f0626375", |
| 2475 | + "metadata": {}, |
| 2476 | + "outputs": [], |
| 2485 | + "source": [ |
| 2486 | + "# Assume 'customers_delta' is your target table and 'updates' is your source of new data\n", |
| 2487 | + "customers_delta.alias(\"target\").merge(\n", |
| 2488 | + " updates.alias(\"source\"),\n", |
| 2489 | + " \"target.customer_id = source.customer_id\"\n", |
| 2490 | + ").whenMatchedUpdate(set={\n", |
| 2491 | + " \"name\": \"source.name\",\n", |
| 2492 | + " \"email\": \"source.email\",\n", |
| 2493 | + " \"last_updated\": \"current_timestamp()\"\n", |
| 2494 | + "}).whenNotMatchedInsert(values={\n", |
| 2495 | + " \"customer_id\": \"source.customer_id\",\n", |
| 2496 | + " \"name\": \"source.name\",\n", |
| 2497 | + " \"email\": \"source.email\",\n", |
| 2498 | + " \"last_updated\": \"current_timestamp()\"\n", |
| 2499 | + "}).execute()" |
| 2500 | + ] |
| 2501 | + }, |
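| + { |
| + "cell_type": "markdown", |
| + "id": "9c4e7b31", |
| + "metadata": {}, |
| + "source": [ |
| + "The same upsert can also be expressed in SQL. Here's a sketch using Delta's path-based table syntax; the one extra step is registering the `updates` DataFrame as a temporary view (the view name is our choice):\n", |
| + "\n", |
| + "```python\n", |
| + "updates.createOrReplaceTempView(\"updates\")\n", |
| + "\n", |
| + "spark.sql(\"\"\"\n", |
| + "    MERGE INTO delta.`customers_delta` AS target\n", |
| + "    USING updates AS source\n", |
| + "    ON target.customer_id = source.customer_id\n", |
| + "    WHEN MATCHED THEN UPDATE SET\n", |
| + "        target.name = source.name,\n", |
| + "        target.email = source.email,\n", |
| + "        target.last_updated = current_timestamp()\n", |
| + "    WHEN NOT MATCHED THEN INSERT (customer_id, name, email, last_updated)\n", |
| + "    VALUES (source.customer_id, source.name, source.email, current_timestamp())\n", |
| + "\"\"\")\n", |
| + "```" |
| + ] |
| + }, |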
| 2502 | + { |
| 2503 | + "cell_type": "code", |
| 2504 | + "execution_count": 15, |
| 2505 | + "id": "0ed114dc", |
| 2506 | + "metadata": {}, |
| 2507 | + "outputs": [ |
| 2508 | + { |
| 2509 | + "name": "stdout", |
| 2510 | + "output_type": "stream", |
| 2511 | + "text": [ |
| 2512 | + "Updated Customers Delta Table:\n" |
| 2513 | + ] |
| 2514 | + }, |
| 2522 | + { |
| 2523 | + "name": "stdout", |
| 2524 | + "output_type": "stream", |
| 2525 | + "text": [ |
| 2526 | + "+-----------+-----------+--------------------+--------------------+\n", |
| 2527 | + "|customer_id| name| email| last_updated|\n", |
| 2528 | + "+-----------+-----------+--------------------+--------------------+\n", |
| 2529 | + "| 2| Jane Doe|jane.doe@example.com|2024-08-20 16:05:...|\n", |
| 2530 | + "| 3|Bob Johnson| bob@example.com|2024-08-20 16:05:...|\n", |
| 2531 | + "| 4|Alice Brown| alice@example.com|2024-08-20 16:05:...|\n", |
| 2532 | + "| 1| John Doe| john@example.com| 2023-01-01 10:00:00|\n", |
| 2533 | + "+-----------+-----------+--------------------+--------------------+\n", |
| 2534 | + "\n" |
| 2535 | + ] |
| 2536 | + } |
| 2537 | + ], |
| 2538 | + "source": [ |
| 2539 | + "# Verify the updated data\n", |
| 2540 | + "print(\"Updated Customers Delta Table:\")\n", |
| 2541 | + "customers_delta.toDF().show()" |
| 2542 | + ] |
| 2543 | + }, |
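| + { |
| + "cell_type": "markdown", |
| + "id": "b7d41c9e", |
| + "metadata": {}, |
| + "source": [ |
| + "Notice that customer 3 arrived with no real changes, yet the merge still rewrote that row and bumped its `last_updated`. `whenMatchedUpdate` also accepts an optional `condition` argument, so a sketch like this would update only the rows whose values actually differ:\n", |
| + "\n", |
| + "```python\n", |
| + "customers_delta.alias(\"target\").merge(\n", |
| + "    updates.alias(\"source\"),\n", |
| + "    \"target.customer_id = source.customer_id\"\n", |
| + ").whenMatchedUpdate(\n", |
| + "    # Skip matched rows that carry no actual changes\n", |
| + "    condition=\"target.name <> source.name OR target.email <> source.email\",\n", |
| + "    set={\n", |
| + "        \"name\": \"source.name\",\n", |
| + "        \"email\": \"source.email\",\n", |
| + "        \"last_updated\": \"current_timestamp()\"\n", |
| + "    }\n", |
| + ").whenNotMatchedInsert(\n", |
| + "    values={\n", |
| + "        \"customer_id\": \"source.customer_id\",\n", |
| + "        \"name\": \"source.name\",\n", |
| + "        \"email\": \"source.email\",\n", |
| + "        \"last_updated\": \"current_timestamp()\"\n", |
| + "    }\n", |
| + ").execute()\n", |
| + "```" |
| + ] |
| + }, |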
2292 | 2544 | {
|
2293 | 2545 | "attachments": {},
|
2294 | 2546 | "cell_type": "markdown",
|
|
4178 | 4430 | "name": "python",
|
4179 | 4431 | "nbconvert_exporter": "python",
|
4180 | 4432 | "pygments_lexer": "ipython3",
|
4181 |
| - "version": "3.11.4" |
| 4433 | + "version": "3.11.6" |
4182 | 4434 | },
|
4183 | 4435 | "toc": {
|
4184 | 4436 | "base_numbering": 1,
|
|