|
1612 | 1612 | "\n",
1613 | 1613 | "new_df.show()"
1614 | 1614 | ]
| 1615 | + }, |
| 1616 | + { |
| 1617 | + "cell_type": "markdown", |
| 1618 | + "id": "afb930f8", |
| 1619 | + "metadata": {}, |
| 1620 | + "source": [ |
| 1621 | + "### Vectorized Operations in PySpark: pandas_udf vs Standard UDF" |
| 1622 | + ] |
| 1623 | + }, |
| 1624 | + { |
| 1625 | + "cell_type": "code", |
| 1626 | + "execution_count": null, |
| 1627 | + "id": "619e824b", |
| 1628 | + "metadata": { |
| 1629 | + "tags": [ |
| 1630 | + "hide-cell" |
| 1631 | + ] |
| 1632 | + }, |
| 1633 | + "outputs": [], |
| 1634 | + "source": [ |
| 1635 | + "!pip install -U pyspark" |
| 1636 | + ] |
| 1637 | + }, |
| 1638 | + { |
| 1639 | + "cell_type": "code", |
| 1640 | + "execution_count": null, |
| 1641 | + "id": "40c80193", |
| 1642 | + "metadata": {}, |
| 1643 | + "outputs": [], |
| 1644 | + "source": [ |
| 1645 | + "from pyspark.sql import SparkSession\n", |
| 1646 | + "\n", |
| 1647 | + "# Create SparkSession\n", |
| 1648 | + "spark = SparkSession.builder.getOrCreate()" |
| 1649 | + ] |
| 1650 | + }, |
| 1651 | + { |
| 1652 | + "cell_type": "markdown", |
| 1653 | + "id": "94329cf7", |
| 1654 | + "metadata": {}, |
| 1655 | + "source": [ |
| 1656 | + "Standard UDF functions process data row-by-row, resulting in Python function call overhead. \n", |
| 1657 | + "\n", |
| 1658 | + "In contrast, pandas_udf utilizes Pandas' vectorized operations to process entire columns in a single operation, significantly improving performance." |
| 1659 | + ] |
| 1660 | + }, |
| 1661 | + { |
| 1662 | + "cell_type": "code", |
| 1663 | + "execution_count": 3, |
| 1664 | + "id": "a4633f44", |
| 1665 | + "metadata": {}, |
| 1666 | + "outputs": [ |
| 1674 | + { |
| 1675 | + "name": "stdout", |
| 1676 | + "output_type": "stream", |
| 1677 | + "text": [ |
| 1678 | + "+----+\n", |
| 1679 | + "|val1|\n", |
| 1680 | + "+----+\n", |
| 1681 | + "| 1.0|\n", |
| 1682 | + "| 2.0|\n", |
| 1683 | + "| 3.0|\n", |
| 1684 | + "| 4.0|\n", |
| 1685 | + "+----+\n", |
| 1686 | + "\n" |
| 1687 | + ] |
| 1688 | + } |
| 1689 | + ], |
| 1690 | + "source": [ |
| 1691 | + "# Sample DataFrame\n", |
| 1692 | + "data = [(1.0,), (2.0,), (3.0,), (4.0,)]\n", |
| 1693 | + "df = spark.createDataFrame(data, [\"val1\"])\n", |
| 1694 | + "\n", |
| 1695 | + "df.show()" |
| 1696 | + ] |
| 1697 | + }, |
| 1698 | + { |
| 1699 | + "cell_type": "code", |
| 1700 | + "execution_count": 4, |
| 1701 | + "id": "fcf0cdf9", |
| 1702 | + "metadata": {}, |
| 1703 | + "outputs": [ |
| 1711 | + { |
| 1712 | + "name": "stdout", |
| 1713 | + "output_type": "stream", |
| 1714 | + "text": [ |
| 1715 | + "+----+----+\n", |
| 1716 | + "|val1|val2|\n", |
| 1717 | + "+----+----+\n", |
| 1718 | + "| 1.0| 2.0|\n", |
| 1719 | + "| 2.0| 3.0|\n", |
| 1720 | + "| 3.0| 4.0|\n", |
| 1721 | + "| 4.0| 5.0|\n", |
| 1722 | + "+----+----+\n", |
| 1723 | + "\n" |
| 1724 | + ] |
| 1725 | + } |
| 1726 | + ], |
| 1727 | + "source": [ |
| 1728 | + "from pyspark.sql.functions import udf\n", |
| 1729 | + "\n", |
| 1730 | + "# Standard UDF\n", |
| 1731 | + "@udf('double')\n", |
| 1732 | + "def plus_one(val):\n", |
| 1733 | + " return val + 1\n", |
| 1734 | + "\n", |
| 1735 | + "# Apply the Standard UDF\n", |
| 1736 | + "df.withColumn('val2', plus_one(df.val1)).show()" |
| 1737 | + ] |
| 1738 | + }, |
| 1739 | + { |
| 1740 | + "cell_type": "code", |
| 1741 | + "execution_count": 8, |
| 1742 | + "id": "e1ec8b2b", |
| 1743 | + "metadata": {}, |
| 1744 | + "outputs": [ |
| 1752 | + { |
| 1753 | + "name": "stdout", |
| 1754 | + "output_type": "stream", |
| 1755 | + "text": [ |
| 1756 | + "+----+----+\n", |
| 1757 | + "|val1|val2|\n", |
| 1758 | + "+----+----+\n", |
| 1759 | + "| 1.0| 2.0|\n", |
| 1760 | + "| 2.0| 3.0|\n", |
| 1761 | + "| 3.0| 4.0|\n", |
| 1762 | + "| 4.0| 5.0|\n", |
| 1763 | + "+----+----+\n", |
| 1764 | + "\n" |
| 1765 | + ] |
| 1766 | + } |
| 1767 | + ], |
| 1768 | + "source": [ |
| 1769 | + "from pyspark.sql.functions import pandas_udf\n", |
| 1770 | + "import pandas as pd\n", |
| 1771 | + "\n", |
| 1772 | + "\n", |
| 1773 | + "# Pandas UDF\n", |
| 1774 | + "@pandas_udf(\"double\")\n", |
| 1775 | + "def pandas_plus_one(val: pd.Series) -> pd.Series:\n", |
| 1776 | + " return val + 1\n", |
| 1777 | + "\n", |
| 1778 | + "\n", |
| 1779 | + "# Apply the Pandas UDF\n", |
| 1780 | + "df.withColumn(\"val2\", pandas_plus_one(df.val1)).show()" |
| 1781 | + ] |
| 1782 | + }, |
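|  | + { |
|  | + "cell_type": "markdown", |
|  | + "id": "3f9d2a11", |
|  | + "metadata": {}, |
|  | + "source": [ |
|  | + "To make the performance difference concrete, the next cell is a rough timing sketch added for illustration rather than part of the original example: it builds a larger DataFrame and evaluates both UDFs over it. The `big_df` and `time_udf` names and the one-million-row size are assumptions; absolute timings depend on your machine and Spark configuration." |
|  | + ] |
|  | + }, |
|  | + { |
|  | + "cell_type": "code", |
|  | + "execution_count": null, |
|  | + "id": "7c5e0b42", |
|  | + "metadata": {}, |
|  | + "outputs": [], |
|  | + "source": [ |
|  | + "import time\n", |
|  | + "\n", |
|  | + "from pyspark.sql.functions import col\n", |
|  | + "\n", |
|  | + "# A larger DataFrame (illustrative size) makes per-row overhead visible\n", |
|  | + "big_df = spark.range(1_000_000).select(col(\"id\").cast(\"double\").alias(\"val1\"))\n", |
|  | + "\n", |
|  | + "\n", |
|  | + "def time_udf(fn, label):\n", |
|  | + " # Aggregate the result so Spark evaluates the UDF on every row\n", |
|  | + " start = time.perf_counter()\n", |
|  | + " big_df.select(fn(col(\"val1\")).alias(\"out\")).agg({\"out\": \"sum\"}).collect()\n", |
|  | + " print(f\"{label}: {time.perf_counter() - start:.2f}s\")\n", |
|  | + "\n", |
|  | + "\n", |
|  | + "# The pandas_udf run is typically faster here, though numbers vary\n", |
|  | + "time_udf(plus_one, \"standard UDF\")\n", |
|  | + "time_udf(pandas_plus_one, \"pandas_udf\")" |
|  | + ] |
|  | + }, |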
| 1783 | + { |
| 1784 | + "cell_type": "markdown", |
| 1785 | + "id": "1ab94b5b", |
| 1786 | + "metadata": {}, |
| 1787 | + "source": [ |
| 1788 | + "[Learn more about pandas_udf](https://bit.ly/4aRBNTX)." |
| 1789 | + ] |
1615 | 1790 | }
1616 | 1791 | ],
1617 | 1792 | "metadata": {