From 5eb4734a96ed73639c59f309105051172b5c1605 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 19 Mar 2025 09:05:18 +0100 Subject: [PATCH 1/9] Fix Papermill CI --- spark/notebooks/PyIceberg - Getting Started.ipynb | 5 +++-- spark/requirements.txt | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/spark/notebooks/PyIceberg - Getting Started.ipynb b/spark/notebooks/PyIceberg - Getting Started.ipynb index e58f19c..daf65ee 100644 --- a/spark/notebooks/PyIceberg - Getting Started.ipynb +++ b/spark/notebooks/PyIceberg - Getting Started.ipynb @@ -260,7 +260,8 @@ "%config SqlMagic.autopandas = True\n", "%config SqlMagic.feedback = False\n", "%config SqlMagic.displaycon = False\n", - "%sql duckdb:///:memory:" + "%sql duckdb:///:memory:\n", + "%sql set python_scan_all_frames=true" ] }, { @@ -361,7 +362,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/spark/requirements.txt b/spark/requirements.txt index 53cac49..0943494 100644 --- a/spark/requirements.txt +++ b/spark/requirements.txt @@ -1,7 +1,8 @@ jupyter==1.1.1 spylon-kernel==0.4.1 pyiceberg[pyarrow,duckdb,pandas]==0.7.1 -jupysql==0.10.5 +jupysql==0.11.0 matplotlib==3.9.2 scipy==1.14.1 duckdb-engine==0.13.1 +grpcio==1.71.0 From bfbeeaa2a02c15976acf065a0da9f1f03b4bdd16 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 19 Mar 2025 09:32:14 +0100 Subject: [PATCH 2/9] Give this a try --- spark/spark-defaults.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/spark/spark-defaults.conf b/spark/spark-defaults.conf index 42b6c73..a94ad2b 100755 --- a/spark/spark-defaults.conf +++ b/spark/spark-defaults.conf @@ -31,3 +31,4 @@ spark.eventLog.enabled true spark.eventLog.dir /home/iceberg/spark-events spark.history.fs.logDirectory /home/iceberg/spark-events spark.sql.catalogImplementation in-memory +spark.executor.extraJavaOptions "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED" From 3f2443d95d527b1b0756f596b9d86f1b20dc5858 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 19 Mar 2025 09:35:08 +0100 Subject: [PATCH 3/9] pycolor --- spark/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/spark/requirements.txt b/spark/requirements.txt index 0943494..72c455d 100644 --- a/spark/requirements.txt +++ b/spark/requirements.txt @@ -6,3 +6,4 @@ matplotlib==3.9.2 scipy==1.14.1 duckdb-engine==0.13.1 grpcio==1.71.0 +pycolor From 7db08b4f0f6bba532359ac1385539109af178fbb Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 19 Mar 2025 09:47:38 +0100 Subject: [PATCH 4/9] Second attempt --- spark/entrypoint.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh index 2738b59..b2d3681 100755 --- a/spark/entrypoint.sh +++ b/spark/entrypoint.sh @@ -17,6 +17,8 @@ # specific language governing permissions and limitations # under the License.
+export JAVA_OPTS='--add-exports java.base/sun.nio.ch=ALL-UNNAMED' + start-master.sh -p 7077 start-worker.sh spark://spark-iceberg:7077 start-history-server.sh From ef9602025e2aae3a87db8e971da9dbde4e637e76 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 19 Mar 2025 09:53:57 +0100 Subject: [PATCH 5/9] I like this more --- spark/Dockerfile | 2 ++ spark/spark-defaults.conf | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/spark/Dockerfile b/spark/Dockerfile index d8cb9fb..346cfac 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -117,6 +117,8 @@ RUN mkdir -p /root/.ipython/profile_default/startup COPY ipython/startup/00-prettytables.py /root/.ipython/profile_default/startup COPY ipython/startup/README /root/.ipython/profile_default/startup +ENV JAVA_OPTS="--add-exports java.base/sun.nio.ch=ALL-UNNAMED" + COPY spark-defaults.conf /opt/spark/conf ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" diff --git a/spark/spark-defaults.conf b/spark/spark-defaults.conf index a94ad2b..42b6c73 100755 --- a/spark/spark-defaults.conf +++ b/spark/spark-defaults.conf @@ -31,4 +31,3 @@ spark.eventLog.enabled true spark.eventLog.dir /home/iceberg/spark-events spark.history.fs.logDirectory /home/iceberg/spark-events spark.sql.catalogImplementation in-memory -spark.executor.extraJavaOptions "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED" From ad9b85821f16558fa02c88ea210e7d496b7605d9 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 19 Mar 2025 10:00:59 +0100 Subject: [PATCH 6/9] MOAR --- spark/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/Dockerfile b/spark/Dockerfile index 346cfac..5f2bb29 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -117,7 +117,7 @@ RUN mkdir -p /root/.ipython/profile_default/startup COPY ipython/startup/00-prettytables.py /root/.ipython/profile_default/startup COPY ipython/startup/README /root/.ipython/profile_default/startup -ENV JAVA_OPTS="--add-exports java.base/sun.nio.ch=ALL-UNNAMED" +ENV JAVA_OPTS="--add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED" COPY spark-defaults.conf /opt/spark/conf ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" From 497dbd9ebc600236594a983ffabbfb2beff99431 Mon Sep 17 00:00:00 2001 From: Fokko Date: Wed, 19 Mar 2025 11:11:35 +0100 Subject: [PATCH 7/9] Try this --- spark/entrypoint.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh index b2d3681..cae7905 100755 --- a/spark/entrypoint.sh +++ b/spark/entrypoint.sh @@ -17,10 +17,10 @@ # specific language governing permissions and limitations # under the License.
-export JAVA_OPTS='--add-exports java.base/sun.nio.ch=ALL-UNNAMED' - -start-master.sh -p 7077 -start-worker.sh spark://spark-iceberg:7077 +start-master.sh --conf "spark.driver.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" \ + --conf "spark.executor.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" -p 7077 +start-worker.sh --conf "spark.driver.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" \ + --conf "spark.executor.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" spark://spark-iceberg:7077 start-history-server.sh start-thriftserver.sh --driver-java-options "-Dderby.system.home=/tmp/derby" From 1a1087761f24bd0a72fa618408f5c02418278440 Mon Sep 17 00:00:00 2001 From: Fokko Date: Thu, 24 Apr 2025 09:43:15 +0200 Subject: [PATCH 8/9] Cleanup --- .github/workflows/check-notebooks.yml | 2 +- ...Introduction to the Iceberg Java API.ipynb | 2 + spark/notebooks/PyIceberg - Commits.ipynb | 385 +----------------- spark/requirements.txt | 3 +- spark/spark-defaults.conf | 2 + 5 files changed, 24 insertions(+), 370 deletions(-) diff --git a/.github/workflows/check-notebooks.yml b/.github/workflows/check-notebooks.yml index 64e357c..359d51a 100644 --- a/.github/workflows/check-notebooks.yml +++ b/.github/workflows/check-notebooks.yml @@ -28,4 +28,4 @@ jobs: run: docker exec $(docker ps -q --filter expose=8888) pip3 install papermill - name: Run notebooks - run: ls spark/notebooks | xargs -I {} docker exec $(docker ps -q --filter expose=8888) papermill "/home/iceberg/notebooks/{}" + run: ls spark/notebooks | xargs -I {} docker exec -e JAVA_OPTS="--add-exports java.base/sun.nio.ch=ALL-UNNAMED" $(docker ps -q --filter expose=8888) papermill "/home/iceberg/notebooks/{}" diff --git a/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb b/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb index 20af42b..a974cac 100644 --- a/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb +++ b/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb @@ -163,6 +163,8 @@ " .config(\"spark.eventLog.enabled\", \"true\")\n", " .config(\"spark.eventLog.dir\", \"/home/iceberg/spark-events\")\n", " .config(\"spark.history.fs.logDirectory\", \"/home/iceberg/spark-events\")\n", + " .config(\"spark.driver.extraJavaOptions\", \"--add-exports java.base/sun.nio.ch=ALL-UNNAMED\")\n", + " .config(\"spark.executor.extraJavaOptions\", \"--add-exports java.base/sun.nio.ch=ALL-UNNAMED\")\n", " .getOrCreate();\n", "\n", "spark.sparkContext().setLogLevel(\"ERROR\");" diff --git a/spark/notebooks/PyIceberg - Commits.ipynb b/spark/notebooks/PyIceberg - Commits.ipynb index e3d5dcb..53624e3 100644 --- a/spark/notebooks/PyIceberg - Commits.ipynb +++ b/spark/notebooks/PyIceberg - Commits.ipynb @@ -18,21 +18,10 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "6a5c8206", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.7.1'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from pyiceberg import __version__\n", "\n", @@ -51,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "47645b52", "metadata": {}, "outputs": [], @@ -73,30 +62,10 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "47e5a21d-de87-4aaf-aa06-dc5048acba58", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "commits(\n", - " 1: 
id: required long,\n", - " 2: name: required string,\n", - " 3: state: required string,\n", - " 4: additions: required long,\n", - " 5: deletes: required long\n", - "),\n", - "partition by: [],\n", - "sort order: [],\n", - "snapshot: null" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table_name = \"default.commits\"\n", "\n", @@ -134,32 +103,10 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "9fddb808", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "id: int64 not null\n", - "name: large_string not null\n", - "state: large_string not null\n", - "additions: int64 not null\n", - "deletes: int64 not null\n", - "----\n", - "id: [[123,234,345]]\n", - "name: [[\"Fix bug\",\"Add VariantType\",\"Add commit retries\"]]\n", - "state: [[\"Open\",\"Open\",\"Open\"]]\n", - "additions: [[22,29123,22]]\n", - "deletes: [[10,302,10]]" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import pyarrow as pa\n", "\n", @@ -191,83 +138,14 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "id": "efee8252", "metadata": { "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamestateadditionsdeletes
0123Fix bugOpen2210
1234Add VariantTypeOpen29123302
2345Add commit retriesOpen2210
\n", - "
" - ], - "text/plain": [ - " id name state additions deletes\n", - "0 123 Fix bug Open 22 10\n", - "1 234 Add VariantType Open 29123 302\n", - "2 345 Add commit retries Open 22 10" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table.append(df)\n", "\n", @@ -278,69 +156,10 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "9ce1cecc-8cb0-4622-b0eb-55880d091556", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
committed_atsnapshot_idparent_idoperationmanifest_listsummary
02025-04-07 13:43:24.6792018389781075547497NaNappends3://warehouse/default/commits/metadata/snap-2...[(added-files-size, 2504), (added-data-files, ...
\n", - "
" - ], - "text/plain": [ - " committed_at snapshot_id parent_id operation \\\n", - "0 2025-04-07 13:43:24.679 2018389781075547497 NaN append \n", - "\n", - " manifest_list \\\n", - "0 s3://warehouse/default/commits/metadata/snap-2... \n", - "\n", - " summary \n", - "0 [(added-files-size, 2504), (added-data-files, ... " - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table.inspect.snapshots().to_pandas()" ] @@ -355,97 +174,10 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "id": "794de3a0", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamestateadditionsdeletes
0456Add NanosecondTimestampsMerged23928
1567Add documentation around filtersOpen75433
2123Fix bugOpen2210
3234Add VariantTypeOpen29123302
4345Add commit retriesOpen2210
\n", - "
" - ], - "text/plain": [ - " id name state additions deletes\n", - "0 456 Add NanosecondTimestamps Merged 2392 8\n", - "1 567 Add documentation around filters Open 7543 3\n", - "2 123 Fix bug Open 22 10\n", - "3 234 Add VariantType Open 29123 302\n", - "4 345 Add commit retries Open 22 10" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table.append(pa.Table.from_pylist(\n", " [\n", @@ -460,81 +192,10 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "f3ac7021", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
committed_atsnapshot_idparent_idoperationmanifest_listsummary
02025-04-07 14:25:20.5924215114979050777060NaNappends3://warehouse/default/commits/metadata/snap-4...[(added-files-size, 2472), (added-data-files, ...
12025-04-07 14:25:32.87621761834290455206104.215115e+18appends3://warehouse/default/commits/metadata/snap-2...[(added-files-size, 2600), (added-data-files, ...
\n", - "
" - ], - "text/plain": [ - " committed_at snapshot_id parent_id operation \\\n", - "0 2025-04-07 14:25:20.592 4215114979050777060 NaN append \n", - "1 2025-04-07 14:25:32.876 2176183429045520610 4.215115e+18 append \n", - "\n", - " manifest_list \\\n", - "0 s3://warehouse/default/commits/metadata/snap-4... \n", - "1 s3://warehouse/default/commits/metadata/snap-2... \n", - "\n", - " summary \n", - "0 [(added-files-size, 2472), (added-data-files, ... \n", - "1 [(added-files-size, 2600), (added-data-files, ... " - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table.inspect.snapshots().to_pandas()" ] @@ -549,22 +210,10 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "id": "5157a36e-1ab2-4a5b-bf73-1f2d462b0002", "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'Table' object has no attribute 'upsert'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[58], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupsert\u001b[49m(pa\u001b[38;5;241m.\u001b[39mTable\u001b[38;5;241m.\u001b[39mfrom_pylist(\n\u001b[1;32m 2\u001b[0m [\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Nothing changes: No-op\u001b[39;00m\n\u001b[1;32m 4\u001b[0m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m456\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAdd NanosecondTimestamps\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstate\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMerged\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124madditions\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m2392\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeletes\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m8\u001b[39m},\n\u001b[1;32m 5\u001b[0m \n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 7\u001b[0m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m567\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAdd documentation around filters\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstate\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOpen\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124madditions\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m7543\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeletes\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m3\u001b[39m},\n\u001b[1;32m 8\u001b[0m ],\n\u001b[1;32m 9\u001b[0m schema\u001b[38;5;241m=\u001b[39mpa_schema\n\u001b[1;32m 10\u001b[0m ))\n\u001b[1;32m 12\u001b[0m table\u001b[38;5;241m.\u001b[39mscan()\u001b[38;5;241m.\u001b[39mto_pandas()\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Table' object has no attribute 'upsert'" - ] - } - ], + "outputs": [], "source": [ "table.upsert(pa.Table.from_pylist(\n", " [\n", diff --git 
a/spark/requirements.txt b/spark/requirements.txt index 2496ee2..2669b9f 100644 --- a/spark/requirements.txt +++ b/spark/requirements.txt @@ -6,4 +6,5 @@ matplotlib==3.9.2 scipy==1.14.1 duckdb-engine==0.13.1 grpcio==1.71.0 -pycolor +pycolor==1.2.1 +papermill==2.6.0 diff --git a/spark/spark-defaults.conf b/spark/spark-defaults.conf index 42b6c73..3691964 100755 --- a/spark/spark-defaults.conf +++ b/spark/spark-defaults.conf @@ -31,3 +31,5 @@ spark.eventLog.enabled true spark.eventLog.dir /home/iceberg/spark-events spark.history.fs.logDirectory /home/iceberg/spark-events spark.sql.catalogImplementation in-memory +spark.driver.extraJavaOptions "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" +spark.executor.extraJavaOptions "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" From 9a4861093d71e3ceeeb94579d6c6d839f6c2f129 Mon Sep 17 00:00:00 2001 From: Fokko Date: Thu, 24 Apr 2025 19:57:49 +0200 Subject: [PATCH 9/9] Cleanup --- .github/workflows/check-notebooks.yml | 2 +- spark/Dockerfile | 11 - spark/entrypoint.sh | 7 +- ... to the Iceberg Java API using Scala.ipynb | 526 ++++++++++++++++++ ...Introduction to the Iceberg Java API.ipynb | 471 ---------------- .../PyIceberg - Getting Started.ipynb | 2 +- spark/spark-defaults.conf | 2 - 7 files changed, 531 insertions(+), 490 deletions(-) create mode 100644 spark/notebooks/Iceberg - An Introduction to the Iceberg Java API using Scala.ipynb delete mode 100644 spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb diff --git a/.github/workflows/check-notebooks.yml b/.github/workflows/check-notebooks.yml index 359d51a..64e357c 100644 --- a/.github/workflows/check-notebooks.yml +++ b/.github/workflows/check-notebooks.yml @@ -28,4 +28,4 @@ jobs: run: docker exec $(docker ps -q --filter expose=8888) pip3 install papermill - name: Run notebooks - run: ls spark/notebooks | xargs -I {} docker exec -e JAVA_OPTS="--add-exports java.base/sun.nio.ch=ALL-UNNAMED" $(docker ps -q --filter expose=8888) papermill "/home/iceberg/notebooks/{}" + run: ls spark/notebooks | xargs -I {} docker exec $(docker ps -q --filter expose=8888) papermill "/home/iceberg/notebooks/{}" diff --git a/spark/Dockerfile b/spark/Dockerfile index 5f2bb29..0a430a6 100644 --- a/spark/Dockerfile +++ b/spark/Dockerfile @@ -36,12 +36,6 @@ RUN pip3 install -r requirements.txt # Add scala kernel via spylon-kernel RUN python3 -m spylon_kernel install -# Download and install IJava jupyter kernel -RUN curl https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip -Lo ijava-1.3.0.zip \ - && unzip ijava-1.3.0.zip \ - && python3 install.py --sys-prefix \ - && rm ijava-1.3.0.zip - # Optional env variables ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH @@ -77,9 +71,6 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2 && rm awscliv2.zip \ && rm -rf aws/ -# Add iceberg spark runtime jar to IJava classpath -ENV IJAVA_CLASSPATH=/opt/spark/jars/* - RUN mkdir -p /home/iceberg/data \ && curl https://data.cityofnewyork.us/resource/tg4x-b46p.json > /home/iceberg/data/nyc_film_permits.json \ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet -o /home/iceberg/data/yellow_tripdata_2022-04.parquet \ @@ -117,8 +108,6 @@ RUN mkdir -p /root/.ipython/profile_default/startup COPY ipython/startup/00-prettytables.py /root/.ipython/profile_default/startup COPY ipython/startup/README /root/.ipython/profile_default/startup -ENV 
JAVA_OPTS="--add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED" - COPY spark-defaults.conf /opt/spark/conf ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" diff --git a/spark/entrypoint.sh b/spark/entrypoint.sh index cae7905..704f316 100755 --- a/spark/entrypoint.sh +++ b/spark/entrypoint.sh @@ -17,10 +17,9 @@ # specific language governing permissions and limitations # under the License. -start-master.sh --conf "spark.driver.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" \ - --conf "spark.executor.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" -p 7077 -start-worker.sh --conf "spark.driver.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" \ - --conf "spark.executor.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" spark://spark-iceberg:7077 +start-master.sh -p 7077 +start-worker.sh spark://spark-iceberg:7077 + start-history-server.sh start-thriftserver.sh --driver-java-options "-Dderby.system.home=/tmp/derby" diff --git a/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API using Scala.ipynb b/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API using Scala.ipynb new file mode 100644 index 0000000..6db9d18 --- /dev/null +++ b/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API using Scala.ipynb @@ -0,0 +1,526 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "16f6bb49", + "metadata": {}, + "source": [ + "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" + ] + }, + { + "cell_type": "markdown", + "id": "c82657e9", + "metadata": {}, + "source": [ + "# An Introduction to the Iceberg Scala API" + ] + }, + { + "cell_type": "markdown", + "id": "3ee90ad2", + "metadata": {}, + "source": [ + "## [Part 1 - Loading a Catalog and Creating a Table](https://tabular.io/blog/java-api-part-1/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5c29777-3fd4-4fb1-81c4-799db166ebf7", + "metadata": {}, + "outputs": [], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72e68c62", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.Catalog\n", + "import org.apache.hadoop.conf.Configuration\n", + "import org.apache.iceberg.CatalogProperties\n", + "import org.apache.iceberg.rest.RESTCatalog\n", + "import org.apache.iceberg.aws.s3.S3FileIOProperties\n", + "\n", + "import scala.collection.JavaConverters._\n", + "\n", + "val properties: Map[String, String] = Map(\n", + " CatalogProperties.CATALOG_IMPL -> \"org.apache.iceberg.rest.RESTCatalog\",\n", + " CatalogProperties.URI -> \"http://rest:8181\",\n", + " CatalogProperties.WAREHOUSE_LOCATION -> \"s3a://warehouse/wh\",\n", + " CatalogProperties.FILE_IO_IMPL -> \"org.apache.iceberg.aws.s3.S3FileIO\",\n", + " S3FileIOProperties.ENDPOINT -> \"http://minio:9000\"\n", + ")\n", + "\n", + "val catalog = new 
RESTCatalog()\n", + "val conf = new Configuration()\n", + "catalog.setConf(conf)\n", + "catalog.initialize(\"demo\", properties.asJava)\n", + "catalog.name()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4be615e7", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.Schema\n", + "import org.apache.iceberg.types.Types\n", + "\n", + "val schema = new Schema(\n", + " Types.NestedField.required(1, \"level\", Types.StringType.get()),\n", + " Types.NestedField.required(2, \"event_time\", Types.TimestampType.withZone()),\n", + " Types.NestedField.required(3, \"message\", Types.StringType.get()),\n", + " Types.NestedField.optional(4, \"call_stack\", Types.ListType.ofRequired(5, Types.StringType.get()))\n", + ")\n", + "\n", + "schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7299d16", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.PartitionSpec\n", + "\n", + "val spec = PartitionSpec.builderFor(schema).hour(\"event_time\").identity(\"level\").build()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d900c97", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.catalog.Namespace\n", + "\n", + "val nyc = Namespace.of(\"nyc\")\n", + "val name = TableIdentifier.of(nyc, \"logs\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0ed4251-1ccf-4ab2-bc1f-077df739b892", + "metadata": {}, + "outputs": [], + "source": [ + "val sql = s\"DROP TABLE IF EXISTS $name\"\n", + "spark.sql(sql)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a4d8a6e", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.createTable(name, schema, spec)" + ] + }, + { + "cell_type": "markdown", + "id": "fe62e0a9", + "metadata": {}, + "source": [ + "## [Part 2 - Table Scans](https://tabular.io/blog/java-api-part-2/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4bcdf80-720d-4ea0-8296-eb93ffe8bfc0", + "metadata": {}, + "outputs": [], + "source": [ + "val sql = s\"DROP TABLE IF EXISTS $name\"\n", + "spark.sql(sql)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1e7aa7a", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.createTable(name, schema, spec)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b17f820", + "metadata": {}, + "outputs": [], + "source": [ + "val query =\n", + " \"\"\"INSERT INTO demo.nyc.logs\n", + " |VALUES\n", + " |('info', timestamp 'today', 'Just letting you know!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3')),\n", + " |('warning', timestamp 'today', 'You probably should not do this!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3')),\n", + " |('error', timestamp 'today', 'This was a fatal application error!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3'))\n", + " |\"\"\".stripMargin\n", + "\n", + "spark.sql(query).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15ca1822", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.Catalog\n", + "import org.apache.hadoop.conf.Configuration\n", + "import org.apache.iceberg.CatalogProperties\n", + "import org.apache.iceberg.rest.RESTCatalog\n", + "\n", + "val properties: Map[String, String] = Map(\n", + " CatalogProperties.CATALOG_IMPL -> 
\"org.apache.iceberg.rest.RESTCatalog\",\n", + " CatalogProperties.URI -> \"http://rest:8181\",\n", + " CatalogProperties.WAREHOUSE_LOCATION -> \"s3a://warehouse/wh/\",\n", + " CatalogProperties.FILE_IO_IMPL -> \"org.apache.iceberg.aws.s3.S3FileIO\",\n", + " S3FileIOProperties.ENDPOINT -> \"http://minio:9000\"\n", + ");\n", + "\n", + "val catalog = new RESTCatalog()\n", + "val conf = new Configuration()\n", + "catalog.setConf(conf)\n", + "catalog.initialize(\"demo\", properties.asJava)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5cf423", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.Table\n", + "import org.apache.iceberg.TableScan\n", + "import org.apache.iceberg.catalog.Namespace\n", + "import org.apache.iceberg.catalog.TableIdentifier\n", + "\n", + "val nyc = Namespace.of(\"nyc\")\n", + "val name = TableIdentifier.of(nyc, \"logs\")\n", + "val table = catalog.loadTable(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e472d6a1", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.io.CloseableIterable\n", + "import org.apache.iceberg.data.Record\n", + "import org.apache.iceberg.data.IcebergGenerics\n", + "\n", + "val result = IcebergGenerics.read(table).build()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d32f41c", + "metadata": {}, + "outputs": [], + "source": [ + "result.asScala.foreach(println)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dffc238", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.expressions.Expressions\n", + "\n", + "val result = IcebergGenerics.read(table).where(Expressions.equal(\"level\", \"error\")).build()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec2b0431", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.CombinedScanTask\n", + "import org.apache.iceberg.TableScan\n", + "\n", + "val scan = table.newScan()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09d13c6b", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.expressions.Expressions\n", + "\n", + "val filteredScan = scan.filter(Expressions.equal(\"level\", \"info\")).select(\"message\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1857c10f", + "metadata": {}, + "outputs": [], + "source": [ + "val result = filteredScan.planTasks()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea206ec7", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.DataFile;\n", + "\n", + "val task = result.iterator().next();\n", + "val dataFile = task.files().iterator().next().file();\n", + "System.out.println(dataFile);" + ] + }, + { + "cell_type": "markdown", + "id": "41e9e10f", + "metadata": {}, + "source": [ + "## [Part 3 - Table Scans](https://tabular.io/blog/java-api-part-3/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29574130-7a69-4766-89e5-e3ced476518b", + "metadata": {}, + "outputs": [], + "source": [ + "val webapp = Namespace.of(\"webapp\")\n", + "val name = TableIdentifier.of(webapp, \"user_events\")\n", + "\n", + "spark.sql(s\"DROP TABLE IF EXISTS $name\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81033412", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.Schema\n", + "import org.apache.iceberg.types.Types\n", + "import 
org.apache.iceberg.catalog.{Namespace, TableIdentifier}\n", + "import org.apache.iceberg.PartitionSpec\n", + "\n", + "val schema = new Schema(\n", + " Types.NestedField.optional(1, \"event_id\", Types.StringType.get()),\n", + " Types.NestedField.optional(2, \"username\", Types.StringType.get()),\n", + " Types.NestedField.optional(3, \"userid\", Types.IntegerType.get()),\n", + " Types.NestedField.optional(4, \"api_version\", Types.StringType.get()),\n", + " Types.NestedField.optional(5, \"command\", Types.StringType.get())\n", + ")\n", + "\n", + "catalog.createTable(name, schema, PartitionSpec.unpartitioned())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12c45c6b", + "metadata": {}, + "outputs": [], + "source": [ + "import java.util.UUID\n", + "import com.google.common.collect.{ImmutableList, ImmutableMap}\n", + "import scala.jdk.CollectionConverters._\n", + "import org.apache.iceberg.data.GenericRecord\n", + "\n", + "val record = GenericRecord.create(schema)\n", + "val builder = ImmutableList.builder[GenericRecord]()\n", + "\n", + "val records = List(\n", + " Map(\n", + " \"event_id\" -> UUID.randomUUID().toString,\n", + " \"username\" -> \"Bruce\",\n", + " \"userid\" -> 1.asInstanceOf[AnyRef],\n", + " \"api_version\" -> \"1.0\",\n", + " \"command\" -> \"grapple\"\n", + " ),\n", + " Map(\n", + " \"event_id\" -> UUID.randomUUID().toString,\n", + " \"username\" -> \"Wayne\",\n", + " \"userid\" -> 1.asInstanceOf[AnyRef],\n", + " \"api_version\" -> \"1.0\",\n", + " \"command\" -> \"glide\"\n", + " ),\n", + " Map(\n", + " \"event_id\" -> UUID.randomUUID().toString,\n", + " \"username\" -> \"Clark\",\n", + " \"userid\" -> 1.asInstanceOf[AnyRef],\n", + " \"api_version\" -> \"2.0\",\n", + " \"command\" -> \"fly\"\n", + " ),\n", + " Map(\n", + " \"event_id\" -> UUID.randomUUID().toString,\n", + " \"username\" -> \"Kent\",\n", + " \"userid\" -> 1.asInstanceOf[AnyRef],\n", + " \"api_version\" -> \"1.0\",\n", + " \"command\" -> \"land\"\n", + " )\n", + ").map(data => record.copy(data.mapValues(_.asInstanceOf[AnyRef]).toMap.asJava)).foreach(builder.add)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83bc5319", + "metadata": {}, + "outputs": [], + "source": [ + "import java.util.UUID\n", + "import org.apache.iceberg.io.{DataWriter, OutputFile}\n", + "import org.apache.iceberg.parquet.Parquet\n", + "import org.apache.iceberg.data.GenericRecord\n", + "import org.apache.iceberg.data.parquet.GenericParquetWriter\n", + "import org.apache.iceberg.PartitionSpec\n", + "\n", + "val filepath = s\"${table.location()}/${UUID.randomUUID().toString}\"\n", + "val file: OutputFile = table.io().newOutputFile(filepath)\n", + "\n", + "val dataWriter: DataWriter[GenericRecord] =\n", + " Parquet.writeData(file)\n", + " .schema(schema)\n", + " .createWriterFunc(GenericParquetWriter.buildWriter)\n", + " .overwrite()\n", + " .withSpec(PartitionSpec.unpartitioned())\n", + " .build()\n", + "\n", + "try {\n", + " for (record <- builder.build().asScala) {\n", + " dataWriter.write(record)\n", + " }\n", + "} finally {\n", + " dataWriter.close()\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "469e6af4", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.DataFile\n", + "\n", + "val dataFile = dataWriter.toDataFile()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142b6ed1", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.catalog.Namespace\n", + "import 
org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.Table;\n", + "\n", + "val webapp = Namespace.of(\"webapp\");\n", + "val name = TableIdentifier.of(webapp, \"user_events\");\n", + "val tbl = catalog.loadTable(name);\n", + "tbl.newAppend().appendFile(dataFile).commit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c61e9e79", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.io.CloseableIterable\n", + "import org.apache.iceberg.data.Record\n", + "import org.apache.iceberg.data.IcebergGenerics\n", + "\n", + "val result = IcebergGenerics.read(tbl).build();\n", + "result.asScala.foreach(println)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "801aec53-47e3-4441-bd15-6dd7a4cabb21", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bc6dc94-396f-42ae-ac60-bb27cf86ea13", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c80f1225-f73a-44ec-b53a-58d652a34a45", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "spylon-kernel", + "language": "scala", + "name": "spylon-kernel" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".scala", + "help_links": [ + { + "text": "MetaKernel Magics", + "url": "https://metakernel.readthedocs.io/en/latest/source/README.html" + } + ], + "mimetype": "text/x-scala", + "name": "scala", + "pygments_lexer": "scala", + "version": "0.4.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb b/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb deleted file mode 100644 index a974cac..0000000 --- a/spark/notebooks/Iceberg - An Introduction to the Iceberg Java API.ipynb +++ /dev/null @@ -1,471 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "16f6bb49", - "metadata": {}, - "source": [ - "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)" - ] - }, - { - "cell_type": "markdown", - "id": "c82657e9", - "metadata": {}, - "source": [ - "# An Introduction to the Iceberg Java API" - ] - }, - { - "cell_type": "markdown", - "id": "3ee90ad2", - "metadata": {}, - "source": [ - "## [Part 1 - Loading a Catalog and Creating a Table](https://tabular.io/blog/java-api-part-1/)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "72e68c62", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.catalog.Catalog;\n", - "import org.apache.hadoop.conf.Configuration;\n", - "import org.apache.iceberg.CatalogProperties;\n", - "import org.apache.iceberg.rest.RESTCatalog;\n", - "import org.apache.iceberg.aws.s3.S3FileIOProperties;\n", - "\n", - "Map properties = new HashMap<>();\n", - "\n", - "properties.put(CatalogProperties.CATALOG_IMPL, \"org.apache.iceberg.rest.RESTCatalog\");\n", - "properties.put(CatalogProperties.URI, \"http://rest:8181\");\n", - "properties.put(CatalogProperties.WAREHOUSE_LOCATION, \"s3a://warehouse/wh\");\n", - "properties.put(CatalogProperties.FILE_IO_IMPL, \"org.apache.iceberg.aws.s3.S3FileIO\");\n", - "properties.put(S3FileIOProperties.ENDPOINT, \"http://minio:9000\");\n", - "\n", - "RESTCatalog catalog = new RESTCatalog();\n", - "Configuration conf = new Configuration();\n", - "catalog.setConf(conf);\n", - "catalog.initialize(\"demo\", properties);\n", - 
"catalog.name();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4be615e7", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.Schema;\n", - "import org.apache.iceberg.types.Types;\n", - "\n", - "Schema schema = new Schema(\n", - " Types.NestedField.required(1, \"level\", Types.StringType.get()),\n", - " Types.NestedField.required(2, \"event_time\", Types.TimestampType.withZone()),\n", - " Types.NestedField.required(3, \"message\", Types.StringType.get()),\n", - " Types.NestedField.optional(4, \"call_stack\", Types.ListType.ofRequired(5, Types.StringType.get()))\n", - " );\n", - "schema" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7299d16", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.PartitionSpec;\n", - "\n", - "PartitionSpec spec = PartitionSpec.builderFor(schema)\n", - " .hour(\"event_time\")\n", - " .identity(\"level\")\n", - " .build();\n", - "spec" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d900c97", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.catalog.TableIdentifier;\n", - "import org.apache.iceberg.catalog.Namespace;\n", - "\n", - "Namespace nyc = Namespace.of(\"nyc\");\n", - "TableIdentifier name = TableIdentifier.of(nyc, \"logs\");\n", - "name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a4d8a6e", - "metadata": {}, - "outputs": [], - "source": [ - "catalog.createTable(name, schema, spec)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d8c46df", - "metadata": {}, - "outputs": [], - "source": [ - "catalog.dropTable(name)" - ] - }, - { - "cell_type": "markdown", - "id": "fe62e0a9", - "metadata": {}, - "source": [ - "## [Part 2 - Table Scans](https://tabular.io/blog/java-api-part-2/)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1e7aa7a", - "metadata": {}, - "outputs": [], - "source": [ - "catalog.createTable(name, schema, spec)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78c95e06", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.spark.sql.SparkSession;\n", - "\n", - "SparkSession spark = SparkSession\n", - " .builder()\n", - " .master(\"local[*]\")\n", - " .appName(\"Java API Demo\")\n", - " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", - " .config(\"spark.sql.catalog.demo\", \"org.apache.iceberg.spark.SparkCatalog\")\n", - " .config(\"spark.sql.catalog.demo.catalog-impl\", \"org.apache.iceberg.rest.RESTCatalog\")\n", - " .config(\"spark.sql.catalog.demo.uri\", \"http://rest:8181\")\n", - " .config(\"spark.sql.catalog.demo.io-impl\", \"org.apache.iceberg.aws.s3.S3FileIO\")\n", - " .config(\"spark.sql.catalog.demo.s3.endpoint\", \"http://minio:9000\")\n", - " .config(\"spark.sql.defaultCatalog\", \"demo\")\n", - " .config(\"spark.eventLog.enabled\", \"true\")\n", - " .config(\"spark.eventLog.dir\", \"/home/iceberg/spark-events\")\n", - " .config(\"spark.history.fs.logDirectory\", \"/home/iceberg/spark-events\")\n", - " .config(\"spark.driver.extraJavaOptions\", \"--add-exports java.base/sun.nio.ch=ALL-UNNAMED\")\n", - " .config(\"spark.executor.extraJavaOptions\", \"--add-exports java.base/sun.nio.ch=ALL-UNNAMED\")\n", - " .getOrCreate();\n", - "\n", - "spark.sparkContext().setLogLevel(\"ERROR\");" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b17f820", - "metadata": {}, - "outputs": [], 
- "source": [ - "String query = \"INSERT INTO demo.nyc.logs \"\n", - " + \"VALUES \"\n", - " + \"('info', timestamp 'today', 'Just letting you know!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3')), \"\n", - " + \"('warning', timestamp 'today', 'You probably should not do this!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3')), \"\n", - " + \"('error', timestamp 'today', 'This was a fatal application error!', array('stack trace line 1', 'stack trace line 2', 'stack trace line 3'))\";\n", - "\n", - "spark.sql(query).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15ca1822", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.catalog.Catalog;\n", - "import org.apache.hadoop.conf.Configuration;\n", - "import org.apache.iceberg.CatalogProperties;\n", - "import org.apache.iceberg.rest.RESTCatalog;\n", - "\n", - "Map properties = new HashMap<>();\n", - "\n", - "properties.put(CatalogProperties.CATALOG_IMPL, \"org.apache.iceberg.rest.RESTCatalog\");\n", - "properties.put(CatalogProperties.URI, \"http://rest:8181\");\n", - "properties.put(CatalogProperties.WAREHOUSE_LOCATION, \"s3a://warehouse/wh/\");\n", - "properties.put(CatalogProperties.FILE_IO_IMPL, \"org.apache.iceberg.aws.s3.S3FileIO\");\n", - "properties.put(S3FileIOProperties.ENDPOINT, \"http://minio:9000\");\n", - "\n", - "RESTCatalog catalog = new RESTCatalog();\n", - "Configuration conf = new Configuration();\n", - "catalog.setConf(conf);\n", - "catalog.initialize(\"demo\", properties);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a5cf423", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.Table;\n", - "import org.apache.iceberg.TableScan;\n", - "import org.apache.iceberg.catalog.Namespace;\n", - "import org.apache.iceberg.catalog.TableIdentifier;\n", - "\n", - "Namespace nyc = Namespace.of(\"nyc\");\n", - "TableIdentifier name = TableIdentifier.of(nyc, \"logs\");\n", - "Table table = catalog.loadTable(name);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e472d6a1", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.io.CloseableIterable;\n", - "import org.apache.iceberg.data.Record;\n", - "import org.apache.iceberg.data.IcebergGenerics;\n", - "\n", - "CloseableIterable result = IcebergGenerics.read(table).build();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d32f41c", - "metadata": {}, - "outputs": [], - "source": [ - "for (Record r: result) {\n", - " System.out.println(r);\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7dffc238", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.expressions.Expressions;\n", - "\n", - "CloseableIterable result = IcebergGenerics.read(table)\n", - " .where(Expressions.equal(\"level\", \"error\"))\n", - " .build();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec2b0431", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.CombinedScanTask;\n", - "import org.apache.iceberg.TableScan;\n", - "\n", - "TableScan scan = table.newScan();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "09d13c6b", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.expressions.Expressions;\n", - "\n", - "TableScan filteredScan = scan.filter(Expressions.equal(\"level\", \"info\")).select(\"message\")" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "1857c10f", - "metadata": {}, - "outputs": [], - "source": [ - "Iterable result = filteredScan.planTasks();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea206ec7", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.DataFile;\n", - "\n", - "CombinedScanTask task = result.iterator().next();\n", - "DataFile dataFile = task.files().iterator().next().file();\n", - "System.out.println(dataFile);" - ] - }, - { - "cell_type": "markdown", - "id": "41e9e10f", - "metadata": {}, - "source": [ - "## [Part 3 - Table Scans](https://tabular.io/blog/java-api-part-3/)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81033412", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.Schema;\n", - "import org.apache.iceberg.types.Types;\n", - "import org.apache.iceberg.catalog.Namespace;\n", - "import org.apache.iceberg.catalog.TableIdentifier;\n", - "import org.apache.iceberg.PartitionSpec;\n", - "\n", - "Schema schema = new Schema(\n", - " Types.NestedField.optional(1, \"event_id\", Types.StringType.get()),\n", - " Types.NestedField.optional(2, \"username\", Types.StringType.get()),\n", - " Types.NestedField.optional(3, \"userid\", Types.IntegerType.get()),\n", - " Types.NestedField.optional(4, \"api_version\", Types.StringType.get()),\n", - " Types.NestedField.optional(5, \"command\", Types.StringType.get())\n", - " );\n", - "\n", - "Namespace webapp = Namespace.of(\"webapp\");\n", - "TableIdentifier name = TableIdentifier.of(webapp, \"user_events\");\n", - "catalog.createTable(name, schema, PartitionSpec.unpartitioned());" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12c45c6b", - "metadata": {}, - "outputs": [], - "source": [ - "import java.util.UUID;\n", - "import com.google.common.collect.ImmutableList;\n", - "import com.google.common.collect.ImmutableMap;\n", - "import org.apache.iceberg.data.GenericRecord;\n", - "\n", - "GenericRecord record = GenericRecord.create(schema);\n", - "ImmutableList.Builder builder = ImmutableList.builder();\n", - "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Bruce\", \"userid\", 1, \"api_version\", \"1.0\", \"command\", \"grapple\")));\n", - "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Wayne\", \"userid\", 1, \"api_version\", \"1.0\", \"command\", \"glide\")));\n", - "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Clark\", \"userid\", 1, \"api_version\", \"2.0\", \"command\", \"fly\")));\n", - "builder.add(record.copy(ImmutableMap.of(\"event_id\", UUID.randomUUID().toString(), \"username\", \"Kent\", \"userid\", 1, \"api_version\", \"1.0\", \"command\", \"land\")));\n", - "ImmutableList records = builder.build();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83bc5319", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.Files;\n", - "import org.apache.iceberg.io.DataWriter;\n", - "import org.apache.iceberg.io.OutputFile;\n", - "import org.apache.iceberg.parquet.Parquet;\n", - "import org.apache.iceberg.data.parquet.GenericParquetWriter;\n", - "\n", - "String filepath = table.location() + \"/\" + UUID.randomUUID().toString();\n", - "OutputFile file = table.io().newOutputFile(filepath);\n", - "DataWriter dataWriter =\n", - " Parquet.writeData(file)\n", - " .schema(schema)\n", - " 
.createWriterFunc(GenericParquetWriter::buildWriter)\n", - " .overwrite()\n", - " .withSpec(PartitionSpec.unpartitioned())\n", - " .build();\n", - "try {\n", - " for (GenericRecord record : builder.build()) {\n", - " dataWriter.write(record);\n", - " }\n", - "} finally {\n", - " dataWriter.close();\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "469e6af4", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.DataFile;\n", - "\n", - "DataFile dataFile = dataWriter.toDataFile();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "142b6ed1", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.catalog.Namespace;\n", - "import org.apache.iceberg.catalog.TableIdentifier;\n", - "import org.apache.iceberg.Table;\n", - "\n", - "Namespace webapp = Namespace.of(\"webapp\");\n", - "TableIdentifier name = TableIdentifier.of(webapp, \"user_events\");\n", - "Table tbl = catalog.loadTable(name);\n", - "tbl.newAppend().appendFile(dataFile).commit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c61e9e79", - "metadata": {}, - "outputs": [], - "source": [ - "import org.apache.iceberg.io.CloseableIterable;\n", - "import org.apache.iceberg.data.Record;\n", - "import org.apache.iceberg.data.IcebergGenerics;\n", - "\n", - "CloseableIterable result = IcebergGenerics.read(tbl).build();\n", - "for (Record r: result) {\n", - " System.out.println(r);\n", - "}" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Java", - "language": "java", - "name": "java" - }, - "language_info": { - "codemirror_mode": "java", - "file_extension": ".jshell", - "mimetype": "text/x-java-source", - "name": "Java", - "pygments_lexer": "java", - "version": "11.0.15+10-post-Debian-1deb11u1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/spark/notebooks/PyIceberg - Getting Started.ipynb b/spark/notebooks/PyIceberg - Getting Started.ipynb index daf65ee..a55bd2b 100644 --- a/spark/notebooks/PyIceberg - Getting Started.ipynb +++ b/spark/notebooks/PyIceberg - Getting Started.ipynb @@ -362,7 +362,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.16" + "version": "3.9.19" } }, "nbformat": 4, diff --git a/spark/spark-defaults.conf b/spark/spark-defaults.conf index 3691964..42b6c73 100755 --- a/spark/spark-defaults.conf +++ b/spark/spark-defaults.conf @@ -31,5 +31,3 @@ spark.eventLog.enabled true spark.eventLog.dir /home/iceberg/spark-events spark.history.fs.logDirectory /home/iceberg/spark-events spark.sql.catalogImplementation in-memory -spark.driver.extraJavaOptions "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" -spark.executor.extraJavaOptions "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
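
Notes on the series:

Patches 2 through 7 all chase the same failure: on JDK 17 and later the module system no longer exports java.base/sun.nio.ch to unnamed modules, and Spark reaches into that package, so the notebook kernel's JVM needs explicit --add-exports/--add-opens flags. The series tries spark-defaults.conf, an exported JAVA_OPTS in entrypoint.sh, a Dockerfile ENV, and flags on start-master.sh/start-worker.sh, before patch 9 removes the IJava kernel entirely, which makes all of those workarounds unnecessary. For reference, a one-off command-line equivalent of the spark-defaults.conf lines tried here would look like the sketch below; it is not part of these patches, and app.py is a hypothetical application:

    spark-submit \
        --conf "spark.driver.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" \
        --conf "spark.executor.extraJavaOptions=--add-opens=java.base/sun.nio.ch=ALL-UNNAMED" \
        app.py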
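
The CI check in check-notebooks.yml can be reproduced locally. A minimal sketch, assuming the docker-compose stack from this repository is already up and the notebook container is the only one exposing port 8888 (both assumptions taken from the workflow, not guaranteed by the patches):

    # Install papermill inside the notebook container, as the workflow's first step does.
    docker exec $(docker ps -q --filter expose=8888) pip3 install papermill

    # Execute every notebook; papermill exits non-zero on the first errored cell,
    # which is what makes the CI job fail.
    ls spark/notebooks | xargs -I {} docker exec $(docker ps -q --filter expose=8888) \
        papermill "/home/iceberg/notebooks/{}"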