What you need to have spark read and write in S3 (specifically apache iceberg, apache hudi, delta lake files)
5 min read · May 31, 2024
So Spark is a bit of a pain. If you want Spark to write into S3 buckets, you need 2 major pieces.
1. Spark with specific S3 libraries (this will work with S3 and S3-compatible systems like MinIO)
The 2 major libraries are aws-java-sdk-s3 (and its dependencies) and hadoop-aws (and its dependencies). Below is a sample using PySpark with Apache Hudi, with Apache Ivy resolving the packages.
pyspark --packages com.amazonaws:aws-java-sdk-s3:1.12.661,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hudi:hudi-spark3.4-bundle_2.12:0.14.1 --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog" --conf "spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
root@pyspark:/opt/spark/bin# ./pyspark --packages com.amazonaws:aws-java-sdk-s3:1.12.661,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hudi:hudi-spark3.4-bundle_2.12:0.14.1 --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog" --conf "spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
Python 3.10.6 (main, Mar 10 2023, 10:55:28) [GCC 11.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.amazonaws#aws-java-sdk-s3 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hudi#hudi-spark3.4-bundle_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d3cdd048-ce2d-4687-beee-6001f575e498;1.0
confs: [default]
found com.amazonaws#aws-java-sdk-s3;1.12.661 in central
found com.amazonaws#aws-java-sdk-kms;1.12.661 in central
found com.amazonaws#aws-java-sdk-core;1.12.661 in central
found commons-logging#commons-logging;1.1.3 in central
found commons-codec#commons-codec;1.15 in central
found org.apache.httpcomponents#httpclient;4.5.13 in central
found org.apache.httpcomponents#httpcore;4.4.13 in central
found com.fasterxml.jackson.core#jackson-databind;2.12.7.1 in central
found com.fasterxml.jackson.core#jackson-annotations;2.12.7 in central
found com.fasterxml.jackson.core#jackson-core;2.12.7 in central
found com.fasterxml.jackson.dataformat#jackson-dataformat-cbor;2.12.6 in central
found joda-time#joda-time;2.8.1 in central
found com.amazonaws#jmespath-java;1.12.661 in central
found org.apache.hadoop#hadoop-aws;3.3.1 in central
found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
found org.apache.hudi#hudi-spark3.4-bundle_2.12;0.14.1 in central
downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/1.12.661/aws-java-sdk-s3-1.12.661.jar ...
[SUCCESSFUL ] com.amazonaws#aws-java-sdk-s3;1.12.661!aws-java-sdk-s3.jar (71ms)
downloading https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar ...
[SUCCESSFUL ] org.apache.hadoop#hadoop-aws;3.3.1!hadoop-aws.jar (49ms)
downloading https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark3.4-bundle_2.12/0.14.1/hudi-spark3.4-bundle_2.12-0.14.1.jar ...
[SUCCESSFUL ] org.apache.hudi#hudi-spark3.4-bundle_2.12;0.14.1!hudi-spark3.4-bundle_2.12.jar (1778ms)
downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-kms/1.12.661/aws-java-sdk-kms-1.12.661.jar ...
[SUCCESSFUL ] com.amazonaws#aws-java-sdk-kms;1.12.661!aws-java-sdk-kms.jar (52ms)
downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-core/1.12.661/aws-java-sdk-core-1.12.661.jar ...
[SUCCESSFUL ] com.amazonaws#aws-java-sdk-core;1.12.661!aws-java-sdk-core.jar (51ms)
downloading https://repo1.maven.org/maven2/com/amazonaws/jmespath-java/1.12.661/jmespath-java-1.12.661.jar ...
[SUCCESSFUL ] com.amazonaws#jmespath-java;1.12.661!jmespath-java.jar (33ms)
downloading https://repo1.maven.org/maven2/commons-logging/commons-logging/1.1.3/commons-logging-1.1.3.jar ...
[SUCCESSFUL ] commons-logging#commons-logging;1.1.3!commons-logging.jar (32ms)
downloading https://repo1.maven.org/maven2/commons-codec/commons-codec/1.15/commons-codec-1.15.jar ...
[SUCCESSFUL ] commons-codec#commons-codec;1.15!commons-codec.jar (48ms)
downloading https://repo1.maven.org/maven2/org/apache/httpcomponents/httpclient/4.5.13/httpclient-4.5.13.jar ...
[SUCCESSFUL ] org.apache.httpcomponents#httpclient;4.5.13!httpclient.jar (49ms)
downloading https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.12.7.1/jackson-databind-2.12.7.1.jar ...
[SUCCESSFUL ] com.fasterxml.jackson.core#jackson-databind;2.12.7.1!jackson-databind.jar(bundle) (64ms)
downloading https://repo1.maven.org/maven2/com/fasterxml/jackson/dataformat/jackson-dataformat-cbor/2.12.6/jackson-dataformat-cbor-2.12.6.jar ...
[SUCCESSFUL ] com.fasterxml.jackson.dataformat#jackson-dataformat-cbor;2.12.6!jackson-dataformat-cbor.jar(bundle) (38ms)
downloading https://repo1.maven.org/maven2/joda-time/joda-time/2.8.1/joda-time-2.8.1.jar ...
[SUCCESSFUL ] joda-time#joda-time;2.8.1!joda-time.jar (93ms)
downloading https://repo1.maven.org/maven2/org/apache/httpcomponents/httpcore/4.4.13/httpcore-4.4.13.jar ...
[SUCCESSFUL ] org.apache.httpcomponents#httpcore;4.4.13!httpcore.jar (39ms)
downloading https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.12.7/jackson-annotations-2.12.7.jar ...
[SUCCESSFUL ] com.fasterxml.jackson.core#jackson-annotations;2.12.7!jackson-annotations.jar(bundle) (33ms)
downloading https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.12.7/jackson-core-2.12.7.jar ...
[SUCCESSFUL ] com.fasterxml.jackson.core#jackson-core;2.12.7!jackson-core.jar(bundle) (45ms)
downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.901/aws-java-sdk-bundle-1.11.901.jar ...
[SUCCESSFUL ] com.amazonaws#aws-java-sdk-bundle;1.11.901!aws-java-sdk-bundle.jar (3202ms)
downloading https://repo1.maven.org/maven2/org/wildfly/openssl/wildfly-openssl/1.0.7.Final/wildfly-openssl-1.0.7.Final.jar ...
[SUCCESSFUL ] org.wildfly.openssl#wildfly-openssl;1.0.7.Final!wildfly-openssl.jar (43ms)
:: resolution report :: resolve 5421ms :: artifacts dl 5731ms
:: modules in use:
com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
com.amazonaws#aws-java-sdk-core;1.12.661 from central in [default]
com.amazonaws#aws-java-sdk-kms;1.12.661 from central in [default]
com.amazonaws#aws-java-sdk-s3;1.12.661 from central in [default]
com.amazonaws#jmespath-java;1.12.661 from central in [default]
com.fasterxml.jackson.core#jackson-annotations;2.12.7 from central in [default]
com.fasterxml.jackson.core#jackson-core;2.12.7 from central in [default]
com.fasterxml.jackson.core#jackson-databind;2.12.7.1 from central in [default]
com.fasterxml.jackson.dataformat#jackson-dataformat-cbor;2.12.6 from central in [default]
commons-codec#commons-codec;1.15 from central in [default]
commons-logging#commons-logging;1.1.3 from central in [default]
joda-time#joda-time;2.8.1 from central in [default]
org.apache.hadoop#hadoop-aws;3.3.1 from central in [default]
org.apache.httpcomponents#httpclient;4.5.13 from central in [default]
org.apache.httpcomponents#httpcore;4.4.13 from central in [default]
org.apache.hudi#hudi-spark3.4-bundle_2.12;0.14.1 from central in [default]
org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
:: evicted modules:
commons-logging#commons-logging;1.2 by [commons-logging#commons-logging;1.1.3] in [default]
commons-codec#commons-codec;1.11 by [commons-codec#commons-codec;1.15] in [default]
com.fasterxml.jackson.core#jackson-databind;2.12.6 by [com.fasterxml.jackson.core#jackson-databind;2.12.7.1] in [default]
com.fasterxml.jackson.core#jackson-core;2.12.6 by [com.fasterxml.jackson.core#jackson-core;2.12.7] in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 21 | 17 | 17 | 4 || 17 | 17 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-d3cdd048-ce2d-4687-beee-6001f575e498
confs: [default]
17 artifacts copied, 0 already retrieved (298923kB/695ms)
2. You need an Apache Hive Metastore that has the S3 libraries. I haven’t had much success with the Docker Hub images provided by Apache. I currently use the Docker images provided by Starburst.
Here is my docker-compose file:
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
version: "3.9"
services:
  # Trino query engine; catalog configs and sample data are bind-mounted in.
  trino:
    container_name: trino
    ports:
      - '8080:8080'
    image: 'trinodb/trino:428'
    volumes:
      - ./trino/catalog:/etc/trino/catalog
      - ./data:/home/data

  # PrestoDB query engine, configured entirely through bind-mounted files.
  presto:
    container_name: presto
    ports:
      - '8082:8082'
    image: 'prestodb/presto:0.283'
    volumes:
      - ./presto/catalog:/opt/presto-server/etc/catalog
      - ./presto/config.properties:/opt/presto-server/etc/config.properties
      - ./presto/jvm.config:/opt/presto-server/etc/jvm.config
      - ./presto/node.properties:/opt/presto-server/etc/node.properties
      - ./data:/home/data

  # PostgreSQL backing database for the Hive Metastore.
  # NOTE(review): credentials are hardcoded for local development only.
  metastore_db:
    image: postgres:11
    hostname: metastore_db
    environment:
      POSTGRES_USER: hive
      POSTGRES_PASSWORD: hive
      POSTGRES_DB: metastore

  # Starburst-built Hive Metastore image (ships with S3 libraries),
  # pointed at the Postgres service above and at MinIO for warehouse storage.
  hive-metastore:
    hostname: hive-metastore
    image: 'starburstdata/hive:3.1.3-e.10'
    ports:
      - '9083:9083' # Metastore Thrift
    environment:
      HIVE_METASTORE_DRIVER: org.postgresql.Driver
      HIVE_METASTORE_JDBC_URL: jdbc:postgresql://metastore_db:5432/metastore
      HIVE_METASTORE_USER: hive
      HIVE_METASTORE_PASSWORD: hive
      HIVE_METASTORE_WAREHOUSE_DIR: s3a://warehouse/
      S3_ENDPOINT: http://minio:9000
      S3_ACCESS_KEY: admin
      S3_SECRET_KEY: password
      S3_PATH_STYLE_ACCESS: "true"
      REGION: ""
      # The image expects these cloud-provider variables to exist; they are
      # intentionally empty because only S3/MinIO is used here.
      GOOGLE_CLOUD_KEY_FILE_PATH: ""
      AZURE_ADL_CLIENT_ID: ""
      AZURE_ADL_CREDENTIAL: ""
      AZURE_ADL_REFRESH_URL: ""
      AZURE_ABFS_STORAGE_ACCOUNT: ""
      AZURE_ABFS_ACCESS_KEY: ""
      AZURE_WASB_STORAGE_ACCOUNT: ""
      AZURE_ABFS_OAUTH: ""
      AZURE_ABFS_OAUTH_TOKEN_PROVIDER: ""
      AZURE_ABFS_OAUTH_CLIENT_ID: ""
      AZURE_ABFS_OAUTH_SECRET: ""
      AZURE_ABFS_OAUTH_ENDPOINT: ""
      AZURE_WASB_ACCESS_KEY: ""
      HIVE_METASTORE_USERS_IN_ADMIN_ROLE: "admin"
    depends_on:
      - metastore_db
    healthcheck:
      # Succeeds once the Thrift port accepts a TCP connection.
      test: bash -c "exec 6<> /dev/tcp/localhost/9083"

  # Jupyter notebook environment with notebooks, jars, and data mounted.
  jupyter:
    container_name: jupyter
    hostname: jupyter
    image: 'almondsh/almond:latest'
    ports:
      - '8888:8888'
    volumes:
      - ./notebook:/home/jovyan/work
      - ./jars:/home/jars
      - ./data:/home/data

  # PySpark container kept alive so you can `docker exec` in and run
  # the pyspark command shown earlier in this article.
  spark-py:
    container_name: pyspark
    hostname: pyspark
    user: '0'
    image: 'apache/spark-py:v3.4.0'
    volumes:
      - ./jars:/opt/xtable/jars
    entrypoint: >
      /bin/sh -c "
      tail -f /dev/null;
      "

  # MinIO: S3-compatible object store used as the warehouse backend.
  # The network alias lets virtual-host-style requests to
  # warehouse.minio resolve to this container.
  minio:
    image: minio/minio
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=password
      - MINIO_DOMAIN=minio
    networks:
      default:
        aliases:
          - warehouse.minio
    ports:
      - 9001:9001
      - 9000:9000
    command: ["server", "/data", "--console-address", ":9001"]

  # One-shot MinIO client: waits for MinIO, (re)creates the `warehouse`
  # bucket, makes it public, then idles to keep the service "up".
  mc:
    depends_on:
      - minio
    image: minio/mc
    environment:
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=us-east-1
    entrypoint: >
      /bin/sh -c "
      until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
      /usr/bin/mc rm -r --force minio/warehouse;
      /usr/bin/mc mb minio/warehouse;
      /usr/bin/mc policy set public minio/warehouse;
      tail -f /dev/null
      "