diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5eb26e06b..c60f42b1d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -41,12 +41,24 @@ jobs:
     name: AMD64 Ubuntu 24.04
     runs-on: ubuntu-24.04
     timeout-minutes: 30
+    # Endpoint and credentials for the S3 tests; they mirror the defaults in
+    # ci/scripts/start_minio.sh (user, password, bucket and port).
+    env:
+      ICEBERG_TEST_S3_URI: s3://iceberg-test
+      AWS_ACCESS_KEY_ID: minio
+      AWS_SECRET_ACCESS_KEY: minio123
+      AWS_DEFAULT_REGION: us-east-1
+      AWS_ENDPOINT_URL: http://127.0.0.1:9000
+      AWS_EC2_METADATA_DISABLED: "TRUE"
     steps:
       - name: Checkout iceberg-cpp
         uses: actions/checkout@v6
       - name: Install dependencies
         shell: bash
         run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev
+      - name: Start MinIO
+        shell: bash
+        run: bash ci/scripts/start_minio.sh
       - name: Build Iceberg
         shell: bash
         env:
@@ -63,9 +75,21 @@ jobs:
     name: AArch64 macOS 26
     runs-on: macos-26
     timeout-minutes: 30
+    # Endpoint and credentials for the S3 tests; they mirror the defaults in
+    # ci/scripts/start_minio.sh (user, password, bucket and port).
+    env:
+      ICEBERG_TEST_S3_URI: s3://iceberg-test
+      AWS_ACCESS_KEY_ID: minio
+      AWS_SECRET_ACCESS_KEY: minio123
+      AWS_DEFAULT_REGION: us-east-1
+      AWS_ENDPOINT_URL: http://127.0.0.1:9000
+      AWS_EC2_METADATA_DISABLED: "TRUE"
     steps:
       - name: Checkout iceberg-cpp
         uses: actions/checkout@v6
+      - name: Start MinIO
+        shell: bash
+        run: bash ci/scripts/start_minio.sh
       - name: Build Iceberg
         shell: bash
         run: ci/scripts/build_iceberg.sh $(pwd)
@@ -76,6 +100,15 @@ jobs:
     name: AMD64 Windows 2025
     runs-on: windows-2025
     timeout-minutes: 60
+    # Endpoint and credentials for the S3 tests; they mirror the defaults in
+    # ci/scripts/start_minio.sh (user, password, bucket and port).
+    env:
+      ICEBERG_TEST_S3_URI: s3://iceberg-test
+      AWS_ACCESS_KEY_ID: minio
+      AWS_SECRET_ACCESS_KEY: minio123
+      AWS_DEFAULT_REGION: us-east-1
+      AWS_ENDPOINT_URL: http://127.0.0.1:9000
+      AWS_EC2_METADATA_DISABLED: "TRUE"
     steps:
       - name: Checkout iceberg-cpp
         uses: actions/checkout@v6
@@ -85,6 +118,9 @@ jobs:
           vcpkg install zlib:x64-windows nlohmann-json:x64-windows nanoarrow:x64-windows roaring:x64-windows cpr:x64-windows
       - name: Setup sccache
         uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad #
v0.0.9
+      - name: Start MinIO
+        shell: bash
+        run: bash ci/scripts/start_minio.sh
       - name: Build Iceberg
         shell: cmd
         env:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7281fb11..c0f2eb73f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,7 @@ option(ICEBERG_BUILD_TESTS "Build tests" ON)
 option(ICEBERG_BUILD_BUNDLE "Build the battery included library" ON)
 option(ICEBERG_BUILD_REST "Build rest catalog client" ON)
 option(ICEBERG_BUILD_REST_INTEGRATION_TESTS "Build rest catalog integration tests" OFF)
+option(ICEBERG_S3 "Build with S3 support" OFF)
 option(ICEBERG_ENABLE_ASAN "Enable Address Sanitizer" OFF)
 option(ICEBERG_ENABLE_UBSAN "Enable Undefined Behavior Sanitizer" OFF)
 
@@ -68,6 +69,12 @@ if(ICEBERG_BUILD_REST_INTEGRATION_TESTS AND WIN32)
   message(WARNING "Cannot build rest integration test on Windows, turning it off.")
 endif()
 
+# ICEBERG_S3 requires ICEBERG_BUILD_BUNDLE
+if(NOT ICEBERG_BUILD_BUNDLE AND ICEBERG_S3)
+  set(ICEBERG_S3 OFF)
+  message(WARNING "ICEBERG_S3 is disabled because ICEBERG_BUILD_BUNDLE is OFF")
+endif()
+
 include(CMakeParseArguments)
 include(IcebergBuildUtils)
 include(IcebergSanitizer)
diff --git a/ci/scripts/start_minio.sh b/ci/scripts/start_minio.sh
new file mode 100644
index 000000000..219990d3f
--- /dev/null
+++ b/ci/scripts/start_minio.sh
@@ -0,0 +1,181 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Start a local MinIO server for the S3 integration tests and create the
+# test bucket. Docker is tried first; macOS and Windows fall back to a
+# native MinIO binary. Every MINIO_* setting below can be overridden from
+# the environment.
+
+set -eux
+
+MINIO_ROOT_USER="${MINIO_ROOT_USER:-minio}"
+MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minio123}"
+MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:latest}"
+MINIO_CONTAINER_NAME="${MINIO_CONTAINER_NAME:-iceberg-minio}"
+MINIO_PORT="${MINIO_PORT:-9000}"
+MINIO_CONSOLE_PORT="${MINIO_CONSOLE_PORT:-9001}"
+MINIO_BUCKET="${MINIO_BUCKET:-iceberg-test}"
+MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:${MINIO_PORT}}"
+
+# Poll the readiness endpoint once per second, for up to 30 attempts.
+# Returns 0 as soon as MinIO is ready and 1 on timeout.
+wait_for_minio() {
+  for _ in {1..30}; do
+    if curl -fsS "${MINIO_ENDPOINT}/minio/health/ready" >/dev/null; then
+      return 0
+    fi
+    sleep 1
+  done
+  echo "MinIO did not become ready at ${MINIO_ENDPOINT}" >&2
+  return 1
+}
+
+# Run MinIO in a Docker container. Returns non-zero when Docker is missing,
+# the container cannot be started, or the server never becomes ready, so the
+# caller can fall back to a native binary.
+start_minio_docker() {
+  if ! command -v docker >/dev/null 2>&1; then
+    return 1
+  fi
+
+  if docker ps -a --format '{{.Names}}' | grep -q "^${MINIO_CONTAINER_NAME}\$"; then
+    docker rm -f "${MINIO_CONTAINER_NAME}"
+  fi
+
+  # The published container ports are fixed at 9000 (API) and 9001 (console),
+  # so the console must listen on 9001 inside the container.
+  # errexit is ignored while this function runs under `if ! ...`, so test the
+  # result explicitly rather than polling for a container that never started.
+  if ! docker run -d --name "${MINIO_CONTAINER_NAME}" \
+    -p "${MINIO_PORT}:9000" -p "${MINIO_CONSOLE_PORT}:9001" \
+    -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \
+    -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \
+    "${MINIO_IMAGE}" \
+    server /data --console-address ":9001"; then
+    return 1
+  fi
+
+  wait_for_minio
+}
+
+# Install MinIO with Homebrew and run it in the background (macOS fallback).
+start_minio_macos() {
+  if ! command -v brew >/dev/null 2>&1; then
+    echo "brew is required to start MinIO on macOS without Docker" >&2
+    return 1
+  fi
+
+  brew install minio
+  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
+    minio server /tmp/minio \
+    --address ":${MINIO_PORT}" --console-address ":${MINIO_CONSOLE_PORT}" &
+  wait_for_minio
+}
+
+# Download the MinIO client (mc) for the current OS and CPU architecture into
+# ${RUNNER_TEMP:-/tmp} and store its path in the global MC_BIN.
+download_mc() {
+  local mc_dir="${RUNNER_TEMP:-/tmp}"
+  mkdir -p "${mc_dir}"
+
+  local arch
+  case "$(uname -m)" in
+    arm64 | aarch64) arch="arm64" ;;
+    *) arch="amd64" ;;
+  esac
+
+  local uname_out platform
+  uname_out="$(uname -s)"
+  case "${uname_out}" in
+    Linux*)
+      platform="linux-${arch}"
+      MC_BIN="${mc_dir}/mc"
+      ;;
+    Darwin*)
+      platform="darwin-${arch}"
+      MC_BIN="${mc_dir}/mc"
+      ;;
+    MINGW*|MSYS*|CYGWIN*)
+      platform="windows-amd64"
+      MC_BIN="${mc_dir}/mc.exe"
+      ;;
+    *)
+      echo "Unsupported OS for mc: ${uname_out}" >&2
+      return 1
+      ;;
+  esac
+
+  # -f makes curl fail on an HTTP error instead of saving the error page as
+  # the binary.
+  curl -fsSL "https://dl.min.io/client/mc/release/${platform}/${MC_BIN##*/}" -o "${MC_BIN}"
+  chmod +x "${MC_BIN}"
+}
+
+# Point mc at the local server, retrying for up to 30 attempts, then create
+# the test bucket if it does not exist yet.
+create_bucket() {
+  download_mc
+  local configured=0
+  for _ in {1..30}; do
+    if "${MC_BIN}" alias set local "${MINIO_ENDPOINT}" "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"; then
+      configured=1
+      break
+    fi
+    sleep 1
+  done
+  if [ "${configured}" -ne 1 ]; then
+    echo "Unable to configure mc for ${MINIO_ENDPOINT}" >&2
+    return 1
+  fi
+  "${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}"
+}
+
+# Download the MinIO server binary and run it in the background (Windows
+# fallback).
+start_minio_windows() {
+  local minio_dir="${RUNNER_TEMP:-/tmp}"
+  local minio_bin="${minio_dir}/minio.exe"
+  curl -fsSL "https://dl.min.io/server/minio/release/windows-amd64/minio.exe" -o "${minio_bin}"
+  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
+    "${minio_bin}" server "${minio_dir}/minio-data" \
+    --address ":${MINIO_PORT}" --console-address ":${MINIO_CONSOLE_PORT}" &
+  wait_for_minio
+}
+
+case "$(uname -s)" in
+  Darwin*)
+    if ! start_minio_docker; then
+      start_minio_macos
+    fi
+    ;;
+  MINGW*|MSYS*|CYGWIN*)
+    if ! start_minio_docker; then
+      start_minio_windows
+    fi
+    ;;
+  Linux*)
+    start_minio_docker
+    ;;
+  *)
+    echo "Unsupported OS: $(uname -s)" >&2
+    exit 1
+    ;;
+esac
+
+create_bucket
diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
index 8b32eb749..d4f837d67 100644
--- a/cmake_modules/IcebergThirdpartyToolchain.cmake
+++ b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -102,6 +102,7 @@ function(resolve_arrow_dependency)
   # Work around undefined symbol: arrow::ipc::ReadSchema(arrow::io::InputStream*, arrow::ipc::DictionaryMemo*)
   set(ARROW_IPC ON)
   set(ARROW_FILESYSTEM ON)
+  set(ARROW_S3 ${ICEBERG_S3})
   set(ARROW_JSON ON)
   set(ARROW_PARQUET ON)
   set(ARROW_SIMD_LEVEL "NONE")
@@ -164,6 +165,13 @@ function(resolve_arrow_dependency)
       install(FILES ${arrow_bundled_dependencies_location}
               DESTINATION ${ICEBERG_INSTALL_LIBDIR})
     endif()
+
+    # Arrow's exported static target interface may reference system libraries
+    # (e.g. OpenSSL, CURL, ZLIB) that consumers need to find.
+ list(APPEND ICEBERG_SYSTEM_DEPENDENCIES ZLIB) + if(ARROW_S3) + list(APPEND ICEBERG_SYSTEM_DEPENDENCIES OpenSSL CURL) + endif() else() set(ARROW_VENDORED FALSE) find_package(Arrow CONFIG REQUIRED) diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index b503a41ea..33f32f2de 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -40,6 +40,7 @@ set(ICEBERG_SOURCES expression/rewrite_not.cc expression/strict_metrics_evaluator.cc expression/term.cc + file_io_registry.cc file_reader.cc file_writer.cc inheritable_metadata.cc @@ -103,6 +104,7 @@ set(ICEBERG_SOURCES update/update_statistics.cc util/bucket_util.cc util/content_file_util.cc + util/file_io_util.cc util/conversions.cc util/decimal.cc util/gzip_internal.cc @@ -176,6 +178,8 @@ add_subdirectory(util) if(ICEBERG_BUILD_BUNDLE) set(ICEBERG_BUNDLE_SOURCES arrow/arrow_fs_file_io.cc + arrow/s3/arrow_s3_file_io.cc + arrow/file_io_register.cc arrow/metadata_column_util.cc avro/avro_data_util.cc avro/avro_direct_decoder.cc @@ -241,6 +245,15 @@ if(ICEBERG_BUILD_BUNDLE) OUTPUTS ICEBERG_BUNDLE_LIBRARIES) + if(ICEBERG_S3) + foreach(target iceberg_bundle_static iceberg_bundle_shared) + if(TARGET ${target}) + target_compile_definitions(${target} + PUBLIC "$") + endif() + endforeach() + endif() + add_subdirectory(arrow) add_subdirectory(avro) add_subdirectory(parquet) diff --git a/src/iceberg/arrow/CMakeLists.txt b/src/iceberg/arrow/CMakeLists.txt index 3416d5e95..71c161f02 100644 --- a/src/iceberg/arrow/CMakeLists.txt +++ b/src/iceberg/arrow/CMakeLists.txt @@ -16,3 +16,5 @@ # under the License. 
iceberg_install_all_headers(iceberg/arrow) + +add_subdirectory(s3) diff --git a/src/iceberg/arrow/arrow_file_io.h b/src/iceberg/arrow/arrow_file_io.h index 12a9b2303..100987f53 100644 --- a/src/iceberg/arrow/arrow_file_io.h +++ b/src/iceberg/arrow/arrow_file_io.h @@ -20,9 +20,12 @@ #pragma once #include +#include +#include #include "iceberg/file_io.h" #include "iceberg/iceberg_bundle_export.h" +#include "iceberg/result.h" namespace iceberg::arrow { @@ -30,4 +33,21 @@ ICEBERG_BUNDLE_EXPORT std::unique_ptr MakeMockFileIO(); ICEBERG_BUNDLE_EXPORT std::unique_ptr MakeLocalFileIO(); +/// \brief Create an S3 FileIO backed by Arrow's S3FileSystem. +/// +/// This function initializes the S3 subsystem if not already initialized (thread-safe). +/// The S3 initialization is done once per process using std::call_once. +/// +/// \param properties Configuration properties for S3 access. See S3Properties +/// for available keys (credentials, region, endpoint, timeouts, etc.). +/// \return A FileIO instance for S3 operations, or an error if S3 is not supported. +ICEBERG_BUNDLE_EXPORT Result> MakeS3FileIO( + const std::unordered_map& properties = {}); + +/// \brief Finalize (clean up) the Arrow S3 subsystem. +/// +/// Must be called before process exit if S3 was initialized, otherwise Arrow's +/// static destructors may cause a non-zero exit. 
+ICEBERG_BUNDLE_EXPORT Status FinalizeS3(); + } // namespace iceberg::arrow diff --git a/src/iceberg/arrow/arrow_fs_file_io.cc b/src/iceberg/arrow/arrow_fs_file_io.cc index be62b79af..769fcfb13 100644 --- a/src/iceberg/arrow/arrow_fs_file_io.cc +++ b/src/iceberg/arrow/arrow_fs_file_io.cc @@ -25,13 +25,23 @@ #include "iceberg/arrow/arrow_file_io.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/util/macros.h" namespace iceberg::arrow { +Result ArrowFileSystemFileIO::ResolvePath(const std::string& file_location) { + if (file_location.find("://") != std::string::npos) { + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto path, arrow_fs_->PathFromUri(file_location)); + return path; + } + return file_location; +} + /// \brief Read the content of the file at the given location. Result ArrowFileSystemFileIO::ReadFile(const std::string& file_location, std::optional length) { - ::arrow::fs::FileInfo file_info(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ::arrow::fs::FileInfo file_info(path); if (length.has_value()) { file_info.set_size(length.value()); } @@ -47,6 +57,10 @@ Result ArrowFileSystemFileIO::ReadFile(const std::string& file_loca ICEBERG_ARROW_ASSIGN_OR_RETURN( auto read_bytes, file->Read(read_length, reinterpret_cast(&content[offset]))); + if (read_bytes == 0) { + return IOError("Unexpected EOF reading {}: got {} of {} bytes", file_location, + offset, file_size); + } remain -= read_bytes; offset += read_bytes; } @@ -57,7 +71,8 @@ Result ArrowFileSystemFileIO::ReadFile(const std::string& file_loca /// \brief Write the given content to the file at the given location. 
Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location, std::string_view content) { - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(file_location)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(path)); ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size())); ICEBERG_ARROW_RETURN_NOT_OK(file->Flush()); ICEBERG_ARROW_RETURN_NOT_OK(file->Close()); @@ -66,7 +81,8 @@ Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location, /// \brief Delete a file at the given location. Status ArrowFileSystemFileIO::DeleteFile(const std::string& file_location) { - ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(file_location)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(path)); return {}; } diff --git a/src/iceberg/arrow/arrow_fs_file_io_internal.h b/src/iceberg/arrow/arrow_fs_file_io_internal.h index f151c7a5b..92a991501 100644 --- a/src/iceberg/arrow/arrow_fs_file_io_internal.h +++ b/src/iceberg/arrow/arrow_fs_file_io_internal.h @@ -56,6 +56,9 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO { const std::shared_ptr<::arrow::fs::FileSystem>& fs() const { return arrow_fs_; } private: + /// \brief Resolve a file location to a filesystem path. + Result ResolvePath(const std::string& file_location); + std::shared_ptr<::arrow::fs::FileSystem> arrow_fs_; }; diff --git a/src/iceberg/arrow/file_io_register.cc b/src/iceberg/arrow/file_io_register.cc new file mode 100644 index 000000000..48119347c --- /dev/null +++ b/src/iceberg/arrow/file_io_register.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "iceberg/arrow/file_io_register.h" + +#include + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/file_io_registry.h" +#include "iceberg/util/macros.h" + +namespace iceberg::arrow { + +void RegisterLocalFileIO() { + static std::once_flag flag; + std::call_once(flag, []() { + FileIORegistry::Register( + std::string(FileIORegistry::kArrowLocalFileIO), + [](const std::unordered_map& /*properties*/) + -> Result> { return MakeLocalFileIO(); }); + }); +} + +void RegisterS3FileIO() { + static std::once_flag flag; + std::call_once(flag, []() { + FileIORegistry::Register( + std::string(FileIORegistry::kArrowS3FileIO), + [](const std::unordered_map& properties) + -> Result> { return MakeS3FileIO(properties); }); + }); +} + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/file_io_register.h b/src/iceberg/arrow/file_io_register.h new file mode 100644 index 000000000..b3e25c6c5 --- /dev/null +++ b/src/iceberg/arrow/file_io_register.h @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/arrow/file_io_register.h +/// \brief Provide functions to register Arrow FileIO implementations. + +#include "iceberg/iceberg_bundle_export.h" + +namespace iceberg::arrow { + +/// \brief Register the Arrow local filesystem FileIO into the FileIORegistry. +ICEBERG_BUNDLE_EXPORT void RegisterLocalFileIO(); + +/// \brief Register the Arrow S3 FileIO into the FileIORegistry. +ICEBERG_BUNDLE_EXPORT void RegisterS3FileIO(); + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3/CMakeLists.txt b/src/iceberg/arrow/s3/CMakeLists.txt new file mode 100644 index 000000000..27eda89ed --- /dev/null +++ b/src/iceberg/arrow/s3/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +iceberg_install_all_headers(iceberg/arrow/s3) diff --git a/src/iceberg/arrow/s3/arrow_s3_file_io.cc b/src/iceberg/arrow/s3/arrow_s3_file_io.cc new file mode 100644 index 000000000..a4931b3a4 --- /dev/null +++ b/src/iceberg/arrow/s3/arrow_s3_file_io.cc @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include +#ifdef ICEBERG_S3_ENABLED +# include +# define ICEBERG_ARROW_HAS_S3 1 +#else +# define ICEBERG_ARROW_HAS_S3 0 +#endif + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/arrow/s3/s3_properties.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/string_util.h" + +namespace iceberg::arrow { + +namespace { + +#if ICEBERG_ARROW_HAS_S3 +Status EnsureS3Initialized() { + static std::once_flag init_flag; + static ::arrow::Status init_status = ::arrow::Status::OK(); + std::call_once(init_flag, []() { + auto options = ::arrow::fs::S3GlobalOptions::Defaults(); + init_status = ::arrow::fs::InitializeS3(options); + }); + if (!init_status.ok()) { + return std::unexpected(Error{.kind = ::iceberg::arrow::ToErrorKind(init_status), + .message = init_status.ToString()}); + } + return {}; +} +/// \brief Configure S3Options from a properties map. +/// +/// \param properties The configuration properties map. +/// \return Configured S3Options. +Result<::arrow::fs::S3Options> ConfigureS3Options( + const std::unordered_map& properties) { + ::arrow::fs::S3Options options; + + // Configure credentials + auto access_key_it = properties.find(std::string(S3Properties::kAccessKeyId)); + auto secret_key_it = properties.find(std::string(S3Properties::kSecretAccessKey)); + auto session_token_it = properties.find(std::string(S3Properties::kSessionToken)); + + if (access_key_it != properties.end() && secret_key_it != properties.end()) { + if (session_token_it != properties.end()) { + options.ConfigureAccessKey(access_key_it->second, secret_key_it->second, + session_token_it->second); + } else { + options.ConfigureAccessKey(access_key_it->second, secret_key_it->second); + } + } else { + // Use default credential chain (environment, instance profile, etc.) 
+ options.ConfigureDefaultCredentials(); + } + + // Configure region + auto region_it = properties.find(std::string(S3Properties::kRegion)); + if (region_it != properties.end()) { + options.region = region_it->second; + } + + // Configure endpoint (for MinIO, LocalStack, etc.) + auto endpoint_it = properties.find(std::string(S3Properties::kEndpoint)); + if (endpoint_it != properties.end()) { + options.endpoint_override = endpoint_it->second; + } else { + // Fall back to AWS standard environment variables for endpoint override + const char* s3_endpoint_env = std::getenv("AWS_ENDPOINT_URL_S3"); + if (s3_endpoint_env != nullptr) { + options.endpoint_override = s3_endpoint_env; + } else { + const char* endpoint_env = std::getenv("AWS_ENDPOINT_URL"); + if (endpoint_env != nullptr) { + options.endpoint_override = endpoint_env; + } + } + } + + auto path_style_it = properties.find(std::string(S3Properties::kPathStyleAccess)); + if (path_style_it != properties.end() && path_style_it->second == "true") { + options.force_virtual_addressing = false; + } + + // Configure SSL + auto ssl_it = properties.find(std::string(S3Properties::kSslEnabled)); + if (ssl_it != properties.end() && ssl_it->second == "false") { + options.scheme = "http"; + } + + // Configure timeouts + auto connect_timeout_it = properties.find(std::string(S3Properties::kConnectTimeoutMs)); + if (connect_timeout_it != properties.end()) { + ICEBERG_ASSIGN_OR_RAISE(auto timeout_ms, + StringUtils::ParseNumber(connect_timeout_it->second)); + options.connect_timeout = timeout_ms / 1000.0; + } + + auto socket_timeout_it = properties.find(std::string(S3Properties::kSocketTimeoutMs)); + if (socket_timeout_it != properties.end()) { + ICEBERG_ASSIGN_OR_RAISE(auto timeout_ms, + StringUtils::ParseNumber(socket_timeout_it->second)); + options.request_timeout = timeout_ms / 1000.0; + } + + return options; +} +#endif + +} // namespace + +Result> MakeS3FileIO( + const std::unordered_map& properties) { +#if !ICEBERG_ARROW_HAS_S3 
+ return NotSupported("Arrow S3 support is not enabled"); +#else + ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); + + // Configure S3 options from properties (uses default credentials if empty) + ICEBERG_ASSIGN_OR_RAISE(auto options, ConfigureS3Options(properties)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options)); + + return std::make_unique(std::move(fs)); +#endif +} + +Status FinalizeS3() { +#if ICEBERG_ARROW_HAS_S3 + auto status = ::arrow::fs::FinalizeS3(); + ICEBERG_ARROW_RETURN_NOT_OK(status); + return {}; +#else + return NotSupported("Arrow S3 support is not enabled"); +#endif +} + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3/s3_properties.h b/src/iceberg/arrow/s3/s3_properties.h new file mode 100644 index 000000000..53657743d --- /dev/null +++ b/src/iceberg/arrow/s3/s3_properties.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +namespace iceberg::arrow { + +/// \brief S3 configuration property keys for ArrowS3FileIO. 
+/// +/// These constants define the property keys used to configure S3 access +/// via the Arrow filesystem integration, following the Iceberg spec for +/// S3 configuration properties. +struct S3Properties { + /// S3 URI scheme + static constexpr std::string_view kS3Schema = "s3"; + /// AWS access key ID + static constexpr std::string_view kAccessKeyId = "s3.access-key-id"; + /// AWS secret access key + static constexpr std::string_view kSecretAccessKey = "s3.secret-access-key"; + /// AWS session token (for temporary credentials) + static constexpr std::string_view kSessionToken = "s3.session-token"; + /// AWS region + static constexpr std::string_view kRegion = "s3.region"; + /// Custom endpoint override (for MinIO, LocalStack, etc.) + static constexpr std::string_view kEndpoint = "s3.endpoint"; + /// Whether to use path-style access (needed for MinIO) + static constexpr std::string_view kPathStyleAccess = "s3.path-style-access"; + /// Whether SSL is enabled + static constexpr std::string_view kSslEnabled = "s3.ssl.enabled"; + /// Connection timeout in milliseconds + static constexpr std::string_view kConnectTimeoutMs = "s3.connect-timeout-ms"; + /// Socket timeout in milliseconds + static constexpr std::string_view kSocketTimeoutMs = "s3.socket-timeout-ms"; +}; + +} // namespace iceberg::arrow diff --git a/src/iceberg/catalog/rest/catalog_properties.cc b/src/iceberg/catalog/rest/catalog_properties.cc index 0e417e6c3..c6efdefc6 100644 --- a/src/iceberg/catalog/rest/catalog_properties.cc +++ b/src/iceberg/catalog/rest/catalog_properties.cc @@ -61,4 +61,12 @@ Result RestCatalogProperties::SnapshotLoadingMode() const { } } +void RestCatalogProperties::SetFileIO(std::shared_ptr file_io) { + file_io_ = std::move(file_io); +} + +const std::shared_ptr& RestCatalogProperties::GetFileIO() const { + return file_io_; +} + } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/catalog_properties.h b/src/iceberg/catalog/rest/catalog_properties.h index 
a00aa87d6..af69251f8 100644 --- a/src/iceberg/catalog/rest/catalog_properties.h +++ b/src/iceberg/catalog/rest/catalog_properties.h @@ -19,10 +19,12 @@ #pragma once +#include #include #include #include "iceberg/catalog/rest/iceberg_rest_export.h" +#include "iceberg/file_io.h" #include "iceberg/result.h" #include "iceberg/util/config.h" @@ -73,6 +75,16 @@ class ICEBERG_REST_EXPORT RestCatalogProperties /// "REFS", or an error if the value is invalid. Parsing is /// case-insensitive to match Java behavior. Result SnapshotLoadingMode() const; + + /// \brief Set a custom FileIO instance. If not set, FileIO will be + /// auto-detected from warehouse URI scheme or "io-impl" property. + void SetFileIO(std::shared_ptr file_io); + + /// \brief Get the configured FileIO instance (may be nullptr). + const std::shared_ptr& GetFileIO() const; + + private: + std::shared_ptr file_io_; }; } // namespace iceberg::rest diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 40e112db7..dbdb61d4c 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -36,6 +36,7 @@ #include "iceberg/catalog/rest/resource_paths.h" #include "iceberg/catalog/rest/rest_util.h" #include "iceberg/catalog/rest/types.h" +#include "iceberg/file_io_registry.h" #include "iceberg/json_serde_internal.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" @@ -45,6 +46,7 @@ #include "iceberg/table_requirement.h" #include "iceberg/table_update.h" #include "iceberg/transaction.h" +#include "iceberg/util/file_io_util.h" #include "iceberg/util/macros.h" namespace iceberg::rest { @@ -121,10 +123,27 @@ Result CaptureNoSuchNamespace(const auto& status) { RestCatalog::~RestCatalog() = default; Result> RestCatalog::Make( - const RestCatalogProperties& config, std::shared_ptr file_io) { + const RestCatalogProperties& config) { ICEBERG_ASSIGN_OR_RAISE(auto uri, config.Uri()); + + // Resolve FileIO: from config, or 
auto-detect from properties/URI + std::shared_ptr file_io = config.GetFileIO(); if (!file_io) { - return InvalidArgument("FileIO is required to create RestCatalog"); + auto properties = config.configs(); + // If io-impl is not set or empty, auto-detect from warehouse URI scheme + auto io_impl_it = properties.find(std::string(FileIOProperties::kImpl)); + if (io_impl_it == properties.end() || io_impl_it->second.empty()) { + auto warehouse = config.Get(RestCatalogProperties::kWarehouse); + if (warehouse.empty()) { + return InvalidArgument( + "Warehouse location is required when FileIO is not explicitly provided. " + "Set the 'warehouse' property or call SetFileIO()."); + } + properties[std::string(FileIOProperties::kImpl)] = + FileIOUtil::DetectFileIOName(warehouse); + } + ICEBERG_ASSIGN_OR_RAISE(auto created_io, FileIOUtil::CreateFileIO(properties)); + file_io = std::shared_ptr(std::move(created_io)); } std::string catalog_name = config.Get(RestCatalogProperties::kName); diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 38230a5e2..9dabb48a8 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -46,13 +46,27 @@ class ICEBERG_REST_EXPORT RestCatalog : public Catalog, RestCatalog(RestCatalog&&) = delete; RestCatalog& operator=(RestCatalog&&) = delete; - /// \brief Create a RestCatalog instance + /// \brief Create a RestCatalog instance. /// - /// \param config the configuration for the RestCatalog - /// \param file_io the FileIO instance to use for table operations - /// \return a shared_ptr to RestCatalog instance - static Result> Make(const RestCatalogProperties& config, - std::shared_ptr file_io); + /// FileIO is resolved in this order: + /// 1. If config.GetFileIO() is set, use it directly. + /// 2. If the "io-impl" property is set, load from FileIORegistry. + /// 3. Otherwise, auto-detect based on warehouse URI scheme: + /// - "s3://..." 
-> "s3" (ArrowS3FileIO) + /// - anything else -> "local" (ArrowLocalFileIO) + /// + /// Users can register custom FileIO implementations via FileIORegistry::Register(): + /// \code + /// FileIORegistry::Register("myfs", + /// [](const auto& props) -> Result> { + /// return std::make_unique(props); + /// }); + /// \endcode + /// + /// \param config the configuration for the RestCatalog, including warehouse location + /// and optional "io-impl" property or pre-configured FileIO + /// \return a shared_ptr to RestCatalog instance, or an error if FileIO creation fails + static Result> Make(const RestCatalogProperties& config); std::string_view name() const override; diff --git a/src/iceberg/file_io_registry.cc b/src/iceberg/file_io_registry.cc new file mode 100644 index 000000000..576227893 --- /dev/null +++ b/src/iceberg/file_io_registry.cc @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "iceberg/file_io_registry.h" + +#include + +namespace iceberg { + +namespace { + +std::mutex& RegistryMutex() { + static std::mutex mutex; + return mutex; +} + +auto& RegistryMap() { + static std::unordered_map registry; + return registry; +} + +} // namespace + +void FileIORegistry::Register(const std::string& name, Factory factory) { + std::lock_guard lock(RegistryMutex()); + RegistryMap()[name] = std::move(factory); +} + +Result> FileIORegistry::Load( + const std::string& name, + const std::unordered_map& properties) { + Factory factory; + { + std::lock_guard lock(RegistryMutex()); + auto it = RegistryMap().find(name); + if (it == RegistryMap().end()) { + return std::unexpected( + {.kind = ErrorKind::kNotFound, + .message = "FileIO implementation not found: " + name}); + } + factory = it->second; + } + // Invoke factory outside the lock to avoid blocking other + // Register/Load calls and to prevent deadlocks if the factory + // calls back into the registry. + return factory(properties); +} + +} // namespace iceberg diff --git a/src/iceberg/file_io_registry.h b/src/iceberg/file_io_registry.h new file mode 100644 index 000000000..d706ab7d0 --- /dev/null +++ b/src/iceberg/file_io_registry.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "iceberg/file_io.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +/// \brief Registry for FileIO implementations. +/// +/// Provides a mechanism to register and load FileIO implementations by name. +/// This allows the REST catalog (and others) to resolve FileIO implementations +/// at runtime based on configuration properties like "io-impl". +class ICEBERG_EXPORT FileIORegistry { + public: + /// Well-known implementation names + static constexpr std::string_view kArrowLocalFileIO = "local"; + static constexpr std::string_view kArrowS3FileIO = "s3"; + + /// Factory function type for creating FileIO instances. + using Factory = std::function>( + const std::unordered_map& properties)>; + + /// \brief Register a FileIO factory under the given name, replacing any previous one. + /// + /// \param name The implementation name (e.g., "local", "s3") + /// \param factory The factory function that creates the FileIO instance. + static void Register(const std::string& name, Factory factory); + + /// \brief Load a FileIO implementation by name. + /// + /// \param name The implementation name to look up. + /// \param properties Configuration properties to pass to the factory. + /// \return The factory's result, or a kNotFound error if the name is not registered. + static Result> Load( + const std::string& name, + const std::unordered_map& properties); +}; + +/// \brief Property keys for FileIO configuration. +struct FileIOProperties { + /// The FileIO implementation name, i.e. the FileIORegistry key (e.g., "local", "s3") + // TODO: add mapping from Java io-impl class names (e.g., + // "org.apache.iceberg.aws.s3.S3FileIO") to the simple keys used here.
+ static constexpr std::string_view kImpl = "io-impl"; +}; + +} // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 2cf1065b0..4892ae91c 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -58,6 +58,7 @@ iceberg_sources = files( 'expression/rewrite_not.cc', 'expression/strict_metrics_evaluator.cc', 'expression/term.cc', + 'file_io_registry.cc', 'file_reader.cc', 'file_writer.cc', 'inheritable_metadata.cc', @@ -123,6 +124,7 @@ iceberg_sources = files( 'util/content_file_util.cc', 'util/conversions.cc', 'util/decimal.cc', + 'util/file_io_util.cc', 'util/gzip_internal.cc', 'util/murmurhash3_internal.cc', 'util/property_util.cc', @@ -140,8 +142,8 @@ iceberg_sources = files( # CRoaring does not export symbols, so on Windows it must # be used as a static lib croaring_needs_static = ( - get_option('default_library') == 'static' or - host_machine.system() == 'windows' + get_option('default_library') == 'static' + or host_machine.system() == 'windows' ) croaring_dep = dependency('croaring', static: croaring_needs_static) nanoarrow_dep = dependency('nanoarrow') @@ -185,6 +187,7 @@ install_headers( 'exception.h', 'file_format.h', 'file_io.h', + 'file_io_registry.h', 'file_reader.h', 'file_writer.h', 'iceberg_export.h', diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 768e0507e..a4b5b5f27 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -113,6 +113,7 @@ add_iceberg_test(util_test data_file_set_test.cc decimal_test.cc endian_test.cc + file_io_registry_test.cc formatter_test.cc location_util_test.cc string_util_test.cc @@ -142,6 +143,8 @@ if(ICEBERG_BUILD_BUNDLE) metadata_io_test.cc struct_like_test.cc) + add_iceberg_test(file_io_test USE_BUNDLE SOURCES arrow_s3_file_io_test.cc) + add_iceberg_test(catalog_test USE_BUNDLE SOURCES in_memory_catalog_test.cc) add_iceberg_test(eval_expr_test diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc 
b/src/iceberg/test/arrow_s3_file_io_test.cc new file mode 100644 index 000000000..349c06574 --- /dev/null +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include + +#include + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/s3/s3_properties.h" +#include "iceberg/test/matchers.h" + +#ifdef ICEBERG_S3_ENABLED +namespace { + +/// \brief GTest environment that finalizes Arrow S3 after all tests complete. +/// +/// Arrow's S3 initialization creates global state that must be cleaned up via +/// FinalizeS3() before the process exits. Without this, Arrow's static destructor +/// detects the missing finalization and causes a non-zero exit (which fails under +/// sanitizers). GTest Environment::TearDown() runs after all tests but before +/// static destructors, making it the safe place to finalize. 
+class ArrowS3TestEnvironment : public ::testing::Environment { + public: + void TearDown() override { + auto status = iceberg::arrow::FinalizeS3(); + if (!status.has_value()) { + std::cerr << "Warning: FinalizeS3 failed: " << status.error().message << std::endl; + } + } +}; + +// Register before main() runs. GTest takes ownership of the pointer. +[[maybe_unused]] auto* const kS3Env = + ::testing::AddGlobalTestEnvironment(new ArrowS3TestEnvironment); + +} // namespace +#endif + +namespace iceberg::arrow { + +TEST(ArrowS3FileIOTest, CreateWithDefaultProperties) { + auto result = MakeS3FileIO({}); + if (result.has_value()) { + EXPECT_NE(result.value(), nullptr); + } +} + +#ifdef ICEBERG_S3_ENABLED +TEST(ArrowS3FileIOTest, RequiresS3SupportAtBuildTime) { + auto result = MakeS3FileIO(); + if (!result.has_value()) { + EXPECT_NE(result.error().kind, ErrorKind::kNotSupported); + } +} +#else +TEST(ArrowS3FileIOTest, RequiresS3SupportAtBuildTime) { + auto result = MakeS3FileIO(); + EXPECT_THAT(result, IsError(ErrorKind::kNotSupported)); +} +#endif + +TEST(ArrowS3FileIOTest, ReadWriteFile) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + auto io_res = MakeS3FileIO(); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotSupported) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + FAIL() << "MakeS3FileIO failed: " << io_res.error().message; + } + + auto io = std::move(io_res.value()); + std::string object_uri = base_uri; + if (!object_uri.ends_with('/')) { + object_uri += '/'; + } + object_uri += "iceberg_s3_io_test.txt"; + auto write_res = io->WriteFile(object_uri, "hello s3"); + ASSERT_THAT(write_res, IsOk()); + + auto read_res = io->ReadFile(object_uri, std::nullopt); + ASSERT_THAT(read_res, IsOk()); + EXPECT_THAT(read_res, HasValue(::testing::Eq("hello s3"))); + + auto del_res = 
io->DeleteFile(object_uri); + EXPECT_THAT(del_res, IsOk()); +} + +// ============================================================================ +// Tests for MakeS3FileIO with properties +// ============================================================================ + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithEmptyProperties) { + auto result = MakeS3FileIO({}); + if (result.has_value()) { + EXPECT_NE(result.value(), nullptr); + } +} + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithProperties) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + const char* access_key = std::getenv("AWS_ACCESS_KEY_ID"); + const char* secret_key = std::getenv("AWS_SECRET_ACCESS_KEY"); + const char* endpoint = std::getenv("ICEBERG_TEST_S3_ENDPOINT"); + const char* region = std::getenv("AWS_REGION"); + + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + std::unordered_map properties; + + // Configure credentials if available + if (access_key != nullptr && secret_key != nullptr) { + properties[std::string(S3Properties::kAccessKeyId)] = access_key; + properties[std::string(S3Properties::kSecretAccessKey)] = secret_key; + } + + // Endpoint for MinIO etc. NOTE(review): CI sets AWS_ENDPOINT_URL instead; confirm.
+ if (endpoint != nullptr && std::string(endpoint).length() > 0) { + properties[std::string(S3Properties::kEndpoint)] = endpoint; + } + + // Configure region if set. NOTE(review): CI sets AWS_DEFAULT_REGION instead; confirm. + if (region != nullptr && std::string(region).length() > 0) { + properties[std::string(S3Properties::kRegion)] = region; + } + + auto io_res = MakeS3FileIO(properties); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotSupported) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + FAIL() << "MakeS3FileIO failed: " << io_res.error().message; + } + + auto io = std::move(io_res.value()); + std::string object_uri = base_uri; + if (!object_uri.ends_with('/')) { + object_uri += '/'; + } + object_uri += "iceberg_s3_io_props_test.txt"; + + auto write_res = io->WriteFile(object_uri, "hello s3 with properties"); + ASSERT_THAT(write_res, IsOk()); + + auto read_res = io->ReadFile(object_uri, std::nullopt); + ASSERT_THAT(read_res, IsOk()); + EXPECT_THAT(read_res, HasValue(::testing::Eq("hello s3 with properties"))); + + auto del_res = io->DeleteFile(object_uri); + EXPECT_THAT(del_res, IsOk()); +} + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithSslDisabled) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test"; + } + + std::unordered_map properties; + properties[std::string(S3Properties::kSslEnabled)] = "false"; + + // Just test that the configuration is accepted + auto io_res = MakeS3FileIO(properties); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotSupported) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + // Other errors are acceptable - just checking config parsing works + } +} + +TEST(ArrowS3FileIOTest, MakeS3FileIOWithTimeouts) { + const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI"); + if (base_uri == nullptr || std::string(base_uri).empty()) { + GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3
IO test"; + } + + std::unordered_map properties; + properties[std::string(S3Properties::kConnectTimeoutMs)] = "5000"; + properties[std::string(S3Properties::kSocketTimeoutMs)] = "10000"; + + auto io_res = MakeS3FileIO(properties); + if (!io_res.has_value()) { + if (io_res.error().kind == ErrorKind::kNotSupported) { + GTEST_SKIP() << "Arrow S3 support is not enabled"; + } + // Other errors are acceptable - just checking config parsing works + } +} + +} // namespace iceberg::arrow diff --git a/src/iceberg/test/file_io_registry_test.cc b/src/iceberg/test/file_io_registry_test.cc new file mode 100644 index 000000000..fba0cb640 --- /dev/null +++ b/src/iceberg/test/file_io_registry_test.cc @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "iceberg/file_io_registry.h" + +#include +#include + +#include "iceberg/test/matchers.h" +#include "iceberg/util/file_io_util.h" + +namespace iceberg { + +namespace { + +/// A minimal FileIO implementation for testing. 
+class MockFileIO : public FileIO { + public: + Result ReadFile(const std::string& /*file_location*/, + std::optional /*length*/) override { + return std::string("mock"); + } + + Status WriteFile(const std::string& /*file_location*/, + std::string_view /*content*/) override { + return {}; + } + + Status DeleteFile(const std::string& /*file_location*/) override { return {}; } +}; + +} // namespace + +TEST(FileIoRegistryTest, RegisterAndLoad) { + const std::string impl_name = "com.test.MockFileIO"; + FileIORegistry::Register( + impl_name, + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + auto result = FileIORegistry::Load(impl_name, {}); + ASSERT_THAT(result, IsOk()); + EXPECT_NE(result.value(), nullptr); + + // Verify the loaded FileIO works + auto read_result = result.value()->ReadFile("any_file", std::nullopt); + ASSERT_THAT(read_result, IsOk()); + EXPECT_EQ(read_result.value(), "mock"); +} + +TEST(FileIoRegistryTest, LoadNonExistentReturnsError) { + auto result = FileIORegistry::Load("com.nonexistent.FileIO", {}); + EXPECT_THAT(result, IsError(ErrorKind::kNotFound)); + EXPECT_THAT(result, HasErrorMessage("FileIO implementation not found")); +} + +TEST(FileIoRegistryTest, OverrideExistingRegistration) { + const std::string impl_name = "com.test.OverrideFileIO"; + + // Register first implementation + FileIORegistry::Register( + impl_name, + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + // Override with a different factory + FileIORegistry::Register( + impl_name, + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + // Should still work (the override replaces the original) + auto result = FileIORegistry::Load(impl_name, {}); + ASSERT_THAT(result, IsOk()); + EXPECT_NE(result.value(), nullptr); +} + +TEST(FileIoRegistryTest, FactoryReceivesProperties) { + const std::string impl_name = "com.test.PropCheckFileIO"; + 
std::unordered_map captured_properties; + + FileIORegistry::Register( + impl_name, + [&captured_properties]( + const std::unordered_map& properties) + -> Result> { + captured_properties = properties; + return std::make_unique(); + }); + + std::unordered_map props = {{"key1", "val1"}, + {"key2", "val2"}}; + auto result = FileIORegistry::Load(impl_name, props); + ASSERT_THAT(result, IsOk()); + EXPECT_EQ(captured_properties.at("key1"), "val1"); + EXPECT_EQ(captured_properties.at("key2"), "val2"); +} + +TEST(FileIOUtilTest, DetectFileIONameFromScheme) { + EXPECT_EQ(FileIOUtil::DetectFileIOName("s3://bucket/path"), + FileIORegistry::kArrowS3FileIO); + EXPECT_EQ(FileIOUtil::DetectFileIOName("/tmp/warehouse"), + FileIORegistry::kArrowLocalFileIO); + EXPECT_EQ(FileIOUtil::DetectFileIOName("file:///tmp/warehouse"), + FileIORegistry::kArrowLocalFileIO); +} + +TEST(FileIOUtilTest, CreateFileIOMissingImpl) { + auto result = FileIOUtil::CreateFileIO({}); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); +} + +TEST(FileIOUtilTest, CreateFileIORejectsIncompatibleWarehouse) { + // Register a mock "s3" factory so the registry lookup would succeed + FileIORegistry::Register( + std::string(FileIORegistry::kArrowS3FileIO), + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + // io-impl=s3 with a local warehouse path should fail-fast + std::unordered_map props = { + {"io-impl", "s3"}, + {"warehouse", "/tmp/warehouse"}, + }; + auto result = FileIOUtil::CreateFileIO(props); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("incompatible")); +} + +TEST(FileIOUtilTest, CreateFileIOAllowsCompatibleWarehouse) { + FileIORegistry::Register( + std::string(FileIORegistry::kArrowS3FileIO), + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + // io-impl=s3 with an s3:// warehouse should succeed + std::unordered_map props = { + {"io-impl", "s3"}, + 
{"warehouse", "s3://my-bucket/warehouse"}, + }; + auto result = FileIOUtil::CreateFileIO(props); + ASSERT_THAT(result, IsOk()); +} + +TEST(FileIOUtilTest, CreateFileIOPassesThroughCustomImpl) { + const std::string custom_impl = "com.mycompany.CustomFileIO"; + FileIORegistry::Register( + custom_impl, + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + // Custom io-impl with a local warehouse should NOT be rejected + std::unordered_map props = { + {"io-impl", custom_impl}, + {"warehouse", "/tmp/warehouse"}, + }; + auto result = FileIOUtil::CreateFileIO(props); + ASSERT_THAT(result, IsOk()); +} + +TEST(FileIOUtilTest, CreateFileIOUnregisteredCustomImplReturnsNotFound) { + // Unregistered custom io-impl should get NotFound, not InvalidArgument + std::unordered_map props = { + {"io-impl", "com.nonexistent.FileIO"}, + {"warehouse", "/tmp/warehouse"}, + }; + auto result = FileIOUtil::CreateFileIO(props); + EXPECT_THAT(result, IsError(ErrorKind::kNotFound)); +} + +TEST(FileIOUtilTest, CreateFileIOSkipsCheckWhenWarehouseAbsent) { + FileIORegistry::Register( + std::string(FileIORegistry::kArrowLocalFileIO), + [](const std::unordered_map& /*properties*/) + -> Result> { return std::make_unique(); }); + + // No warehouse property — should just go through to the factory + std::unordered_map props = { + {"io-impl", "local"}, + }; + auto result = FileIOUtil::CreateFileIO(props); + ASSERT_THAT(result, IsOk()); +} + +} // namespace iceberg diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc index b364ffd36..9618945ee 100644 --- a/src/iceberg/test/rest_catalog_integration_test.cc +++ b/src/iceberg/test/rest_catalog_integration_test.cc @@ -39,6 +39,7 @@ #include "iceberg/catalog/rest/http_client.h" #include "iceberg/catalog/rest/json_serde_internal.h" #include "iceberg/catalog/rest/rest_catalog.h" +#include "iceberg/file_io_registry.h" #include "iceberg/partition_spec.h" 
#include "iceberg/result.h" #include "iceberg/schema.h" @@ -129,7 +130,8 @@ class RestCatalogIntegrationTest : public ::testing::Test { for (const auto& [k, v] : extra) { config.mutable_configs()[k] = v; } - return RestCatalog::Make(config, std::make_shared()); + config.SetFileIO(std::make_shared()); + return RestCatalog::Make(config); } /// Create a catalog configured with a specific snapshot loading mode. @@ -476,4 +478,77 @@ TEST_F(RestCatalogIntegrationTest, LoadTableWithSnapshotModeRefs) { EXPECT_FALSE(loaded->metadata()->schemas.empty()); } +// ============================================================================ +// Tests for RestCatalog::Make(config) with auto-detected FileIO +// ============================================================================ + +TEST_F(RestCatalogIntegrationTest, MakeWithoutWarehouseReturnsError) { + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)); + // Note: warehouse is NOT set + + auto result = RestCatalog::Make(config); + + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Warehouse location is required")); +} + +TEST_F(RestCatalogIntegrationTest, MakeWithUnregisteredIoImplReturnsError) { + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)) + .Set(RestCatalogProperties::kWarehouse, std::string("/local/warehouse")); + config.mutable_configs()[std::string(FileIOProperties::kImpl)] = + "com.nonexistent.FileIO"; + + auto result = RestCatalog::Make(config); + + // Should fail because the io-impl is not registered + EXPECT_THAT(result, IsError(ErrorKind::kNotFound)); + EXPECT_THAT(result, HasErrorMessage("FileIO implementation not found")); +} + +TEST_F(RestCatalogIntegrationTest, MakeWithAutoDetectedLocalFileIO) { + 
FileIORegistry::Register( + std::string(FileIORegistry::kArrowLocalFileIO), + [](const std::unordered_map& /*properties*/) + -> Result> { + return std::make_unique(); + }); + + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)) + .Set(RestCatalogProperties::kWarehouse, std::string("/local/warehouse")); + + auto catalog_result = RestCatalog::Make(config); + ASSERT_THAT(catalog_result, IsOk()); + + auto& catalog = catalog_result.value(); + EXPECT_EQ(catalog->name(), kCatalogName); +} + +TEST_F(RestCatalogIntegrationTest, MakeWithCustomIoImpl) { + const std::string custom_impl = "com.mycompany.CustomFileIO"; + FileIORegistry::Register( + custom_impl, + [](const std::unordered_map& /*properties*/) + -> Result> { + return std::make_unique(); + }); + + auto config = RestCatalogProperties::default_properties(); + config.Set(RestCatalogProperties::kUri, CatalogUri()) + .Set(RestCatalogProperties::kName, std::string(kCatalogName)) + .Set(RestCatalogProperties::kWarehouse, std::string("/any/warehouse")); + config.mutable_configs()[std::string(FileIOProperties::kImpl)] = custom_impl; + + auto catalog_result = RestCatalog::Make(config); + ASSERT_THAT(catalog_result, IsOk()); + + auto& catalog = catalog_result.value(); + EXPECT_EQ(catalog->name(), kCatalogName); +} + } // namespace iceberg::rest diff --git a/src/iceberg/util/file_io_util.cc b/src/iceberg/util/file_io_util.cc new file mode 100644 index 000000000..13d1aa8d6 --- /dev/null +++ b/src/iceberg/util/file_io_util.cc @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/file_io_util.h" + +#include "iceberg/file_io_registry.h" + +namespace iceberg { + +std::string FileIOUtil::DetectFileIOName(std::string_view uri) { + // Check for "scheme://" pattern + auto pos = uri.find("://"); + if (pos != std::string_view::npos) { + auto scheme = uri.substr(0, pos); + if (scheme == "s3") { + return std::string(FileIORegistry::kArrowS3FileIO); + } + } + return std::string(FileIORegistry::kArrowLocalFileIO); +} + +Result> FileIOUtil::CreateFileIO( + const std::unordered_map& properties) { + auto io_impl = properties.find(std::string(FileIOProperties::kImpl)); + if (io_impl == properties.end() || io_impl->second.empty()) { + return InvalidArgument("\"{}\" property is required to create FileIO", + FileIOProperties::kImpl); + } + + const auto& impl_name = io_impl->second; + if (impl_name == FileIORegistry::kArrowS3FileIO || + impl_name == FileIORegistry::kArrowLocalFileIO) { + auto warehouse_it = properties.find("warehouse"); + if (warehouse_it != properties.end() && !warehouse_it->second.empty()) { + auto detected = DetectFileIOName(warehouse_it->second); + if (detected != impl_name) { + return InvalidArgument( + "io-impl '{}' is incompatible with warehouse URI '{}' " + "(detected scheme: '{}')", + impl_name, warehouse_it->second, detected); + } + } + } + + return FileIORegistry::Load(io_impl->second, properties); +} + +} // namespace 
iceberg diff --git a/src/iceberg/util/file_io_util.h b/src/iceberg/util/file_io_util.h new file mode 100644 index 000000000..5d70191dc --- /dev/null +++ b/src/iceberg/util/file_io_util.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "iceberg/file_io.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +class ICEBERG_EXPORT FileIOUtil { + public: + /// \brief Detect FileIO registry key from URI scheme. + /// + /// Returns "s3" for "s3://..." URIs, "local" otherwise. + static std::string DetectFileIOName(std::string_view uri); + + /// \brief Create a FileIO instance based on properties. + /// + /// Requires a non-empty "io-impl" property; returns InvalidArgument if it is + /// missing, or if "local"/"s3" conflicts with the "warehouse" URI scheme.
+ static Result> CreateFileIO( + const std::unordered_map& properties); +}; + +} // namespace iceberg diff --git a/src/iceberg/util/meson.build b/src/iceberg/util/meson.build index 496a75758..d39bc9614 100644 --- a/src/iceberg/util/meson.build +++ b/src/iceberg/util/meson.build @@ -21,6 +21,7 @@ install_headers( 'checked_cast.h', 'config.h', 'content_file_util.h', + 'file_io_util.h', 'conversions.h', 'data_file_set.h', 'decimal.h',