From 683b148628753db1568c2cd8cb9d593ba2dc3486 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Mon, 15 Jun 2026 16:26:24 +0100 Subject: [PATCH 1/9] SSTable-version-based bridge determination (squashed from bridged) Determines the Cassandra bridge version from the highest SSTable version found on the cluster, making analytics independent of the Cassandra server version. Applied to both bulk writer and reader paths. --- .../sidecar/client/SidecarClient.java | 13 + .../BridgeInitializationParameters.java | 13 +- .../cassandra/bridge/CassandraVersion.java | 156 +++++- .../bridge/SSTableVersionAnalyzer.java | 232 +++++++++ .../bridge/CassandraVersionTest.java | 177 +++++++ .../bridge/SSTableVersionAnalyzerTest.java | 186 +++++++ .../apache/cassandra/spark/KryoRegister.java | 38 ++ .../bulkwriter/AbstractBulkWriterContext.java | 102 +++- .../bulkwriter/BroadcastableClusterInfo.java | 19 +- .../BroadcastableClusterInfoGroup.java | 28 +- .../bulkwriter/BroadcastableTableSchema.java | 13 +- .../spark/bulkwriter/BulkSparkConf.java | 24 + .../spark/bulkwriter/BulkWriterConfig.java | 24 +- .../spark/bulkwriter/BulkWriterContext.java | 3 + .../CassandraBulkWriterContext.java | 40 +- .../bulkwriter/CassandraClusterInfo.java | 176 +++---- .../CassandraDirectDataTransportContext.java | 5 +- .../spark/bulkwriter/ClusterInfo.java | 5 +- .../bulkwriter/IBroadcastableClusterInfo.java | 12 +- .../bulkwriter/SSTableWriterFactory.java | 17 +- .../spark/bulkwriter/SortedSSTableWriter.java | 29 +- .../spark/bulkwriter/TableSchema.java | 20 +- .../CloudStorageStreamSession.java | 3 +- .../CassandraClusterInfoGroup.java | 118 +++-- ...CassandraCoordinatedBulkWriterContext.java | 43 +- .../spark/data/CassandraDataLayer.java | 222 +++++++- .../cassandra/spark/data/LocalDataLayer.java | 16 +- .../cassandra/spark/KryoRegisterTest.java | 95 ++++ .../AbstractBulkWriterContextTest.java | 174 +++++++ .../spark/bulkwriter/BulkSparkConfTest.java | 98 ++++ .../BulkWriterConfigExtensibilityTest.java 
| 24 +- .../bulkwriter/CassandraClusterInfoTest.java | 5 +- .../bulkwriter/MockBulkWriterContext.java | 17 +- .../bulkwriter/SortedSSTableWriterTest.java | 2 +- .../bulkwriter/TableSchemaTestCommon.java | 5 +- .../CassandraClusterInfoGroupTest.java | 38 +- .../CassandraDataLayerValidationTest.java | 472 ++++++++++++++++++ ...eaderSSTableVersionBridgeDisabledTest.java | 107 ++++ ...ReaderSSTableVersionBridgeEnabledTest.java | 160 ++++++ ...riterSSTableVersionBridgeDisabledTest.java | 117 +++++ ...WriterSSTableVersionBridgeEnabledTest.java | 109 ++++ .../build.gradle | 19 + .../org/apache/cassandra/clients/Sidecar.java | 77 +++ .../apache/cassandra/clients/SidecarTest.java | 193 +++++++ .../format/bti/BtiReaderUtilsTest.java | 8 +- .../spark/reader/SummaryDbTests.java | 6 +- .../bridge/CassandraTypesImplementation.java | 2 +- 47 files changed, 3112 insertions(+), 350 deletions(-) create mode 100644 cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java create mode 100644 cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java create mode 100644 cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java create mode 100644 cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java create mode 100644 cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java create mode 100644 cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java create mode 100644 
cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java create mode 100644 cassandra-analytics-sidecar-client/src/test/java/org/apache/cassandra/clients/SidecarTest.java diff --git a/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/SidecarClient.java b/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/SidecarClient.java index c084c4fff..55beba1c5 100644 --- a/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/SidecarClient.java +++ b/analytics-sidecar-client/src/main/java/org/apache/cassandra/sidecar/client/SidecarClient.java @@ -244,6 +244,19 @@ public CompletableFuture gossipInfo() return executor.executeRequestAsync(requestBuilder().gossipInfoRequest().build()); } + /** + * Executes the gossip info request using the default retry policy and configured selection policy + * + * @param instance the instance where the request will be executed + * @return a completable future of the gossip info + */ + public CompletableFuture gossipInfo(SidecarInstance instance) + { + return executor.executeRequestAsync(requestBuilder().singleInstanceSelectionPolicy(instance) + .gossipInfoRequest() + .build()); + } + /** * Executes the GET gossip health request using the default retry policy and configured selection policy diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/BridgeInitializationParameters.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/BridgeInitializationParameters.java index d7857a1be..3cc7714d3 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/BridgeInitializationParameters.java +++ 
b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/BridgeInitializationParameters.java @@ -24,21 +24,20 @@ */ public class BridgeInitializationParameters { - private final String sstableFormat; + private final String configuredSSTableFormat; - public BridgeInitializationParameters(String sstableFormat) + public BridgeInitializationParameters(String configuredSSTableFormat) { - this.sstableFormat = sstableFormat; + this.configuredSSTableFormat = configuredSSTableFormat; } public static BridgeInitializationParameters fromEnvironment() { - String sstableFormat = CassandraVersion.sstableFormat(); - return new BridgeInitializationParameters(sstableFormat); + return new BridgeInitializationParameters(CassandraVersion.configuredSSTableFormat()); } - public String getSstableFormat() + public String getConfiguredSSTableFormat() { - return sstableFormat; + return configuredSSTableFormat; } } diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java index 1e23d1350..1ef30dc0f 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java @@ -20,7 +20,9 @@ package org.apache.cassandra.bridge; import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Optional; import java.util.Set; @@ -45,22 +47,47 @@ */ public enum CassandraVersion { - THREEZERO(30, "3.0", "three-zero", "big"), - FOURZERO(40, "4.0", "four-zero", "big"), - FOURONE(41, "4.1", "four-zero", "big"), - FIVEZERO(50, "5.0", "five-zero", "big", "bti"); + THREEZERO(30, "3.0", "three-zero", new String[]{"big"}, + new String[]{ + // Cassandra 3.x native sstable versions + "big-ma", + "big-mb", + "big-mc", + "big-md", + "big-me", + "big-mf" + }), + FOURZERO(40, "4.0", 
"four-zero", new String[]{"big"}, + new String[]{ + // Cassandra 4.0 native sstable versions + "big-na", + "big-nb", + }), + FOURONE(41, "4.1", "four-zero", new String[]{"big"}, + new String[]{ + // Cassandra 4.1 did not introduce new native SSTable versions + }), + FIVEZERO(50, "5.0", "five-zero", new String[]{"big", "bti"}, + new String[]{ + // Cassandra 5.0 native sstable versions + "big-oa", + "bti-da", + }); private final int number; private final String name; private final String jarBaseName; // Must match shadowJar.archiveFileName from Gradle configuration (without extension) private final Set sstableFormats; + private final List nativeSStableVersions; - CassandraVersion(int number, String name, String jarBaseName, String... sstableFormats) + + CassandraVersion(int number, String name, String jarBaseName, String[] sstableFormats, String[] nativeSStableVersions) { this.number = number; this.name = name; this.jarBaseName = jarBaseName; this.sstableFormats = new HashSet<>(Arrays.asList(sstableFormats)); + this.nativeSStableVersions = List.of(nativeSStableVersions); } public int versionNumber() @@ -78,13 +105,89 @@ public String jarBaseName() return jarBaseName; } - private static final String sstableFormat; + /** + * Get the set of SSTable formats supported by this Cassandra version. + * + * @return Set of supported SSTable format strings + */ + public Set sstableFormats() + { + return sstableFormats; + } + + /** + * Get the list of native SSTable version strings for this Cassandra version. + * + * @return List of native SSTable version strings + */ + public List getNativeSStableVersions() + { + return nativeSStableVersions; + } + + /** + * Get the set of SSTable version strings that this Cassandra version can read. 
+ * This includes: + * - Native versions for this Cassandra version + * - All SSTable versions from the previous major version (including all minor versions) + * For example, Cassandra 5.0 can read: + * - 5.0 native versions (big-oa, bti-da) + * - 4.0 versions (big-na, big-nb) + * - 4.1 versions (if any) + * But NOT 3.0 versions + * + * @return Set of full SSTable version strings that can be read + */ + public Set getSupportedSStableVersionsForRead() + { + Set readableVersions = new HashSet<>(this.nativeSStableVersions); + + int previousMajor = getPreviousMajorVersion(); + + // Add all SSTable versions from the previous major version and its minors + // E.g., C* 5.0 (version 50) can read C* 4.0 (40) and C* 4.1 (41) SSTables, but not C* 3.x (30) + for (CassandraVersion version : CassandraVersion.values()) + { + // Include versions from the previous major version family (e.g., 40-49 for C* 5.0) + if (version.versionNumber() >= previousMajor && version.versionNumber() < this.number) + { + readableVersions.addAll(version.nativeSStableVersions); + } + } + + return Collections.unmodifiableSet(readableVersions); + } + + /** + * Get the previous major version number for this Cassandra version. 
+ * Calculates dynamically using: (majorVersion - 1) * 10 + * For example: + * - C5.0 (50) returns 40 (C4.x) + * - C4.1 (41) returns 30 (C3.x) + * - C4.0 (40) returns 30 (C3.x) + * - C3.0 (30) returns 20 (C2.x - which doesn't exist) + * - C10.0 (100) returns 90 (C9.x) + * + * @return previous major version number + */ + @VisibleForTesting + int getPreviousMajorVersion() + { + // Get major version: 50 -> 5, 41 -> 4, 40 -> 4, 30 -> 3 + int majorVersion = this.number / 10; + + // Calculate previous major version: (majorVersion - 1) * 10 + // E.g., 5 -> 40, 4 -> 30, 3 -> 20 + return (majorVersion - 1) * 10; + } + + private static final String configuredSSTableFormat; private static final CassandraVersion[] implementedVersions; private static final String[] supportedVersions; static { - sstableFormat = System.getProperty("cassandra.analytics.bridges.sstable_format", "big"); + configuredSSTableFormat = System.getProperty("cassandra.analytics.bridges.sstable_format", "big"); // NOTE: These default enum names must stay in sync with cassandraVersionEnumMap in build.gradle. // FOURONE is intentionally excluded from local-dev defaults to keep iteration fast; @@ -93,7 +196,7 @@ public String jarBaseName() String.join(",", FOURZERO.name(), FIVEZERO.name())); implementedVersions = Arrays.stream(providedVersionsOrDefault.split(",")) .map(CassandraVersion::valueOf) - .filter(v -> v.sstableFormats.contains(sstableFormat)) + .filter(v -> v.sstableFormats().contains(configuredSSTableFormat)) .toArray(CassandraVersion[]::new); // NOTE: These default versions must stay in sync with cassandraFullVersionMap in build.gradle. 
@@ -101,7 +204,7 @@ public String jarBaseName() "cassandra-4.0.17,cassandra-5.0.5"); supportedVersions = Arrays.stream(providedSupportedVersionsOrDefault.split(",")) .filter(version -> CassandraVersion.fromVersion(version) - .filter(v -> v.sstableFormats.contains(sstableFormat)) + .filter(v -> v.sstableFormats().contains(configuredSSTableFormat)) .isPresent()) .toArray(String[]::new); @@ -109,9 +212,9 @@ public String jarBaseName() "No versions available"); } - public static String sstableFormat() + public static String configuredSSTableFormat() { - return sstableFormat; + return configuredSSTableFormat; } public static Optional fromVersion(String cassandraVersion) @@ -122,6 +225,37 @@ public static Optional fromVersion(String cassandraVersion) .findAny(); } + /** + * Find the Cassandra version that originally writes SSTables with this version string. + * Returns the native Cassandra version that introduced this SSTable version. + * + * @param sstableVersion full version string including format (e.g., "big-na", "bti-da") + * @return Optional containing the CassandraVersion that natively writes this format, + * or Optional.empty() if: + *
<ul>
+ * <li>The version string is null</li>
+ * <li>The version string is unrecognized (not in any enum's nativeSStableVersions)</li>
+ * <li>The version format is invalid or doesn't match expected pattern</li>
+ * </ul>
+ */ + public static Optional fromSSTableVersion(String sstableVersion) + { + if (sstableVersion == null) + { + return Optional.empty(); + } + + for (CassandraVersion version : CassandraVersion.values()) + { + if (version.nativeSStableVersions.contains(sstableVersion)) + { + return Optional.of(version); + } + } + + return Optional.empty(); + } + public static CassandraVersion[] implementedVersions() { return implementedVersions; diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java new file mode 100644 index 000000000..a53fe8688 --- /dev/null +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.bridge; + +import java.util.Comparator; +import java.util.Optional; +import java.util.Set; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Analyzes SSTable versions on a cluster to determine the appropriate + * Cassandra bridge to load for bulk write/read operations. + * + *

<p>This class provides logic to select the Cassandra bridge based on the highest SSTable + * version detected on the cluster and the user's requested format preference.

+ */ +public final class SSTableVersionAnalyzer +{ + private static final Logger LOGGER = LoggerFactory.getLogger(SSTableVersionAnalyzer.class); + static final String DISABLE_SSTABLE_VERSION_BASED_BRIDGE = + "spark.cassandra_analytics.bridge.disable_sstable_version_based"; + + private SSTableVersionAnalyzer() + { + // Utility class + } + + /** + * Determines which CassandraVersion bridge to load based on: + * - Highest SSTable version detected on cluster + * - User's format preference + * + * @param sstableVersionsOnCluster Set of SSTable versions found on cluster nodes + * @param requestedFormat User's requested format, example: "big" or "bti" + * @param cassandraVersion Cassandra version string for fallback + * @param isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination + * @return CassandraVersion enum indicating which bridge to load + * @throws UnsupportedOperationException if cluster doesn't support requested format + * @throws IllegalStateException if SSTable versions are empty/unknown + */ + public static CassandraVersion determineBridgeVersionForWrite(Set sstableVersionsOnCluster, + String requestedFormat, + String cassandraVersion, + boolean isSSTableVersionBasedBridgeDisabled) + { + // Check for fallback mode + Optional fallback = resolveFallbackVersion(cassandraVersion, isSSTableVersionBasedBridgeDisabled); + if (fallback.isPresent()) + { + return fallback.get(); + } + + // Validate SSTable versions are present + ensureSSTableVersionsNotEmpty(sstableVersionsOnCluster); + + // Find highest Cassandra version based on SSTable versions + CassandraVersion highestCassandraVersion = findHighestCassandraVersion(sstableVersionsOnCluster); + + // Check if highestCassandraVersion supports the requested format + boolean supportsRequestedFormat = highestCassandraVersion.getNativeSStableVersions() + .stream() + .anyMatch(v -> v.startsWith(requestedFormat + "-")); + + if (supportsRequestedFormat) + { + return 
highestCassandraVersion; + } + else + { + throw new UnsupportedOperationException(String.format( + "Cluster does not support requested SSTable format '%s'. " + + "Bridge version determined is %s, which only supports formats: %s", + requestedFormat, highestCassandraVersion.versionName(), + highestCassandraVersion.sstableFormats())); + } + } + + /** + * Determines which CassandraVersion bridge to load for read operations based on: + * - Highest SSTable version detected on cluster + * + * @param sstableVersionsOnCluster Set of SSTable versions found on cluster nodes + * @param cassandraVersion Cassandra version string for fallback + * @param isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination + * @return CassandraVersion enum indicating which bridge to load + * @throws IllegalStateException if SSTable versions are empty/unknown + */ + public static CassandraVersion determineBridgeVersionForRead(Set sstableVersionsOnCluster, + String cassandraVersion, + boolean isSSTableVersionBasedBridgeDisabled) + { + // Check for fallback mode + Optional fallback = resolveFallbackVersion(cassandraVersion, isSSTableVersionBasedBridgeDisabled); + if (fallback.isPresent()) + { + return fallback.get(); + } + + // Validate SSTable versions are present + ensureSSTableVersionsNotEmpty(sstableVersionsOnCluster); + + // Find highest Cassandra version based on SSTable versions + CassandraVersion bridgeVersion = findHighestCassandraVersion(sstableVersionsOnCluster); + + LOGGER.debug("Determined bridge version {} for read based on SSTable versions on cluster: {}", + bridgeVersion.versionName(), sstableVersionsOnCluster); + + return bridgeVersion; + } + + private static Optional resolveFallbackVersion(String cassandraVersion, + boolean isSSTableVersionBasedBridgeDisabled) + { + if (!isSSTableVersionBasedBridgeDisabled) + { + return Optional.empty(); + } + + LOGGER.info("SSTable version-based bridge selection is disabled via configuration. 
" + + "Using cassandra.version for bridge selection: {}", cassandraVersion); + return Optional.of(CassandraVersion.fromVersion(cassandraVersion) + .orElseThrow(() -> new UnsupportedOperationException( + String.format("Unsupported Cassandra version: %s", cassandraVersion)))); + } + + /** + * Ensures that SSTable versions from cluster are not null or empty. + * + * @param sstableVersionsOnCluster Set of SSTable versions to validate + * @throws IllegalStateException if versions are null or empty + */ + private static void ensureSSTableVersionsNotEmpty(Set sstableVersionsOnCluster) + { + if (sstableVersionsOnCluster == null || sstableVersionsOnCluster.isEmpty()) + { + throw new IllegalStateException(String.format( + "Unable to retrieve SSTable versions from cluster. " + + "This is required for SSTable version-based bridge selection. " + + "If you want to bypass this check and use cassandra.version for bridge selection, " + + "set %s=true", DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + } + } + + /** + * Finds the highest Cassandra version based on SSTable versions found on cluster. + * + * @param sstableVersionsOnCluster Set of SSTable versions found on cluster + * @return CassandraVersion corresponding to the highest SSTable version + * @throws IllegalStateException if highest version is unknown + */ + private static CassandraVersion findHighestCassandraVersion(Set sstableVersionsOnCluster) + { + String highestSSTableVersion = findHighestSSTableVersion(sstableVersionsOnCluster); + return CassandraVersion.fromSSTableVersion(highestSSTableVersion) + .orElseThrow(() -> new IllegalStateException( + String.format("Unknown SSTable version: %s. Cannot determine bridge version. " + + "SSTable versions on cluster: %s. 
" + + "To retry the job using a fallback Cassandra version, " + + "set %s=true", + highestSSTableVersion, sstableVersionsOnCluster, + DISABLE_SSTABLE_VERSION_BASED_BRIDGE))); + } + + /** + * Finds the highest SSTable version from the set using CassandraVersion mappings. + * Ordering is based on CassandraVersion number (e.g., 5.0 > 4.0 > 3.0). + * Versions within the same CassandraVersion are considered equal. + * + * @param versions Set of SSTable version strings + * @return Highest SSTable version string + * @throws IllegalStateException if versions is empty, contains null values, or contains unknown versions + */ + public static String findHighestSSTableVersion(Set versions) + { + if (versions == null || versions.isEmpty()) + { + throw new IllegalStateException("SSTable versions set cannot be empty"); + } + + Comparator sstableVersionComparator = (v1, v2) -> { + // Find which CassandraVersion each SSTable version belongs to + Optional v1Opt = CassandraVersion.fromSSTableVersion(v1); + Optional v2Opt = CassandraVersion.fromSSTableVersion(v2); + + if (!v1Opt.isPresent() || !v2Opt.isPresent()) + { + String unknownVersion = !v1Opt.isPresent() ? v1 : v2; + throw new IllegalStateException( + String.format("Unknown SSTable version: %s. Cannot determine Cassandra version. 
" + + "To retry the job using a fallback Cassandra version, " + + "set %s=true", unknownVersion, DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + } + + CassandraVersion cv1 = v1Opt.get(); + CassandraVersion cv2 = v2Opt.get(); + + // First, compare by CassandraVersion number + // FIVEZERO (50) > FOURONE (41) > FOURZERO (40) > THREEZERO (30) + int versionComparison = Integer.compare(cv1.versionNumber(), cv2.versionNumber()); + if (versionComparison != 0) + { + return versionComparison; + } + + // Same CassandraVersion - versions are considered equal + return 0; + }; + + return versions.stream() + .max(sstableVersionComparator) + .orElseThrow(() -> new IllegalStateException("Unable to find highest SSTable version")); + } +} diff --git a/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java b/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java new file mode 100644 index 000000000..1f8c1a107 --- /dev/null +++ b/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.bridge; + +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Tests for CassandraVersion SSTable version methods + */ +public class CassandraVersionTest +{ + @ParameterizedTest + @CsvSource({ + "big-ma, THREEZERO", "big-mb, THREEZERO", "big-mc, THREEZERO", + "big-md, THREEZERO", "big-me, THREEZERO", "big-mf, THREEZERO", + "big-na, FOURZERO", "big-nb, FOURZERO", + "big-oa, FIVEZERO", "bti-da, FIVEZERO" + }) + void testFromSSTableVersionNativeVersions(String ssTableVersion, String expectedVersion) + { + Optional result = CassandraVersion.fromSSTableVersion(ssTableVersion); + assertThat(result).isPresent(); + assertThat(result.get()).isEqualTo(CassandraVersion.valueOf(expectedVersion)); + } + + @Test + void testFromSSTableVersionReturnsEmpty() + { + Optional result = CassandraVersion.fromSSTableVersion("unknown-xx"); + assertThat(result).isEmpty(); + } + + @Test + void testGetSupportedSStableVersionsForReadFourZero() + { + Set supported = CassandraVersion.FOURZERO.getSupportedSStableVersionsForRead(); + + // C* 4.0 can read its own versions + assertThat(supported).contains("big-na", "big-nb"); + + // C* 4.0 can read C* 3.0 versions (previous major version family) + assertThat(supported).contains("big-ma", "big-mb", "big-mc", "big-md", "big-me", "big-mf"); + + // C* 4.0 cannot read C* 5.0 versions + assertThat(supported).doesNotContain("big-oa", "bti-da"); + } + + @Test + void testGetSupportedSStableVersionsForReadFiveZero() + { + Set supported = CassandraVersion.FIVEZERO.getSupportedSStableVersionsForRead(); + + // C* 5.0 can read its own versions + assertThat(supported).contains("big-oa", "bti-da"); + + // C* 5.0 can read C* 4.0 and 4.1 versions (previous major version family) + 
assertThat(supported).contains("big-na", "big-nb"); + + // C* 5.0 cannot read C* 3.0 versions (not in previous major version family) + assertThat(supported).doesNotContain("big-ma", "big-mb", "big-mc", "big-md", "big-me", "big-mf"); + } + + @Test + void testGetSupportedSStableVersionsForReadThreeZero() + { + Set supported = CassandraVersion.THREEZERO.getSupportedSStableVersionsForRead(); + + // C* 3.0 can read its own versions + assertThat(supported).contains("big-ma", "big-mb", "big-mc", "big-md", "big-me", "big-mf"); + + // C* 3.0 cannot read C* 4.0+ versions + assertThat(supported).doesNotContain("big-na", "big-nb", "big-oa", "bti-da"); + + // C* 3.0's previous major version (2.x) is not defined, so only its own versions are readable + assertThat(supported).hasSize(6); // Only the 6 native 3.0 versions + } + + @Test + void testGetSupportedSStableVersionsForReadFourOne() + { + Set supported = CassandraVersion.FOURONE.getSupportedSStableVersionsForRead(); + + // C* 4.1 has no native SSTable versions of its own, but can read C* 4.0 and C* 3.0 versions + // C* 4.1 can read C* 4.0 versions (same major version family) + assertThat(supported).contains("big-na", "big-nb"); + + // C* 4.1 can read C* 3.0 versions (previous major version family) + assertThat(supported).contains("big-ma", "big-mb", "big-mc", "big-md", "big-me", "big-mf"); + + // C* 4.1 cannot read C* 5.0 versions + assertThat(supported).doesNotContain("big-oa", "bti-da"); + } + + @Test + void testGetNativeSStableVersionsFourZero() + { + List nativeVersions = CassandraVersion.FOURZERO.getNativeSStableVersions(); + assertThat(nativeVersions).containsExactlyInAnyOrder("big-na", "big-nb"); + } + + @Test + void testGetNativeSStableVersionsFiveZero() + { + List nativeVersions = CassandraVersion.FIVEZERO.getNativeSStableVersions(); + assertThat(nativeVersions).containsExactlyInAnyOrder("big-oa", "bti-da"); + } + + @Test + void testGetNativeSStableVersionsFourOneEmpty() + { + // C* 4.1 did not introduce new native 
SSTable versions + List nativeVersions = CassandraVersion.FOURONE.getNativeSStableVersions(); + assertThat(nativeVersions).isEmpty(); + } + + @ParameterizedTest + @CsvSource({ + "FIVEZERO, 40", + "FOURONE, 30", + "FOURZERO, 30", + "THREEZERO, 20" + }) + void testGetPreviousMajorVersion(String versionName, int expectedPrevious) + { + CassandraVersion version = CassandraVersion.valueOf(versionName); + int previous = version.getPreviousMajorVersion(); + assertThat(previous).isEqualTo(expectedPrevious); + } + + @Test + void testConfiguredSSTableFormatDefault() + { + // Assuming no system property set, should return "big" + String format = CassandraVersion.configuredSSTableFormat(); + assertThat(format).isNotNull(); + assertThat(format).isIn("big", "bti"); // Could be either depending on env + } + + @Test + void testSStableFormatsFourZero() + { + Set formats = CassandraVersion.FOURZERO.sstableFormats(); + assertThat(formats).containsExactly("big"); + } + + @Test + void testSStableFormatsFiveZero() + { + Set formats = CassandraVersion.FIVEZERO.sstableFormats(); + assertThat(formats).containsExactly("big", "bti"); + } +} diff --git a/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java b/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java new file mode 100644 index 000000000..f517ce079 --- /dev/null +++ b/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.bridge; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Stream; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** + * Unit tests for SSTableVersionAnalyzer + */ +public class SSTableVersionAnalyzerTest +{ + // --- determineBridgeVersionForWrite success cases (parameterized) --- + + static Stream writeFallbackDisabledSuccessCases() + { + return Stream.of( + Arguments.of(Collections.singleton("big-oa"), "big", "5.0.0", CassandraVersion.FIVEZERO), + Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-nb")), "big", "4.0.0", CassandraVersion.FOURZERO), + Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-oa")), "big", "5.0.0", CassandraVersion.FIVEZERO) + ); + } + + @ParameterizedTest + @MethodSource("writeFallbackDisabledSuccessCases") + void testDetermineBridgeVersionForWriteFallbackDisabled(Set versions, + String format, + String cassandraVersion, + CassandraVersion expected) + { + CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForWrite( + versions, format, cassandraVersion, false + ); + assertThat(result).isEqualTo(expected); + } + + // --- determineBridgeVersionForWrite null/empty exception cases (parameterized) --- 
+ + static Stream writeNullOrEmptyVersionsCases() + { + return Stream.of( + Arguments.of(Collections.emptySet()), + Arguments.of((Set) null) + ); + } + + @ParameterizedTest + @MethodSource("writeNullOrEmptyVersionsCases") + void testDetermineBridgeVersionForWriteNullOrEmptyThrowsException(Set versions) + { + assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite( + versions, "big", "5.0.0", false + )).isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); + } + + // --- determineBridgeVersionForWrite standalone tests --- + + @Test + void testDetermineBridgeVersionForWriteFallbackEnabled() + { + CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForWrite( + null, "big", "5.0.0", true + ); + assertThat(result).isEqualTo(CassandraVersion.FIVEZERO); + } + + @Test + void testDetermineBridgeVersionForWriteUnsupportedFormat() + { + Set sstableVersions = Collections.singleton("big-na"); + assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite( + sstableVersions, "bti", "4.0.0", false + )).isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("Cluster does not support requested SSTable format 'bti'"); + } + + // --- determineBridgeVersionForRead standalone tests --- + + @Test + void testDetermineBridgeVersionForReadFallbackDisabled() + { + Set sstableVersions = Collections.singleton("big-oa"); + CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForRead( + sstableVersions, "5.0.0", false + ); + assertThat(result).isEqualTo(CassandraVersion.FIVEZERO); + } + + @Test + void testDetermineBridgeVersionForReadFallbackEnabled() + { + CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForRead( + null, "4.0.0", true + ); + assertThat(result).isEqualTo(CassandraVersion.FOURZERO); + } + + @Test + void testDetermineBridgeVersionForReadEmptyVersionsThrowsException() + { + assertThatThrownBy(() -> 
SSTableVersionAnalyzer.determineBridgeVersionForRead( + Collections.emptySet(), "5.0.0", false + )).isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); + } + + // --- findHighestSSTableVersion success cases (parameterized) --- + + static Stream findHighestSuccessCases() + { + return Stream.of( + Arguments.of(Collections.singleton("big-na"), CassandraVersion.FOURZERO), + Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-nb")), CassandraVersion.FOURZERO), + Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-oa")), CassandraVersion.FIVEZERO), + Arguments.of(new HashSet<>(Arrays.asList("big-oa", "bti-da")), CassandraVersion.FIVEZERO) + ); + } + + @ParameterizedTest + @MethodSource("findHighestSuccessCases") + void testFindHighestSSTableVersion(Set versions, CassandraVersion expectedCassandraVersion) + { + String result = SSTableVersionAnalyzer.findHighestSSTableVersion(versions); + assertThat(CassandraVersion.fromSSTableVersion(result)).hasValue(expectedCassandraVersion); + } + + // --- findHighestSSTableVersion null/empty exception cases (parameterized) --- + + static Stream findHighestNullOrEmptyCases() + { + return Stream.of( + Arguments.of(Collections.emptySet()), + Arguments.of((Set) null) + ); + } + + @ParameterizedTest + @MethodSource("findHighestNullOrEmptyCases") + void testFindHighestSSTableVersionNullOrEmptyThrowsException(Set versions) + { + assertThatThrownBy(() -> SSTableVersionAnalyzer.findHighestSSTableVersion(versions)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("SSTable versions set cannot be empty"); + } + + // --- findHighestSSTableVersion standalone test --- + + @Test + void testFindHighestSSTableVersionUnknownVersionThrowsException() + { + Set versions = new HashSet<>(Arrays.asList("unknown-xx", "unknown-yy")); + assertThatThrownBy(() -> SSTableVersionAnalyzer.findHighestSSTableVersion(versions)) + .isInstanceOf(IllegalStateException.class) + 
.hasMessageContaining("Unknown SSTable version:") + .hasMessageContaining("disable_sstable_version_based=true"); + } +} diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java index f53112169..054de0c7d 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java @@ -83,6 +83,44 @@ public static void addSerializer(@NotNull Class type, @NotNull Serializer KRYO_SERIALIZERS.put(type, serializer); } + /** + * Validates that a Kryo registrator exists for the given Cassandra version. + * This should be called after bridge version determination to ensure that + * Spark can properly serialize objects for the selected bridge. + * + * @param bridgeVersion the CassandraVersion to validate + * @param clusterCassandraVersion optional Cassandra version string for additional context in error messages + * @throws IllegalStateException if no Kryo registrator is configured for this version + */ + public static void validateKryoRegistratorExists(@NotNull CassandraVersion bridgeVersion, + String clusterCassandraVersion) + { + Class registratorClass = KRYO_REGISTRATORS.get(bridgeVersion); + if (registratorClass == null) + { + // Build available versions list for suggestion + String availableKryoVersions = KRYO_REGISTRATORS.keySet().stream() + .map(v -> v.name() + " (" + v.versionName() + ")") + .collect(Collectors.joining(", ")); + + throw new IllegalStateException( + String.format("No Kryo registrator configured for bridge version %s (%s). " + + "Cluster Cassandra version: %s. " + + "Available Kryo registrators: %s. " + + "Cannot proceed with job as Spark serialization will fail. 
" + + "To resolve this issue, update the '%s' configuration property to match the bridge version: %s", + bridgeVersion.name(), + bridgeVersion.versionName(), + clusterCassandraVersion, + availableKryoVersions, + CASSANDRA_VERSION, + bridgeVersion.versionName())); + } + + LOGGER.debug("Validated Kryo registrator exists for bridge version {}: {}", + bridgeVersion.versionName(), registratorClass.getName()); + } + private final CassandraVersion cassandraVersion; protected KryoRegister(CassandraVersion cassandraVersion) diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java index 447285ff3..b78d140cf 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java @@ -32,6 +32,9 @@ import com.esotericsoftware.kryo.io.Output; import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.bridge.SSTableVersionAnalyzer; +import org.apache.cassandra.spark.KryoRegister; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; @@ -76,7 +79,7 @@ public abstract class AbstractBulkWriterContext implements BulkWriterContext, Kr private final JobInfo jobInfo; private final ClusterInfo clusterInfo; private final SchemaInfo schemaInfo; - private final String lowestCassandraVersion; + private final CassandraVersion bridgeVersion; // Note: do not declare transient fields as final; but they need to be volatile as there 
could be contention when recreating them after deserialization // For the transient field, they are assigned null once deserialized, remember to use getOrRebuildAfterDeserialization for their getters private transient volatile CassandraBridge bridge; @@ -98,10 +101,37 @@ protected AbstractBulkWriterContext(@NotNull BulkSparkConf conf, this.conf = conf; this.sparkDefaultParallelism = sparkDefaultParallelism; - // Build everything fresh on driver - this.clusterInfo = buildClusterInfo(); + // Retrieve lowest Cassandra version without building full ClusterInfo + String lowestCassandraVersion = getLowestCassandraVersion(conf); + Set sstableVersionsOnCluster = null; + + // Get SSTable versions from cluster only if SSTable version-based selection is enabled + // If disabled, skip retrieval to allow job to proceed even when SSTable version detection fails + if (!conf.isSSTableVersionBasedBridgeDisabled()) + { + sstableVersionsOnCluster = getSSTableVersionsOnCluster(conf); + } + + // Determine bridge version + this.bridgeVersion = SSTableVersionAnalyzer.determineBridgeVersionForWrite( + sstableVersionsOnCluster, + CassandraVersion.configuredSSTableFormat(), + lowestCassandraVersion, + conf.isSSTableVersionBasedBridgeDisabled() + ); + + logger.info("Selected bridge version: {}, lowest Cassandra version: {}, SSTable versions: {}", + this.bridgeVersion.versionName(), + lowestCassandraVersion, + sstableVersionsOnCluster); + + // Validate that Kryo registrator exists for this bridge version + KryoRegister.validateKryoRegistratorExists(this.bridgeVersion, lowestCassandraVersion); + + // Build cluster info with determined bridge version + this.clusterInfo = buildClusterInfo(this.bridgeVersion); this.clusterInfo.startupValidate(); - this.lowestCassandraVersion = findLowestCassandraVersion(); + this.bridge = buildCassandraBridge(); this.jobInfo = buildJobInfo(); this.schemaInfo = buildSchemaInfo(structType); @@ -121,9 +151,12 @@ protected AbstractBulkWriterContext(@NotNull 
BulkWriterConfig config) this.conf = config.getConf(); this.sparkDefaultParallelism = config.getSparkDefaultParallelism(); - // Reconstruct from broadcast data on executor - this.clusterInfo = reconstructClusterInfoOnExecutor(config.getBroadcastableClusterInfo()); - this.lowestCassandraVersion = config.getLowestCassandraVersion(); + // Get bridge version from broadcast config + this.bridgeVersion = config.getBridgeVersion(); + + // Reconstruct from broadcast data on executor with bridge version + this.clusterInfo = reconstructClusterInfoOnExecutor(config.getBroadcastableClusterInfo(), this.bridgeVersion); + this.bridge = buildCassandraBridge(); this.jobInfo = reconstructJobInfoOnExecutor(config.getBroadcastableJobInfo()); this.schemaInfo = reconstructSchemaInfoOnExecutor(config.getBroadcastableSchemaInfo()); @@ -141,14 +174,44 @@ protected final int sparkDefaultParallelism() return sparkDefaultParallelism; } - protected String lowestCassandraVersion() + @Override + public CassandraVersion bridgeVersion() { - return lowestCassandraVersion; + if (bridgeVersion == null) + { + throw new IllegalStateException( + "Bridge version must be determined before accessing it. " + + "Ensure SSTable versions are retrieved from cluster and bridge version is set."); + } + + return bridgeVersion; } /*--- Methods to build required fields ---*/ - protected abstract ClusterInfo buildClusterInfo(); + /** + * Retrieves the lowest Cassandra version from the cluster(s). + * + * @param conf Bulk Spark configuration + * @return lowest Cassandra version string + */ + protected abstract String getLowestCassandraVersion(@NotNull BulkSparkConf conf); + + /** + * Retrieves SSTable versions from the cluster(s). + * + * @param conf Bulk Spark configuration + * @return set of SSTable version strings present on the cluster(s) + */ + protected abstract Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf); + + /** + * Builds the ClusterInfo with the determined bridge version. 
+ * + * @param bridgeVersion the determined Cassandra bridge version + * @return ClusterInfo instance with bridge version set + */ + protected abstract ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion); /** * Reconstructs ClusterInfo on executors from broadcastable versions. @@ -157,11 +220,13 @@ protected String lowestCassandraVersion() * into the appropriate full ClusterInfo implementation. * * @param clusterInfo the BroadcastableClusterInfo from broadcast + * @param bridgeVersion the bridge version from broadcast * @return reconstructed ClusterInfo (CassandraClusterInfo or CassandraClusterInfoGroup) */ - protected ClusterInfo reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo clusterInfo) + protected ClusterInfo reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo clusterInfo, + CassandraVersion bridgeVersion) { - return clusterInfo.reconstruct(); + return clusterInfo.reconstruct(bridgeVersion); } /** @@ -216,7 +281,7 @@ protected JobInfo buildJobInfo() protected CassandraBridge buildCassandraBridge() { - return CassandraBridgeFactory.get(lowestCassandraVersion()); + return CassandraBridgeFactory.get(bridgeVersion()); } protected TransportContext buildTransportContext(boolean isOnDriver) @@ -229,11 +294,6 @@ protected JobStatsPublisher buildJobStatsPublisher() return new LogStatsPublisher(); } - protected String findLowestCassandraVersion() - { - return cluster().getLowestCassandraVersion(); - } - protected SchemaInfo buildSchemaInfo(StructType structType) { QualifiedTableName tableName = job().qualifiedTableName(); @@ -248,7 +308,7 @@ protected SchemaInfo buildSchemaInfo(StructType structType) CqlTable cqlTable = bridge().buildSchema(createTableSchema, keyspace, replicationFactor, partitioner, udts, null, indexCount, false); TableInfoProvider tableInfoProvider = new CqlTableInfoProvider(createTableSchema, cqlTable); - TableSchema tableSchema = initializeTableSchema(bulkSparkConf(), structType, tableInfoProvider, 
lowestCassandraVersion()); + TableSchema tableSchema = initializeTableSchema(bulkSparkConf(), structType, tableInfoProvider, bridgeVersion()); return new CassandraSchemaInfo(tableSchema, udts); } @@ -313,14 +373,14 @@ public void shutdown() protected TableSchema initializeTableSchema(@NotNull BulkSparkConf conf, @NotNull StructType dfSchema, TableInfoProvider tableInfoProvider, - String lowestCassandraVersion) + CassandraVersion bridgeVersion) { return new TableSchema(dfSchema, tableInfoProvider, conf.writeMode, conf.getTTLOptions(), conf.getTimestampOptions(), - lowestCassandraVersion, + bridgeVersion, job().qualifiedTableName().quoteIdentifiers(), conf.skipSecondaryIndexCheck); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java index de06151ef..e2cd8e87b 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java @@ -19,6 +19,7 @@ package org.apache.cassandra.spark.bulkwriter; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.data.partitioner.Partitioner; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -45,7 +46,6 @@ public final class BroadcastableClusterInfo implements IBroadcastableClusterInfo // Essential fields broadcast to executors private final Partitioner partitioner; - private final String cassandraVersion; private final String clusterId; private final BulkSparkConf conf; @@ -58,31 +58,26 @@ public final class BroadcastableClusterInfo implements IBroadcastableClusterInfo */ public static BroadcastableClusterInfo from(@NotNull ClusterInfo source, @NotNull BulkSparkConf conf) { - return new 
BroadcastableClusterInfo(source.getPartitioner(), source.getLowestCassandraVersion(), source.clusterId(), conf); + return new BroadcastableClusterInfo(source.getPartitioner(), + source.clusterId(), + conf); } private BroadcastableClusterInfo(@NotNull Partitioner partitioner, - @NotNull String cassandraVersion, @Nullable String clusterId, @NotNull BulkSparkConf conf) { this.partitioner = partitioner; - this.cassandraVersion = cassandraVersion; this.clusterId = clusterId; this.conf = conf; } + @NotNull public BulkSparkConf getConf() { return conf; } - @Override - public String getLowestCassandraVersion() - { - return cassandraVersion; - } - @Override public Partitioner getPartitioner() { @@ -97,8 +92,8 @@ public String clusterId() } @Override - public ClusterInfo reconstruct() + public ClusterInfo reconstruct(CassandraVersion bridgeVersion) { - return new CassandraClusterInfo(this); + return new CassandraClusterInfo(this, bridgeVersion); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java index d5ad1e13d..9cba76fca 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java @@ -24,6 +24,7 @@ import java.util.List; import java.util.function.BiConsumer; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterSupport; import org.apache.cassandra.spark.data.partitioner.Partitioner; @@ -34,7 +35,7 @@ * Broadcastable wrapper for coordinated writes with ZERO transient fields to optimize Spark broadcasting. *

* This class wraps multiple BroadcastableCluster instances for multi-cluster scenarios. - * Pre-computed values (partitioner, lowestCassandraVersion) are extracted from CassandraClusterInfoGroup on the driver + * The pre-computed partitioner is extracted from CassandraClusterInfoGroup on the driver * to avoid duplicating aggregation/validation logic on executors. *

* Why ZERO transient fields matters:
@@ -56,12 +57,10 @@ public final class BroadcastableClusterInfoGroup implements IBroadcastableCluste private final String clusterId; private final BulkSparkConf conf; private final Partitioner partitioner; - private final String lowestCassandraVersion; /** * Creates a BroadcastableClusterInfoGroup from a source ClusterInfo group. - * Extracts pre-computed values (partitioner, lowestCassandraVersion) from the source - * to avoid duplicating aggregation/validation logic on executors. + * Extracts pre-computed partitioner from the source to avoid duplicating validation logic on executors. * * @param source the source CassandraClusterInfoGroup * @param conf the BulkSparkConf needed to connect to Sidecar on executors @@ -72,25 +71,22 @@ public static BroadcastableClusterInfoGroup from(@NotNull CassandraClusterInfoGr List broadcastableInfos = new ArrayList<>(); source.forEach((clusterId, clusterInfo) -> broadcastableInfos.add(BroadcastableClusterInfo.from(clusterInfo, conf))); - // Extract pre-computed values from CassandraClusterInfoGroup + // Extract pre-computed value from CassandraClusterInfoGroup // These have already been validated/computed on the driver Partitioner partitioner = source.getPartitioner(); - String lowestVersion = source.getLowestCassandraVersion(); - return new BroadcastableClusterInfoGroup(broadcastableInfos, source.clusterId(), conf, partitioner, lowestVersion); + return new BroadcastableClusterInfoGroup(broadcastableInfos, source.clusterId(), conf, partitioner); } private BroadcastableClusterInfoGroup(List clusterInfos, String clusterId, BulkSparkConf conf, - Partitioner partitioner, - String lowestCassandraVersion) + Partitioner partitioner) { this.clusterInfos = Collections.unmodifiableList(clusterInfos); this.conf = conf; this.clusterId = clusterId; this.partitioner = partitioner; - this.lowestCassandraVersion = lowestCassandraVersion; } @Override @@ -100,14 +96,6 @@ public BulkSparkConf getConf() return conf; } - @Override - public String 
getLowestCassandraVersion() - { - // Return pre-computed value from CassandraClusterInfoGroup - // No need to duplicate aggregation/validation logic - return lowestCassandraVersion; - } - @Override public Partitioner getPartitioner() { @@ -143,8 +131,8 @@ public IBroadcastableClusterInfo getValueOrNull(@NotNull String clusterId) } @Override - public ClusterInfo reconstruct() + public ClusterInfo reconstruct(CassandraVersion bridgeVersion) { - return CassandraClusterInfoGroup.from(this); + return CassandraClusterInfoGroup.from(this, bridgeVersion); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java index 322e815dd..bdcd91d43 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java @@ -24,6 +24,7 @@ import com.google.common.base.Preconditions; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.common.schema.ColumnType; import org.jetbrains.annotations.NotNull; @@ -58,7 +59,7 @@ public final class BroadcastableTableSchema implements Serializable private final WriteMode writeMode; private final TTLOption ttlOption; private final TimestampOption timestampOption; - private final String lowestCassandraVersion; + private final CassandraVersion bridgeVersion; private final boolean quoteIdentifiers; /** @@ -80,7 +81,7 @@ public static BroadcastableTableSchema from(@NotNull TableSchema source) source.writeMode, source.ttlOption, source.timestampOption, - source.lowestCassandraVersion, + source.bridgeVersion, source.quoteIdentifiers ); } @@ -94,7 +95,7 @@ private BroadcastableTableSchema(String createStatement, WriteMode writeMode, TTLOption ttlOption, TimestampOption timestampOption, - 
String lowestCassandraVersion, + CassandraVersion bridgeVersion, boolean quoteIdentifiers) { this.createStatement = createStatement; @@ -106,7 +107,7 @@ private BroadcastableTableSchema(String createStatement, this.writeMode = writeMode; this.ttlOption = ttlOption; this.timestampOption = timestampOption; - this.lowestCassandraVersion = lowestCassandraVersion; + this.bridgeVersion = bridgeVersion; this.quoteIdentifiers = quoteIdentifiers; } @@ -155,9 +156,9 @@ public TimestampOption getTimestampOption() return timestampOption; } - public String getLowestCassandraVersion() + public CassandraVersion getBridgeVersion() { - return lowestCassandraVersion; + return bridgeVersion; } public boolean isQuoteIdentifiers() diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java index a01c2d1e3..555f321e5 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java @@ -117,6 +117,9 @@ public class BulkSparkConf implements Serializable // Cassandra version of target cluster. Configuration parameter is exposed to be able to correctly initialize static // components, before cluster version is discovered via Sidecar. public static final String CASSANDRA_VERSION = SETTING_PREFIX + "cassandra.version"; + // Disable SSTable version-based bridge determination. When true, falls back to using cassandra.version for bridge selection. + // This provides a safety fallback mechanism if SSTable version detection fails or encounters issues. 
+ public static final String DISABLE_SSTABLE_VERSION_BASED_BRIDGE = SETTING_PREFIX + "bridge.disable_sstable_version_based"; public static final String HTTP_MAX_CONNECTIONS = SETTING_PREFIX + "request.max_connections"; public static final String HTTP_RESPONSE_TIMEOUT = SETTING_PREFIX + "request.response_timeout"; public static final String HTTP_CONNECTION_TIMEOUT = SETTING_PREFIX + "request.connection_timeout"; @@ -169,6 +172,7 @@ public class BulkSparkConf implements Serializable protected final String configuredJobId; protected boolean useOpenSsl; protected int ringRetryCount; + protected final boolean disableSSTableVersionBasedBridge; // create sidecarInstances from sidecarContactPointsValue and effectiveSidecarPort private final String sidecarContactPointsValue; // It takes comma separated values private transient Set sidecarContactPoints; // not serialized @@ -221,6 +225,7 @@ public BulkSparkConf(SparkConf conf, Map options, @Nullable Logg // else fall back to props, and then default if neither specified this.useOpenSsl = getBoolean(USE_OPENSSL, true); this.ringRetryCount = getInt(RING_RETRY_COUNT, DEFAULT_RING_RETRY_COUNT); + this.disableSSTableVersionBasedBridge = getBoolean(DISABLE_SSTABLE_VERSION_BASED_BRIDGE, false); this.importCoordinatorTimeoutMultiplier = getDouble(IMPORT_COORDINATOR_TIMEOUT_MULTIPLIER, 0.5); this.ttl = MapUtils.getOrDefault(options, WriterOptions.TTL.name(), null); this.timestamp = MapUtils.getOrDefault(options, WriterOptions.TIMESTAMP.name(), null); @@ -732,6 +737,25 @@ public int getRingRetryCount() return ringRetryCount; } + public boolean isSSTableVersionBasedBridgeDisabled() + { + return disableSSTableVersionBasedBridge; + } + + /** + * Utility method to retrieve the disable SSTable version-based bridge flag + * from Spark configuration. This can be called from contexts where a BulkSparkConf + * instance is not available. 
+ * + * @return true if SSTable version-based bridge selection should be disabled + */ + public static boolean getDisableSSTableVersionBasedBridge() + { + return org.apache.spark.SparkContext.getOrCreate() + .getConf() + .getBoolean(DISABLE_SSTABLE_VERSION_BASED_BRIDGE, false); + } + public StorageClientConfig getStorageClientConfig() { return storageClientConfig; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java index 0e126fc95..7e69ec1ef 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java @@ -21,6 +21,7 @@ import java.io.Serializable; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraCoordinatedBulkWriterContext; import org.jetbrains.annotations.NotNull; @@ -56,31 +57,32 @@ public class BulkWriterConfig implements Serializable // BroadcastableClusterInfo can be either BroadcastableCluster or BroadcastableClusterInfoGroup private final IBroadcastableClusterInfo clusterInfo; private final BroadcastableSchemaInfo schemaInfo; - private final String lowestCassandraVersion; + // SSTable version-based bridge loading fields + private final CassandraVersion bridgeVersion; /** * Creates a new immutable BulkWriterConfig with pre-computed values * - * @param conf Bulk writer Spark configuration - * @param sparkDefaultParallelism Spark default parallelism setting - * @param jobInfo Broadcastable job information - * @param clusterInfo Broadcastable cluster information (BroadcastableCluster or BroadcastableClusterInfoGroup) - * @param schemaInfo Broadcastable schema information - * @param lowestCassandraVersion Lowest Cassandra version in the cluster + * @param conf Bulk 
writer Spark configuration + * @param sparkDefaultParallelism Spark default parallelism setting + * @param jobInfo Broadcastable job information + * @param clusterInfo Broadcastable cluster information (BroadcastableCluster or BroadcastableClusterInfoGroup) + * @param schemaInfo Broadcastable schema information + * @param bridgeVersion Cassandra bridge version to use */ public BulkWriterConfig(@NotNull BulkSparkConf conf, int sparkDefaultParallelism, @NotNull BroadcastableJobInfo jobInfo, @NotNull IBroadcastableClusterInfo clusterInfo, @NotNull BroadcastableSchemaInfo schemaInfo, - @NotNull String lowestCassandraVersion) + @NotNull CassandraVersion bridgeVersion) { this.conf = conf; this.sparkDefaultParallelism = sparkDefaultParallelism; this.jobInfo = jobInfo; this.clusterInfo = clusterInfo; this.schemaInfo = schemaInfo; - this.lowestCassandraVersion = lowestCassandraVersion; + this.bridgeVersion = bridgeVersion; } public BulkSparkConf getConf() @@ -108,9 +110,9 @@ public BroadcastableSchemaInfo getBroadcastableSchemaInfo() return schemaInfo; } - public String getLowestCassandraVersion() + public CassandraVersion getBridgeVersion() { - return lowestCassandraVersion; + return bridgeVersion; } /** diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java index 8f240215c..a717f1fb9 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java @@ -22,6 +22,7 @@ import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraCoordinatedBulkWriterContext; import org.apache.cassandra.spark.common.stats.JobStatsPublisher; import org.apache.cassandra.bridge.CassandraBridge; +import org.apache.cassandra.bridge.CassandraVersion; import 
org.apache.spark.api.java.JavaSparkContext; /** @@ -47,6 +48,8 @@ public interface BulkWriterContext CassandraBridge bridge(); + CassandraVersion bridgeVersion(); + // NOTE: This interface intentionally does *not* implement AutoClosable as Spark can close Broadcast variables // that implement AutoClosable while they are still in use, causing the underlying object to become unusable void shutdown(); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java index e658b3dc7..14ab2958b 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java @@ -19,11 +19,13 @@ package org.apache.cassandra.spark.bulkwriter; +import java.util.Set; import java.util.UUID; import com.google.common.base.Preconditions; import org.apache.commons.lang3.StringUtils; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.types.StructType; @@ -38,6 +40,13 @@ // CHECKSTYLE IGNORE: This class cannot be declared as final, because consumers should be able to extend it public class CassandraBulkWriterContext extends AbstractBulkWriterContext { + // A temporary CassandraClusterInfo created with bridgeVersion=null during driver-side initialization. + // This preliminaryClusterInfo provides the Sidecar connectivity needed to determine the bridge version. + // Once bridge version is determined, buildClusterInfo() promotes this by setting its bridge version. 
+ // Marked transient because it is only used during the driver-side constructor and must not be serialized + // when broadcasting to executors. + private transient CassandraClusterInfo preliminaryClusterInfo; + protected CassandraBulkWriterContext(@NotNull BulkSparkConf conf, @NotNull StructType structType, int sparkDefaultParallelism) @@ -57,9 +66,34 @@ protected CassandraBulkWriterContext(@NotNull BulkWriterConfig config) } @Override - protected ClusterInfo buildClusterInfo() + protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) + { + return getOrCreatePreliminaryClusterInfo(conf).getLowestCassandraVersion(); + } + + @Override + protected Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) { - return new CassandraClusterInfo(bulkSparkConf()); + return getOrCreatePreliminaryClusterInfo(conf).getSSTableVersionsOnCluster(); + } + + @Override + protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) + { + CassandraClusterInfo clusterInfo = getOrCreatePreliminaryClusterInfo(bulkSparkConf()); + preliminaryClusterInfo = null; + clusterInfo.setBridgeVersion(bridgeVersion); + return clusterInfo; + } + + private CassandraClusterInfo getOrCreatePreliminaryClusterInfo(BulkSparkConf conf) + { + if (preliminaryClusterInfo == null) + { + preliminaryClusterInfo = new CassandraClusterInfo(conf, null); + } + + return preliminaryClusterInfo; } @Override @@ -95,6 +129,6 @@ public BulkWriterConfig toBulkWriterConfigForBroadcasting(JavaSparkContext spark broadcastableJobInfo, broadcastableClusterInfo, broadcastableSchemaInfo, - lowestCassandraVersion()); + bridgeVersion()); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java index a9c120871..207bbc96b 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java +++ 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java @@ -33,7 +33,7 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; + import java.util.stream.Collectors; import com.google.common.annotations.VisibleForTesting; @@ -47,6 +47,7 @@ import o.a.c.sidecar.client.shaded.common.response.TokenRangeReplicasResponse; import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.clients.Sidecar; import o.a.c.sidecar.client.shaded.client.SidecarInstance; @@ -84,7 +85,6 @@ public class CassandraClusterInfo implements ClusterInfo, Closeable // Changes here must be reflected in BroadcastableClusterInfo. protected final BulkSparkConf conf; protected final String clusterId; - protected String cassandraVersion; protected Partitioner partitioner; // -- Driver-only fields (not broadcast) -- @@ -95,42 +95,50 @@ public class CassandraClusterInfo implements ClusterInfo, Closeable protected volatile String keyspaceSchema; protected volatile ReplicationFactor replicationFactor; protected volatile CassandraContext cassandraContext; - protected final AtomicReference nodeSettings; - protected final List> allNodeSettingFutures; + protected volatile CassandraVersion bridgeVersion; + private final List> allNodeSettingFutures; + private List resolvedNodeSettings; - public CassandraClusterInfo(BulkSparkConf conf) + public CassandraClusterInfo(BulkSparkConf conf, CassandraVersion bridgeVersion) { - this(conf, null); + this(conf, null, bridgeVersion); } - // Used by CassandraClusterInfoGroup - public CassandraClusterInfo(BulkSparkConf conf, String clusterId) + /** + * Constructor with bridge version for driver-side usage. 
+ * + * @param conf Bulk Spark configuration + * @param clusterId Optional cluster identifier + * @param bridgeVersion Determined bridge version (nullable for preliminary construction) + */ + public CassandraClusterInfo(BulkSparkConf conf, String clusterId, CassandraVersion bridgeVersion) { this.conf = conf; this.clusterId = clusterId; + this.bridgeVersion = bridgeVersion; this.cassandraContext = buildCassandraContext(); LOGGER.info("Getting Cassandra versions from all nodes"); - this.nodeSettings = new AtomicReference<>(null); this.allNodeSettingFutures = Sidecar.allNodeSettings(cassandraContext.getSidecarClient(), cassandraContext.getCluster()); } /** * Reconstruct from BroadcastableCluster on executor. - * Reuses cassandraVersion and partitioner from broadcast, + * Reuses partitioner and bridge version from broadcast, * fetches other data (tokenRangeMapping, replicationFactor, keyspaceSchema, writeAvailability) fresh from Sidecar. * * @param broadcastable the broadcastable cluster info from broadcast + * @param bridgeVersion the bridge version from broadcast */ - public CassandraClusterInfo(BroadcastableClusterInfo broadcastable) + public CassandraClusterInfo(BroadcastableClusterInfo broadcastable, CassandraVersion bridgeVersion) { this.conf = broadcastable.getConf(); this.clusterId = broadcastable.clusterId(); - this.cassandraVersion = broadcastable.getLowestCassandraVersion(); this.partitioner = broadcastable.getPartitioner(); + this.bridgeVersion = bridgeVersion; this.cassandraContext = buildCassandraContext(); - LOGGER.info("Reconstructing CassandraClusterInfo on executor from BroadcastableCluster. clusterId={}", clusterId); - this.nodeSettings = new AtomicReference<>(null); + LOGGER.info("Reconstructing CassandraClusterInfo on executor from BroadcastableCluster. clusterId={}, bridgeVersion={}", + clusterId, bridgeVersion != null ? 
bridgeVersion.versionName() : "null"); - // Executors do not need to query all node settings since cassandraVersion is already set from broadcast + // Executors do not need to query all node settings since the bridge version is already set from broadcast this.allNodeSettingFutures = null; } @@ -208,24 +216,9 @@ public Partitioner getPartitioner() { if (partitioner == null) { - try - { - String partitionerString; - NodeSettings currentNodeSettings = nodeSettings.get(); - if (currentNodeSettings != null) - { - partitionerString = currentNodeSettings.partitioner(); - } - else - { - partitionerString = getCassandraContext().getSidecarClient().nodeSettings().get().partitioner(); - } - partitioner = Partitioner.from(partitionerString); - } - catch (ExecutionException | InterruptedException exception) - { - throw new RuntimeException("Unable to retrieve partitioner information", exception); - } + List settings = resolveAllNodeSettings(); + String partitionerString = settings.get(0).partitioner(); + partitioner = Partitioner.from(partitionerString); } return partitioner; } @@ -390,33 +383,6 @@ public TokenRangeMapping getTokenRangeMapping(boolean cached) } } - @Override - public String getLowestCassandraVersion() - { - String currentCassandraVersion = cassandraVersion; - if (currentCassandraVersion != null) - { - return currentCassandraVersion; - } - - synchronized (this) - { - if (cassandraVersion == null) - { - String versionFromFeature = getVersionFromFeature(); - if (versionFromFeature != null) - { - // Forcing writer to use a particular version - cassandraVersion = versionFromFeature; - } - else - { - cassandraVersion = getVersionFromSidecar(); - } - } - } - return cassandraVersion; - } @Override public Map clusterWriteAvailability() @@ -447,22 +413,30 @@ private TokenRangeMapping getTokenRangeReplicasFromSidecar() metadata -> new RingInstance(metadata, clusterId())); } - public String getVersionFromFeature() + /** + * Sets the bridge version after preliminary construction.
+ * This allows constructing CassandraClusterInfo with a null bridgeVersion for early + * context reuse, then setting the version once it has been determined. + * + * @param bridgeVersion the determined Cassandra bridge version + */ + public void setBridgeVersion(CassandraVersion bridgeVersion) { - return null; + this.bridgeVersion = bridgeVersion; } - protected List getAllNodeSettings() + /** + * Resolves the node settings futures on first call and caches the result. + * + * @return list of resolved NodeSettings from all nodes + */ + private synchronized List resolveAllNodeSettings() { - if (allNodeSettingFutures == null) + if (resolvedNodeSettings != null) { - throw new IllegalStateException("getAllNodeSettings should not be called on executor. " - + "Cassandra version is pre-computed on driver and broadcast to executors."); + return resolvedNodeSettings; } - // Worst-case, the http client is configured for 1 worker pool. - // In that case, each future can take the full retry delay * number of retries, - // and each instance will be processed serially. final long totalTimeout = conf.getSidecarRequestMaxRetryDelayMillis() * conf.getSidecarRequestRetries() * allNodeSettingFutures.size(); @@ -481,43 +455,59 @@ else if (allNodeSettings.size() < allNodeSettingFutures.size()) allNodeSettings.size(), allNodeSettingFutures.size()); } - return allNodeSettings; + resolvedNodeSettings = allNodeSettings; + return resolvedNodeSettings; } - public String getVersionFromSidecar() + /** + * Retrieves the lowest Cassandra version using the already-fired allNodeSettingFutures. + * Reuses the existing CassandraContext instead of creating a separate one. 
+ * + * @return lowest Cassandra version string + */ + public String getLowestCassandraVersion() { - NodeSettings nodeSettings = this.nodeSettings.get(); - if (nodeSettings != null) - { - return nodeSettings.releaseVersion(); - } + List allNodeSettings = resolveAllNodeSettings(); + + NodeSettings ns = allNodeSettings + .stream() + .filter(settings -> !settings.releaseVersion().equalsIgnoreCase("unknown")) + .min(Comparator.comparing(settings -> + CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(settings.releaseVersion()))) + .orElseThrow(() -> new RuntimeException("No valid Cassandra Versions were returned from Cassandra Sidecar")); - return getLowestVersion(getAllNodeSettings()); + return ns.releaseVersion(); } - @VisibleForTesting - public String getLowestVersion(List allNodeSettings) + /** + * Retrieves SSTable versions using the existing cassandraContext. + * Reuses the existing CassandraContext instead of creating a separate one. + * + * @return set of SSTable version strings present on the cluster + */ + public Set getSSTableVersionsOnCluster() { - NodeSettings ns = this.nodeSettings.get(); - if (ns != null) - { - return ns.releaseVersion(); - } + CassandraContext context = getCassandraContext(); - // It is possible to run the below computation multiple times. Since the computation is local-only, it is OK. 
- ns = allNodeSettings - .stream() - .filter(settings -> !settings.releaseVersion().equalsIgnoreCase("unknown")) - .min(Comparator.comparing(settings -> - CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(settings.releaseVersion()))) - .orElseThrow(() -> new RuntimeException("No valid Cassandra Versions were returned from Cassandra Sidecar")); - nodeSettings.compareAndSet(null, ns); - return ns.releaseVersion(); + return Sidecar.getSSTableVersionsFromCluster( + context.getSidecarClient(), + context.getCluster(), + conf.getSidecarRequestMaxRetryDelayMillis(), + conf.getSidecarRequestRetries() + ); } protected CassandraBridge bridge() { - return CassandraBridgeFactory.get(getLowestCassandraVersion()); + // Use the pre-determined bridgeVersion if available + if (bridgeVersion != null) + { + return CassandraBridgeFactory.get(bridgeVersion); + } + + // Bridge version must be set before accessing bridge + throw new IllegalStateException( + "Bridge version must be set during construction before using bridge()."); } // Startup Validation diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java index bed36ac4b..7f2db54ca 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java @@ -25,7 +25,6 @@ import com.google.common.collect.Range; import org.apache.cassandra.bridge.CassandraBridge; -import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.spark.bulkwriter.token.ReplicaAwareFailureHandler; import org.jetbrains.annotations.NotNull; @@ -36,12 +35,15 @@ public class CassandraDirectDataTransportContext implements TransportContext.Dir @NotNull 
private final ClusterInfo clusterInfo; @NotNull + private final CassandraBridge bridge; + @NotNull private final DirectDataTransferApi dataTransferApi; public CassandraDirectDataTransportContext(@NotNull BulkWriterContext bulkWriterContext) { this.jobInfo = bulkWriterContext.job(); this.clusterInfo = bulkWriterContext.cluster(); + this.bridge = bulkWriterContext.bridge(); // Use bridge from context (SSTable version-based) this.dataTransferApi = createDirectDataTransferApi(); } @@ -71,7 +73,6 @@ public DirectDataTransferApi dataTransferApi() // only invoke in constructor protected DirectDataTransferApi createDirectDataTransferApi() { - CassandraBridge bridge = CassandraBridgeFactory.get(clusterInfo.getLowestCassandraVersion()); return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java index 64b654449..a5d0f66d1 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java @@ -24,6 +24,7 @@ import com.google.common.collect.Range; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; import org.apache.cassandra.spark.data.ReplicationFactor; import org.apache.cassandra.spark.data.partitioner.Partitioner; @@ -43,7 +44,7 @@ * for broadcasting to executors via {@link BulkWriterConfig}. *

* On executors, ClusterInfo instances are reconstructed from the broadcastable wrappers using - * {@link AbstractBulkWriterContext#reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo)}. + * {@link AbstractBulkWriterContext#reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo, CassandraVersion)}. */ public interface ClusterInfo extends StartupValidatable { @@ -51,8 +52,6 @@ public interface ClusterInfo extends StartupValidatable TokenRangeMapping getTokenRangeMapping(boolean cached); - String getLowestCassandraVersion(); - /** * @return WriteAvailability per RingInstance in the cluster */ diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java index fb4eb2e29..a1c0af404 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java @@ -21,6 +21,7 @@ import java.io.Serializable; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.data.partitioner.Partitioner; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -37,10 +38,9 @@ * Methods in this interface: *

<ul>
 *   <li>{@link #getPartitioner()} - static cluster partitioner configuration</li>
- *   <li>{@link #getLowestCassandraVersion()} - pre-computed version string</li>
 *   <li>{@link #clusterId()} - cluster identifier (optional)</li>
 *   <li>{@link #getConf()} - BulkSparkConf needed for reconstruction on executors</li>
- *   <li>{@link #reconstruct()} - reconstructs full ClusterInfo instance on executors</li>
+ *   <li>{@link #reconstruct(CassandraVersion)} - reconstructs full ClusterInfo instance on executors with bridge version</li>
 * </ul>
*/ public interface IBroadcastableClusterInfo extends Serializable @@ -50,11 +50,6 @@ public interface IBroadcastableClusterInfo extends Serializable */ Partitioner getPartitioner(); - /** - * @return the lowest Cassandra version in the cluster - */ - String getLowestCassandraVersion(); - /** * ID string that can uniquely identify a cluster. * When writing to a single cluster, this may be null. @@ -77,7 +72,8 @@ public interface IBroadcastableClusterInfo extends Serializable * This allows adding new broadcastable types without modifying the reconstruction logic * in {@link AbstractBulkWriterContext}. * + * @param bridgeVersion the bridge version from broadcast * @return reconstructed ClusterInfo (CassandraClusterInfo or CassandraClusterInfoGroup) */ - ClusterInfo reconstruct(); + ClusterInfo reconstruct(CassandraVersion bridgeVersion); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java index 77b8f5f42..f2350c5d4 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java @@ -22,10 +22,14 @@ import java.util.Set; import org.apache.cassandra.bridge.CassandraBridge; -import org.apache.cassandra.bridge.CassandraBridgeFactory; -import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.bridge.SSTableWriter; +/** + * Factory for creating SSTableWriter instances. + * + * @deprecated This class is deprecated. Use {@code BulkWriterContext.bridge().getSSTableWriter()} directly instead. 
+ */ +@Deprecated public final class SSTableWriterFactory { private SSTableWriterFactory() @@ -33,7 +37,13 @@ private SSTableWriterFactory() throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); } - public static SSTableWriter getSSTableWriter(CassandraVersionFeatures serverVersion, + /** + * Creates an SSTableWriter using the provided bridge. + * + * @deprecated Use {@code cassandraBridge.getSSTableWriter()} directly instead. + */ + @Deprecated + public static SSTableWriter getSSTableWriter(CassandraBridge cassandraBridge, String inDirectory, String partitioner, String createStatement, @@ -41,7 +51,6 @@ public static SSTableWriter getSSTableWriter(CassandraVersionFeatures serverVers Set userDefinedTypeStatements, int bufferSizeMB) { - CassandraBridge cassandraBridge = CassandraBridgeFactory.get(serverVersion); return cassandraBridge.getSSTableWriter(inDirectory, partitioner, createStatement, diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java index 877aafa15..5d942ace6 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java @@ -37,9 +37,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; -import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.bridge.SSTableDescriptor; import org.apache.cassandra.spark.common.Digest; import org.apache.cassandra.spark.common.SSTables; @@ -120,26 +118,17 @@ public SortedSSTableWriter(BulkWriterContext writerContext, Path outDir, DigestA this.digestAlgorithm = digestAlgorithm; this.partitionId = 
partitionId; - String lowestCassandraVersion = writerContext.cluster().getLowestCassandraVersion(); - String packageVersion = getPackageVersion(lowestCassandraVersion); - LOGGER.info("Running with version {}", packageVersion); - SchemaInfo schema = writerContext.schema(); TableSchema tableSchema = schema.getTableSchema(); - this.cqlSSTableWriter = SSTableWriterFactory.getSSTableWriter( - CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(packageVersion), - this.outDir.toString(), - writerContext.cluster().getPartitioner().toString(), - tableSchema.createStatement, - tableSchema.modificationStatement, - schema.getUserDefinedTypeStatements(), - writerContext.job().sstableDataSizeInMiB()); - } - @NotNull - public String getPackageVersion(String lowestCassandraVersion) - { - return CASSANDRA_VERSION_PREFIX + lowestCassandraVersion; + // Use the bridge from context which is already loaded based on SSTable version analysis + this.cqlSSTableWriter = writerContext.bridge().getSSTableWriter( + this.outDir.toString(), + writerContext.cluster().getPartitioner().toString(), + tableSchema.createStatement, + tableSchema.modificationStatement, + schema.getUserDefinedTypeStatements(), + writerContext.job().sstableDataSizeInMiB()); } /** @@ -363,7 +352,7 @@ public void validateSSTables(@NotNull BulkWriterContext writerContext, @NotNull private LocalDataLayer buildLocalDataLayer(@NotNull BulkWriterContext writerContext, @NotNull Path outputDirectory, @Nullable Set dataFilePaths) { - CassandraVersion version = CassandraBridgeFactory.getCassandraVersion(writerContext.cluster().getLowestCassandraVersion()); + CassandraVersion version = writerContext.bridgeVersion(); String keyspace = writerContext.job().qualifiedTableName().keyspace(); String schema = writerContext.schema().getTableSchema().createStatement; Partitioner partitioner = writerContext.cluster().getPartitioner(); diff --git 
a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java index b198af984..69870807c 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java @@ -33,6 +33,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.common.schema.ColumnType; import org.apache.cassandra.spark.data.CqlField; import org.apache.cassandra.spark.exception.UnsupportedAnalyticsOperationException; @@ -60,7 +61,7 @@ public class TableSchema final WriteMode writeMode; final TTLOption ttlOption; final TimestampOption timestampOption; - final String lowestCassandraVersion; + final CassandraVersion bridgeVersion; final boolean quoteIdentifiers; public TableSchema(StructType dfSchema, @@ -68,7 +69,7 @@ public TableSchema(StructType dfSchema, WriteMode writeMode, TTLOption ttlOption, TimestampOption timestampOption, - String lowestCassandraVersion, + CassandraVersion bridgeVersion, boolean quoteIdentifiers, boolean skipSecondaryIndexCheck) { @@ -76,7 +77,7 @@ public TableSchema(StructType dfSchema, this.ttlOption = ttlOption; this.timestampOption = timestampOption; - this.lowestCassandraVersion = lowestCassandraVersion; + this.bridgeVersion = bridgeVersion; this.quoteIdentifiers = quoteIdentifiers; validateDataFrameCompatibility(dfSchema, tableInfo); @@ -95,7 +96,7 @@ else if (tableInfo.hasSecondaryIndex()) + "take place automatically after writing. 
Reads against the index during this time " + "window will produce inconsistent or stale results until index rebuild is complete."); } - validateUserAddedColumns(lowestCassandraVersion, quoteIdentifiers, ttlOption, timestampOption); + validateUserAddedColumns(bridgeVersion, quoteIdentifiers, ttlOption, timestampOption); this.createStatement = getCreateStatement(tableInfo); this.modificationStatement = getModificationStatement(dfSchema, tableInfo); @@ -123,7 +124,7 @@ public TableSchema(BroadcastableTableSchema broadcastable) this.writeMode = broadcastable.getWriteMode(); this.ttlOption = broadcastable.getTtlOption(); this.timestampOption = broadcastable.getTimestampOption(); - this.lowestCassandraVersion = broadcastable.getLowestCassandraVersion(); + this.bridgeVersion = broadcastable.getBridgeVersion(); this.quoteIdentifiers = broadcastable.isQuoteIdentifiers(); } @@ -198,8 +199,7 @@ private String getInsertStatement(StructType dfSchema, TTLOption ttlOption, TimestampOption timestampOption) { - CassandraBridge bridge = CassandraBridgeFactory.get(lowestCassandraVersion); - + CassandraBridge bridge = CassandraBridgeFactory.get(bridgeVersion); List columnNames = Arrays.stream(dfSchema.fieldNames()) .filter(fieldName -> !fieldName.equals(ttlOption.columnName())) .filter(fieldName -> !fieldName.equals(timestampOption.columnName())) @@ -242,7 +242,7 @@ else if (ttlOption.withTTl()) private String getDeleteStatement(StructType dfSchema, TableInfoProvider tableInfo) { - CassandraBridge bridge = CassandraBridgeFactory.get(lowestCassandraVersion); + CassandraBridge bridge = CassandraBridgeFactory.get(bridgeVersion); Stream fieldEqualityStatements = Arrays.stream(dfSchema.fieldNames()).map(key -> maybeQuotedIdentifier(bridge, quoteIdentifiers, key) + "=?"); String deleteStatement = String.format("DELETE FROM %s.%s where %s;", maybeQuotedIdentifier(bridge, quoteIdentifiers, tableInfo.getKeyspaceName()), @@ -327,12 +327,12 @@ private static List getKeyFieldPositions(StructType 
dfSchema, .collect(Collectors.toList()); } - private static void validateUserAddedColumns(String lowestCassandraVersion, boolean quoteIdentifiers, + private static void validateUserAddedColumns(CassandraVersion bridgeVersion, boolean quoteIdentifiers, TTLOption ttlOption, TimestampOption timestampOption) { if (!quoteIdentifiers) { - CassandraBridge bridge = CassandraBridgeFactory.get(lowestCassandraVersion); + CassandraBridge bridge = CassandraBridgeFactory.get(bridgeVersion); validateColumnName(bridge, ttlOption.columnName(), WriterOptions.TTL.name()); validateColumnName(bridge, timestampOption.columnName(), WriterOptions.TIMESTAMP.name()); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java index d2a401f29..f99a543ce 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java @@ -36,7 +36,6 @@ import o.a.c.sidecar.client.shaded.common.request.data.CreateSliceRequestPayload; import o.a.c.sidecar.client.shaded.common.response.data.RestoreJobSummaryResponsePayload; import org.apache.cassandra.bridge.CassandraBridge; -import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.SSTableDescriptor; import org.apache.cassandra.clients.Sidecar; import o.a.c.sidecar.client.shaded.client.SidecarInstance; @@ -82,7 +81,7 @@ public CloudStorageStreamSession(BulkWriterContext bulkWriterContext, SortedSSTa ExecutorService executorService) { this(bulkWriterContext, sstableWriter, transportContext, sessionID, tokenRange, - CassandraBridgeFactory.get(bulkWriterContext.cluster().getLowestCassandraVersion()), + bulkWriterContext.bridge(), // Use 
bridge from context (SSTable version-based) failureHandler, executorService); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java index 4a434a7a5..365cef5a0 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -24,6 +24,7 @@ import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -40,6 +41,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.bulkwriter.CassandraClusterInfo; @@ -84,35 +86,37 @@ public class CassandraClusterInfoGroup implements ClusterInfo, MultiClusterSuppo private volatile TokenRangeMapping consolidatedTokenRangeMapping; // Pre-computed values from BroadcastableClusterInfoGroup (only set when reconstructed on executors) private Partitioner cachedPartitioner; - private String cachedLowestCassandraVersion; /** * Creates {@link CassandraClusterInfoGroup} with the list of {@link ClusterInfo} from {@link BulkSparkConf} and validation * The validation ensures non-empty list of {@link ClusterInfo}, where all objects have non-empty and unique clusterId * @param conf bulk write conf + * @param bridgeVersion bridge version (nullable for preliminary construction) * @return new {@link CassandraClusterInfoGroup} instance */ - public static 
CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf) + public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf, CassandraVersion bridgeVersion) { - return fromBulkSparkConf(conf, clusterId -> new CassandraClusterInfo(conf, clusterId)); + return fromBulkSparkConf(conf, clusterId -> new CassandraClusterInfo(conf, clusterId, bridgeVersion)); } + /** * Reconstruct from BroadcastableClusterInfoGroup on executor. * Creates CassandraClusterInfo instances for each cluster that will fetch data from Sidecar. - * Leverages pre-computed values (partitioner, lowestCassandraVersion) from the broadcastable + * Leverages pre-computed values (partitioner, bridgeVersion) from the broadcastable * to avoid re-validation and re-computation on executors. * * @param broadcastable the broadcastable cluster info group from broadcast * @return new {@link CassandraClusterInfoGroup} instance */ - public static CassandraClusterInfoGroup from(BroadcastableClusterInfoGroup broadcastable) + public static CassandraClusterInfoGroup from(BroadcastableClusterInfoGroup broadcastable, + CassandraVersion bridgeVersion) { - return new CassandraClusterInfoGroup(broadcastable); + return new CassandraClusterInfoGroup(broadcastable, bridgeVersion); } /** - * Similar to {@link #fromBulkSparkConf(BulkSparkConf)} but takes additional function to create {@link ClusterInfo} + * Similar to {@link #fromBulkSparkConf(BulkSparkConf, CassandraVersion)} but takes additional function to create {@link ClusterInfo} */ public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf, Function clusterInfoFactory) { @@ -163,19 +167,18 @@ private CassandraClusterInfoGroup(List clusterInfos) * re-validation and re-computation on executors.
* * @param broadcastable the broadcastable cluster info group from broadcast + * @param bridgeVersion the bridge version from broadcast */ - private CassandraClusterInfoGroup(BroadcastableClusterInfoGroup broadcastable) + private CassandraClusterInfoGroup(BroadcastableClusterInfoGroup broadcastable, CassandraVersion bridgeVersion) { // Build list of ClusterInfo from broadcastable data List clusterInfosList = new ArrayList<>(); - broadcastable.forEach((clusterId, broadcastableInfo) -> { - clusterInfosList.add(new CassandraClusterInfo((BroadcastableClusterInfo) broadcastableInfo)); - }); + broadcastable.forEach((clusterId, broadcastableInfo) -> clusterInfosList.add( + new CassandraClusterInfo((BroadcastableClusterInfo) broadcastableInfo, bridgeVersion))); this.clusterInfos = Collections.unmodifiableList(clusterInfosList); // Extract pre-computed values from driver to avoid re-validation on executors this.cachedPartitioner = broadcastable.getPartitioner(); - this.cachedLowestCassandraVersion = broadcastable.getLowestCassandraVersion(); this.clusterId = broadcastable.clusterId(); clusterInfoById(); } @@ -211,38 +214,6 @@ public TokenRangeMapping getTokenRangeMapping(boolean cached) return consolidatedTokenRangeMapping; } - /** - * @return the lowest cassandra version among all clusters - */ - @Override - public String getLowestCassandraVersion() - { - // Return cached value if available (executor-side reconstruction) - if (cachedLowestCassandraVersion != null) - { - return cachedLowestCassandraVersion; - } - - if (clusterInfos.size() == 1) - { - return clusterInfos.get(0).getLowestCassandraVersion(); - } - - Map aggregated = applyOnEach(ClusterInfo::getLowestCassandraVersion); - List versions = aggregated.values() - .stream() - .map(CassandraVersionFeatures::cassandraVersionFeaturesFromCassandraVersion) - .sorted() - .collect(Collectors.toList()); - CassandraVersionFeatures first = versions.get(0); - CassandraVersionFeatures last = versions.get(versions.size() - 1); - 
Preconditions.checkState(first.getMajorVersion() == last.getMajorVersion(), - "Cluster versions are not compatible. lowest=%s and highest=%s", - first.getRawVersionString(), last.getRawVersionString()); - - return first.getRawVersionString(); - } - @Override public Map clusterWriteAvailability() { @@ -398,9 +369,62 @@ private RuntimeException toRuntimeException(ClusterInfo clusterInfo, Throwable c return new RuntimeException("Failed to perform action on cluster: " + clusterInfo.clusterId(), cause); } - @VisibleForTesting - Map clusterInfoByIdUnsafe() + /** + * Sets the bridge version on all contained CassandraClusterInfo instances. + * + * @param bridgeVersion the determined Cassandra bridge version + */ + public void setBridgeVersion(CassandraVersion bridgeVersion) { - return clusterInfoById; + for (ClusterInfo ci : clusterInfos) + { + ((CassandraClusterInfo) ci).setBridgeVersion(bridgeVersion); + } + } + + /** + * Retrieves the lowest Cassandra version from all contained clusters. + * + * @return lowest Cassandra version string across all clusters + */ + public String getLowestCassandraVersion() + { + Map clusterVersions = new HashMap<>(); + for (ClusterInfo ci : clusterInfos) + { + CassandraClusterInfo cci = (CassandraClusterInfo) ci; + clusterVersions.put(ci.clusterId(), cci.getLowestCassandraVersion()); + } + + // Find the lowest version across all clusters + List versions = clusterVersions.values() + .stream() + .map(CassandraVersionFeatures::cassandraVersionFeaturesFromCassandraVersion) + .sorted() + .collect(Collectors.toList()); + + CassandraVersionFeatures first = versions.get(0); + CassandraVersionFeatures last = versions.get(versions.size() - 1); + Preconditions.checkState(first.getMajorVersion() == last.getMajorVersion(), + "Cluster versions are not compatible. 
lowest=%s and highest=%s", + first.getRawVersionString(), last.getRawVersionString()); + + return first.getRawVersionString(); + } + + /** + * Retrieves aggregated SSTable versions from all contained clusters. + * + * @return set of SSTable versions present across all clusters + */ + public Set getSSTableVersionsOnCluster() + { + Set aggregatedSSTableVersions = new HashSet<>(); + for (ClusterInfo ci : clusterInfos) + { + CassandraClusterInfo cci = (CassandraClusterInfo) ci; + aggregatedSSTableVersions.addAll(cci.getSSTableVersionsOnCluster()); + } + return aggregatedSSTableVersions; } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java index 6cd199bf2..410a2ab27 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java @@ -19,11 +19,13 @@ package org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated; +import java.util.Set; import java.util.UUID; import com.google.common.base.Preconditions; import org.apache.commons.lang3.StringUtils; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.AbstractBulkWriterContext; import org.apache.cassandra.spark.bulkwriter.BroadcastableClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.BroadcastableJobInfo; @@ -46,6 +48,13 @@ */ public class CassandraCoordinatedBulkWriterContext extends AbstractBulkWriterContext { + // A temporary CassandraClusterInfoGroup created with bridgeVersion=null during driver-side initialization. 
+ // This preliminaryGroup provides the Sidecar connectivity needed to determine the bridge version. + // Once bridge version is determined, buildClusterInfo() promotes this by setting its bridge version. + // Marked transient because it is only used during the driver-side constructor and must not be serialized + // when broadcasting to executors. + private transient CassandraClusterInfoGroup preliminaryGroup; + public CassandraCoordinatedBulkWriterContext(@NotNull BulkSparkConf conf, @NotNull StructType structType, int sparkDefaultParallelism) @@ -77,11 +86,35 @@ private void validateConfiguration(BulkSparkConf conf) } @Override - protected ClusterInfo buildClusterInfo() + protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) + { + return getOrCreatePreliminaryGroup(conf).getLowestCassandraVersion(); + } + + @Override + protected Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) { - CassandraClusterInfoGroup clusterInfoGroup = CassandraClusterInfoGroup.fromBulkSparkConf(bulkSparkConf()); - clusterInfoGroup.startupValidate(); - return clusterInfoGroup; + return getOrCreatePreliminaryGroup(conf).getSSTableVersionsOnCluster(); + } + + @Override + protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) + { + CassandraClusterInfoGroup group = getOrCreatePreliminaryGroup(bulkSparkConf()); + preliminaryGroup = null; + group.setBridgeVersion(bridgeVersion); + group.startupValidate(); + return group; + } + + private CassandraClusterInfoGroup getOrCreatePreliminaryGroup(BulkSparkConf conf) + { + if (preliminaryGroup == null) + { + preliminaryGroup = CassandraClusterInfoGroup.fromBulkSparkConf(conf, (CassandraVersion) null); + } + + return preliminaryGroup; } @Override @@ -134,6 +167,6 @@ public BulkWriterConfig toBulkWriterConfigForBroadcasting(JavaSparkContext spark broadcastableJobInfo, broadcastableClusterInfo, broadcastableSchemaInfo, - lowestCassandraVersion()); + bridgeVersion()); } } diff --git 
a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java index 99cbc6827..ee4b47720 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java @@ -68,6 +68,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.bridge.SSTableVersionAnalyzer; import org.apache.cassandra.clients.ExecutorHolder; import org.apache.cassandra.clients.Sidecar; import org.apache.cassandra.secrets.SslConfig; @@ -77,6 +78,8 @@ import o.a.c.sidecar.client.shaded.client.SidecarInstanceImpl; import o.a.c.sidecar.client.shaded.client.SimpleSidecarInstancesProvider; import o.a.c.sidecar.client.shaded.client.exception.RetriesExhaustedException; +import org.apache.cassandra.spark.KryoRegister; +import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.common.SidecarInstanceFactory; import org.apache.cassandra.spark.common.SizingFactory; import org.apache.cassandra.spark.config.SchemaFeature; @@ -140,6 +143,7 @@ public class CassandraDataLayer extends PartitionedDataLayer implements StartupV protected Map rfMap; @Nullable protected String lastModifiedTimestampField; + protected Set sstableVersionsOnCluster; // volatile in order to publish the reference for visibility protected volatile CqlTable cqlTable; protected transient TimeProvider timeProvider; @@ -185,7 +189,7 @@ protected CassandraDataLayer(@Nullable String keyspace, @Nullable SslConfig sslConfig, @NotNull CqlTable cqlTable, @NotNull TokenPartitioner tokenPartitioner, - @NotNull CassandraVersion version, + @NotNull CassandraVersion bridgeVersion, @NotNull ConsistencyLevel 
consistencyLevel, @NotNull String sidecarInstances, @NotNull int sidecarPort, @@ -198,11 +202,12 @@ protected CassandraDataLayer(@Nullable String keyspace, List requestedFeatures, @NotNull Map rfMap, TimeProvider timeProvider, - SSTableTimeRangeFilter sstableTimeRangeFilter) + SSTableTimeRangeFilter sstableTimeRangeFilter, + Set sstableVersionsOnCluster) { super(consistencyLevel, datacenter); this.snapshotName = snapshotName; - this.bridge = CassandraBridgeFactory.get(version); + this.bridge = CassandraBridgeFactory.get(bridgeVersion); this.keyspace = keyspace; this.table = table; this.quoteIdentifiers = quoteIdentifiers; @@ -225,6 +230,7 @@ protected CassandraDataLayer(@Nullable String keyspace, this.rfMap = rfMap; this.timeProvider = timeProvider; this.sstableTimeRangeFilter = sstableTimeRangeFilter; + this.sstableVersionsOnCluster = sstableVersionsOnCluster; this.maybeQuoteKeyspaceAndTable(); this.initSidecarClient(); this.initInstanceMap(); @@ -272,7 +278,11 @@ private int initBulkReader(@NotNull ClientConfig options) throws ExecutionExcept NodeSettings nodeSettings = sidecar.nodeSettings().get(); String cassandraVersion = getEffectiveCassandraVersionForRead(clusterConfig, nodeSettings); Partitioner partitioner = Partitioner.from(nodeSettings.partitioner()); - bridge = CassandraBridgeFactory.get(cassandraVersion); + + // Initialize SSTable versions and bridge version + CassandraVersion bridgeVersion = initializeSSTableVersionsAndBridgeVersion(cassandraVersion); + bridge = CassandraBridgeFactory.get(bridgeVersion); + // optionally quote identifiers if the option has been set, we need an instance for the bridge maybeQuoteKeyspaceAndTable(); @@ -324,6 +334,73 @@ private int initBulkReader(@NotNull ClientConfig options) throws ExecutionExcept return effectiveNumberOfCores; } + /** + * Checks if SSTable version-based bridge selection is disabled. + * Protected to allow test overrides without SparkContext. 
+ * + * @return true if disabled, false if enabled + */ + @VisibleForTesting + protected boolean isSSTableVersionBasedBridgeDisabled() + { + return BulkSparkConf.getDisableSSTableVersionBasedBridge(); + } + + /** + * Initializes SSTable versions from cluster gossip and determines bridge version. + * + * @param cassandraVersion the Cassandra version + * @return the determined bridge version + */ + @VisibleForTesting + protected CassandraVersion initializeSSTableVersionsAndBridgeVersion(String cassandraVersion) + { + // Check if user has explicitly disabled SSTable version-based selection via Spark configuration + boolean isSSTableVersionBasedBridgeDisabled = isSSTableVersionBasedBridgeDisabled(); + + // Get SSTable versions from cluster only if SSTable version-based selection is enabled + // If disabled, skip retrieval to allow job to proceed even when SSTable version detection fails + if (isSSTableVersionBasedBridgeDisabled) + { + this.sstableVersionsOnCluster = null; + } + else + { + this.sstableVersionsOnCluster = retrieveSSTableVersionsFromCluster(); + } + + // Determine bridge version + CassandraVersion bridgeVersion = SSTableVersionAnalyzer.determineBridgeVersionForRead( + sstableVersionsOnCluster, + cassandraVersion, + isSSTableVersionBasedBridgeDisabled); + + // Validate SSTable versions + validateSStableVersions(this.sstableVersionsOnCluster, bridgeVersion, isSSTableVersionBasedBridgeDisabled); + + // Validate that Kryo registrator exists for this bridge version + KryoRegister.validateKryoRegistratorExists(bridgeVersion, cassandraVersion); + + return bridgeVersion; + } + + /** + * Retrieves SSTable versions from cluster via Sidecar gossip. + * Protected to allow test overrides. 
+ * + * @return set of SSTable versions from cluster + */ + @VisibleForTesting + protected Set retrieveSSTableVersionsFromCluster() + { + return Sidecar.getSSTableVersionsFromCluster( + sidecar, + clusterConfig, + sidecarClientConfig.maxMillisToSleep(), + sidecarClientConfig.maxRetries() + ); + } + protected void shutdownHook(ClientConfig options) { ClientConfig.ClearSnapshotStrategy clearSnapshotStrategy = options.clearSnapshotStrategy(); @@ -625,7 +702,7 @@ private List collectSSTableList(SidecarInstance sidecarInstance, } // Map to SSTable - return result.values().stream() + List sstables = result.values().stream() .map(components -> new SidecarProvisionedSSTable(sidecar, sidecarClientConfig, sidecarInstance, @@ -636,6 +713,11 @@ private List collectSSTableList(SidecarInstance sidecarInstance, partitionId, stats())) .collect(Collectors.toList()); + + // Validate SSTable versions against expected versions from gossip + validateSStableVersions(sstables); + + return sstables; } @Override @@ -773,6 +855,7 @@ private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundE this.rfMap = (Map) in.readObject(); this.timeProvider = new ReaderTimeProvider(in.readLong()); this.sstableTimeRangeFilter = (SSTableTimeRangeFilter) in.readObject(); + this.sstableVersionsOnCluster = (Set) in.readObject(); this.maybeQuoteKeyspaceAndTable(); this.initSidecarClient(); this.initInstanceMap(); @@ -819,6 +902,7 @@ private void writeObject(ObjectOutputStream out) throws IOException, ClassNotFou out.writeObject(this.rfMap); out.writeLong(timeProvider.referenceEpochInSeconds()); out.writeObject(this.sstableTimeRangeFilter); + out.writeObject(this.sstableVersionsOnCluster); } private static void writeNullable(ObjectOutputStream out, @Nullable String string) throws IOException @@ -844,6 +928,130 @@ private static String readNullable(ObjectInputStream in) throws IOException return null; } + /** + * Validates that all SSTable versions are supported for the given Cassandra 
version. + * This validation runs on the Spark driver. + * + * @param sstableVersionsOnCluster set of SSTable versions across all nodes in the cluster + * @param cassandraVersion the bridge Cassandra version whose readable SSTable versions are checked + * @param isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination + * @throws UnsupportedOperationException if any unsupported SSTable version is detected + */ + @VisibleForTesting + void validateSStableVersions(Set sstableVersionsOnCluster, + CassandraVersion cassandraVersion, + boolean isSSTableVersionBasedBridgeDisabled) + { + // Skip validation when fallback mode is enabled + if (isSSTableVersionBasedBridgeDisabled) + { + LOGGER.debug("Skipping SSTable version validation on driver - fallback mode enabled via {}=true", + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); + return; + } + + if (sstableVersionsOnCluster == null || sstableVersionsOnCluster.isEmpty()) + { + // Fail fast with helpful error message + throw new IllegalStateException(String.format( + "Unable to retrieve SSTable versions from cluster. " + + "This is required for SSTable version-based bridge selection. " + + "If you want to bypass this check and use cassandra.version for bridge selection, " + + "set %s=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + } + + // Get SSTable versions that can be read by this Cassandra version + Set supportedVersions = cassandraVersion.getSupportedSStableVersionsForRead(); + + // Find any unsupported versions + Set unsupportedVersions = sstableVersionsOnCluster.stream() + .filter(version -> !supportedVersions.contains(version)) + .collect(Collectors.toSet()); + + if (!unsupportedVersions.isEmpty()) + { + String errorMessage = String.format( + "Detected unsupported SSTable version(s) %s for Cassandra version %s. " + + "Supported versions: %s. " + + "Observed SSTable versions in the cluster: %s.
" + + "To retry the job using a fallback Cassandra version, " + + "set %s=true", + unsupportedVersions, + cassandraVersion, + supportedVersions, + sstableVersionsOnCluster, + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); + LOGGER.error(errorMessage); + throw new UnsupportedOperationException(errorMessage); + } + + LOGGER.debug("SSTable version validation successful. All observed versions {} are supported for Cassandra version {}", + sstableVersionsOnCluster, cassandraVersion); + } + + /** + * Validates that all SSTables being read have versions that were observed in gossip info. + * This catches cases where SSTables have unexpected versions that weren't seen during driver initialization. + * This validation runs on executors. + * + * @param sstables list of SSTables to validate + * @throws UnsupportedOperationException if any SSTable has a version not in expected sstable versions + */ + @VisibleForTesting + void validateSStableVersions(List sstables) + { + // Check if user has explicitly disabled SSTable version-based selection + boolean isSSTableVersionBasedBridgeDisabled = isSSTableVersionBasedBridgeDisabled(); + + // Skip validation when fallback mode is enabled + if (isSSTableVersionBasedBridgeDisabled) + { + LOGGER.debug("Skipping SSTable version validation on executor - " + + "fallback mode enabled via {}=true", + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); + return; + } + + Set expectedVersions = this.sstableVersionsOnCluster; + if (expectedVersions == null || expectedVersions.isEmpty()) + { + // Fail fast with helpful error message + throw new IllegalStateException( + "Unable to validate SSTable versions - no expected versions available from cluster. " + + "This is required for SSTable version-based bridge selection.
" + + "If you want to bypass this check and use cassandra.version for bridge selection, " + + "set " + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE + "=true"); + } + + for (SSTable ssTable : sstables) + { + String ssTableFileName = ssTable.getDataFileName(); + // Extract full version string (e.g., "big-nb" from the filename) + String ssTableVersion = ssTable.getFormat() + "-" + ssTable.getVersion(); + + if (!expectedVersions.contains(ssTableVersion)) + { + String errorMessage = String.format( + "SSTable '%s' has version '%s' which was not observed in cluster gossip info. " + + "Expected versions from gossip: %s. " + + "To retry the job using a fallback Cassandra version, " + + "set %s=true", + ssTableFileName, + ssTableVersion, + expectedVersions, + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); + LOGGER.error(errorMessage); + throw new UnsupportedOperationException(errorMessage); + } + } + + if (!sstables.isEmpty()) + { + LOGGER.debug("Validated {} SSTable(s) against expected versions from gossip: {}", + sstables.size(), expectedVersions); + } + } + // Kryo Serialization public static class Serializer extends com.esotericsoftware.kryo.Serializer @@ -895,6 +1103,7 @@ public void write(Kryo kryo, Output out, CassandraDataLayer dataLayer) kryo.writeObject(out, dataLayer.rfMap); out.writeLong(dataLayer.timeProvider.referenceEpochInSeconds()); kryo.writeObject(out, dataLayer.sstableTimeRangeFilter); + kryo.writeObjectOrNull(out, dataLayer.sstableVersionsOnCluster, HashSet.class); } @SuppressWarnings("unchecked") @@ -937,7 +1146,8 @@ public CassandraDataLayer read(Kryo kryo, Input in, Class ty kryo.readObject(in, SchemaFeaturesListWrapper.class).toList(), kryo.readObject(in, HashMap.class), new ReaderTimeProvider(in.readLong()), - kryo.readObject(in, SSTableTimeRangeFilter.class)); + kryo.readObject(in, SSTableTimeRangeFilter.class), + kryo.readObjectOrNull(in, HashSet.class)); } // Wrapper only used internally for Kryo serialization/deserialization diff --git
a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java index 0a30cde03..d48001c94 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java @@ -187,12 +187,12 @@ public static LocalDataLayer from(Map options) getOrThrow(options, lowerCaseKey("dirs")).split(",")); } - public LocalDataLayer(@NotNull CassandraVersion version, + public LocalDataLayer(@NotNull CassandraVersion bridgeVersion, @NotNull String keyspace, @NotNull String createStatement, String... paths) { - this(version, + this(bridgeVersion, Partitioner.Murmur3Partitioner, keyspace, createStatement, @@ -204,13 +204,13 @@ public LocalDataLayer(@NotNull CassandraVersion version, paths); } - public LocalDataLayer(@NotNull CassandraVersion version, + public LocalDataLayer(@NotNull CassandraVersion bridgeVersion, @NotNull String keyspace, @NotNull String createStatement, @NotNull Set udtStatements, String... paths) { - this(version, + this(bridgeVersion, Partitioner.Murmur3Partitioner, keyspace, createStatement, @@ -223,7 +223,7 @@ public LocalDataLayer(@NotNull CassandraVersion version, } // CHECKSTYLE IGNORE: Constructor with many parameters - public LocalDataLayer(@NotNull CassandraVersion version, + public LocalDataLayer(@NotNull CassandraVersion bridgeVersion, @NotNull Partitioner partitioner, @NotNull String keyspace, @NotNull String createStatement, @@ -234,7 +234,7 @@ public LocalDataLayer(@NotNull CassandraVersion version, @NotNull SSTableTimeRangeFilter sstableTimeRangeFilter, String... 
paths) { - this.bridge = CassandraBridgeFactory.get(version); + this.bridge = CassandraBridgeFactory.get(bridgeVersion); this.partitioner = partitioner; this.cqlTable = bridge().buildSchema(createStatement, keyspace, @@ -259,7 +259,7 @@ public LocalDataLayer(@NotNull CassandraVersion version, } // For serialization - private LocalDataLayer(@NotNull CassandraVersion version, + private LocalDataLayer(@NotNull CassandraVersion bridgeVersion, @NotNull Partitioner partitioner, @NotNull CqlTable cqlTable, @NotNull String jobId, @@ -269,7 +269,7 @@ private LocalDataLayer(@NotNull CassandraVersion version, @NotNull SSTableTimeRangeFilter sstableTimeRangeFilter, String... paths) { - this.bridge = CassandraBridgeFactory.get(version); + this.bridge = CassandraBridgeFactory.get(bridgeVersion); this.partitioner = partitioner; this.cqlTable = cqlTable; this.jobId = jobId; diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java new file mode 100644 index 000000000..a4e55bf3e --- /dev/null +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.spark; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.bridge.CassandraVersion; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** + * Unit tests for KryoRegister + */ +public class KryoRegisterTest +{ + @Test + void testValidateKryoRegistratorExistsForFourZero() + { + assertThatNoException() + .describedAs("FOURZERO should have a Kryo registrator") + .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FOURZERO, "4.0.0")); + } + + @Test + void testValidateKryoRegistratorExistsForFourOne() + { + assertThatNoException() + .describedAs("FOURONE should have a Kryo registrator") + .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FOURONE, "4.1.0")); + } + + @Test + void testValidateKryoRegistratorExistsForFiveZero() + { + assertThatNoException() + .describedAs("FIVEZERO should have a Kryo registrator") + .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FIVEZERO, "5.0.0")); + } + + @Test + void testValidateKryoRegistratorMissingForThreeZero() + { + assertThatThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.THREEZERO, "3.0.0")) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("No Kryo registrator configured for bridge version THREEZERO") + .hasMessageContaining("Cluster Cassandra version: 3.0.0") + .hasMessageContaining("Available Kryo registrators:") + .hasMessageContaining("FOURZERO") + .hasMessageContaining("FOURONE") + .hasMessageContaining("FIVEZERO") + // should mention config param to update + .hasMessageContaining("spark.cassandra_analytics.cassandra.version"); + } + + @Test + void testKryoRegistratorClassesAreCorrect() + { 
+ assertThat(KryoRegister.KRYO_REGISTRATORS.get(CassandraVersion.FOURZERO)) + .isEqualTo(KryoRegister.V40.class); + + assertThat(KryoRegister.KRYO_REGISTRATORS.get(CassandraVersion.FOURONE)) + .isEqualTo(KryoRegister.V41.class); + + assertThat(KryoRegister.KRYO_REGISTRATORS.get(CassandraVersion.FIVEZERO)) + .isEqualTo(KryoRegister.V50.class); + } + + @Test + void testValidateWithNullCassandraVersionString() + { + // Should not throw - clusterCassandraVersion is optional for error message context + assertThatNoException() + .describedAs("Validation should work with null cassandraVersion string") + .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FOURZERO, null)); + } +} diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java new file mode 100644 index 000000000..70b416149 --- /dev/null +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.spark.bulkwriter; + +import java.util.Collections; +import java.util.Set; +import java.util.UUID; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; +import org.apache.spark.sql.types.StructType; +import org.jetbrains.annotations.NotNull; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +class AbstractBulkWriterContextTest +{ + @Test + void testKryoRegistrationWarningMessage() + { + // Test that the KRYO_REGISTRATION_WARNING constant exists and has expected content + assertThat(AbstractBulkWriterContext.KRYO_REGISTRATION_WARNING) + .isNotNull() + .contains("Spark Bulk Writer Kryo Registrator") + .contains("SbwKryoRegistrator") + .contains("was not registered with Spark"); + } + + @Test + void testSSTableVersionBasedBridgeDisabled() + { + BulkSparkConf conf = mock(BulkSparkConf.class); + when(conf.isSSTableVersionBasedBridgeDisabled()).thenReturn(true); + + StructType schema = mock(StructType.class); + TestBulkWriterContext context = TestBulkWriterContext.create(conf, schema, 1, "4.0.0", null); + + // Verify bridge version was still determined + assertThat(context.bridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + + // Verify that SSTable versions retrieval was not called (disabled) + assertThat(TestBulkWriterContext.sstableVersionRetrievalCount).isEqualTo(0); + assertThat(TestBulkWriterContext.versionRetrievalCount).isEqualTo(1); + } + + @Test + void testSSTableVersionBasedBridgeEnabled() + { + BulkSparkConf conf = mock(BulkSparkConf.class); + when(conf.isSSTableVersionBasedBridgeDisabled()).thenReturn(false); + + Set sstableVersions = Collections.singleton("big-oa"); + StructType schema = mock(StructType.class); + TestBulkWriterContext context = TestBulkWriterContext.create(conf, schema, 1, 
"5.0.0", sstableVersions); + + // Verify bridge version was determined + assertThat(context.bridgeVersion()).isEqualTo(CassandraVersion.FIVEZERO); + + // Verify that both version and SSTable versions retrieval were called + assertThat(TestBulkWriterContext.versionRetrievalCount).isEqualTo(1); + assertThat(TestBulkWriterContext.sstableVersionRetrievalCount).isEqualTo(1); + } + + /** + * Concrete test implementation of AbstractBulkWriterContext for testing + */ + static class TestBulkWriterContext extends AbstractBulkWriterContext + { + private static String staticLowestVersion; + private static Set staticSSTableVersions; + static int versionRetrievalCount = 0; + static int sstableVersionRetrievalCount = 0; + + private TestBulkWriterContext(@NotNull BulkSparkConf conf, + @NotNull StructType structType, + int sparkDefaultParallelism) + { + super(conf, structType, sparkDefaultParallelism); + } + + static TestBulkWriterContext create(@NotNull BulkSparkConf conf, + @NotNull StructType structType, + int sparkDefaultParallelism, + @NotNull String lowestVersion, + Set sstableVersions) + { + staticLowestVersion = lowestVersion; + staticSSTableVersions = sstableVersions; + versionRetrievalCount = 0; + sstableVersionRetrievalCount = 0; + return new TestBulkWriterContext(conf, structType, sparkDefaultParallelism); + } + + @Override + protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) + { + versionRetrievalCount++; + return staticLowestVersion; + } + + @Override + protected Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) + { + sstableVersionRetrievalCount++; + return staticSSTableVersions; + } + + @Override + protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) + { + return mock(ClusterInfo.class); + } + + @Override + protected JobInfo buildJobInfo() + { + // Return a mock JobInfo to avoid complex dependencies + return mock(JobInfo.class); + } + + @Override + protected SchemaInfo buildSchemaInfo(StructType structType) + { + // 
Return a mock SchemaInfo to avoid complex dependencies + return mock(SchemaInfo.class); + } + + @Override + protected TransportContext buildTransportContext(boolean isOnDriver) + { + // Return a mock TransportContext to avoid complex dependencies + return mock(TransportContext.class); + } + + @Override + protected void validateKeyspaceReplication() + { + // No-op for testing + } + + @Override + protected MultiClusterContainer generateRestoreJobIds() + { + return null; + } + + @Override + public BulkWriterConfig toBulkWriterConfigForBroadcasting(org.apache.spark.api.java.JavaSparkContext sparkContext) + { + throw new UnsupportedOperationException("Not needed for test"); + } + } +} diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java index 944f0a186..048718501 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java @@ -33,6 +33,7 @@ import org.apache.cassandra.spark.bulkwriter.util.SbwKryoRegistrator; import org.apache.cassandra.spark.utils.BuildInfo; import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; import org.jetbrains.annotations.NotNull; import static org.apache.cassandra.spark.bulkwriter.BulkSparkConf.DEFAULT_SIDECAR_PORT; @@ -394,6 +395,103 @@ void testStorageCredentialTypeInvalidValue() .hasMessageContaining("Invalid value 'INVALID' for option 'STORAGE_CREDENTIAL_TYPE'"); } + @Test + void testDefaultDisableSSTableVersionBasedBridge() + { + Map options = copyDefaultOptions(); + BulkSparkConf conf = new BulkSparkConf(sparkConf, options); + assertThat(conf.isSSTableVersionBasedBridgeDisabled()) + .describedAs("By default, SSTable version-based bridge should be enabled (false)") + .isFalse(); + } + + @Test + void 
testSSTableVersionBasedBridgeDisabledViaSparkConf() + { + SparkConf conf = new SparkConf() + .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "true"); + BulkSparkConf bulkConf = new BulkSparkConf(conf, defaultOptions); + assertThat(bulkConf.isSSTableVersionBasedBridgeDisabled()) + .describedAs("SSTable version-based bridge should be disabled when configured") + .isTrue(); + } + + @Test + void testSSTableVersionBasedBridgeEnabledExplicitly() + { + SparkConf conf = new SparkConf() + .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "false"); + BulkSparkConf bulkConf = new BulkSparkConf(conf, defaultOptions); + assertThat(bulkConf.isSSTableVersionBasedBridgeDisabled()) + .describedAs("SSTable version-based bridge should be enabled") + .isFalse(); + } + + @Test + void testGetDisableSSTableVersionBasedBridgeDefaultValue() + { + // Create a SparkContext without setting the DISABLE_SSTABLE_VERSION_BASED_BRIDGE flag + SparkConf conf = new SparkConf() + .setMaster("local[1]") + .setAppName("test-static-get-default"); + SparkContext sc = new SparkContext(conf); + try + { + boolean result = BulkSparkConf.getDisableSSTableVersionBasedBridge(); + assertThat(result) + .describedAs("getDisableSSTableVersionBasedBridge should return false (enabled) when flag is not set") + .isFalse(); + } + finally + { + sc.stop(); + } + } + + @Test + void testGetDisableSSTableVersionBasedBridgeWhenDisabled() + { + // Create a SparkContext with DISABLE_SSTABLE_VERSION_BASED_BRIDGE set to true + SparkConf conf = new SparkConf() + .setMaster("local[1]") + .setAppName("test-static-get-disabled") + .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "true"); + SparkContext sc = new SparkContext(conf); + try + { + boolean result = BulkSparkConf.getDisableSSTableVersionBasedBridge(); + assertThat(result) + .describedAs("getDisableSSTableVersionBasedBridge should return true (disabled) when flag is set to true") + .isTrue(); + } + finally + { + sc.stop(); + } + } + + @Test + void 
testGetDisableSSTableVersionBasedBridgeWhenEnabled() + { + // Create a SparkContext with DISABLE_SSTABLE_VERSION_BASED_BRIDGE explicitly set to false + SparkConf conf = new SparkConf() + .setMaster("local[1]") + .setAppName("test-static-get-enabled") + .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "false"); + SparkContext sc = new SparkContext(conf); + try + { + boolean result = BulkSparkConf.getDisableSSTableVersionBasedBridge(); + assertThat(result) + .describedAs("getDisableSSTableVersionBasedBridge should return false (enabled) when flag is explicitly set to false") + .isFalse(); + } + finally + { + sc.stop(); + } + } + private Map copyDefaultOptions() { TreeMap map = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java index 380c44600..25d867fa1 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java @@ -24,11 +24,13 @@ import org.junit.jupiter.api.Test; import org.apache.cassandra.bridge.CassandraBridge; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.cassandra.spark.common.stats.JobStatsPublisher; import org.jetbrains.annotations.NotNull; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -52,7 +54,7 @@ void testToBulkWriterContextCanBeOverridden() BroadcastableSchemaInfo mockSchemaInfo = mock(BroadcastableSchemaInfo.class); // A custom BulkWriterConfig subclass 
overriding toBulkWriterContext() - BulkWriterConfig customConfig = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockClusterInfo, mockSchemaInfo, "4.0.0") + BulkWriterConfig customConfig = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockClusterInfo, mockSchemaInfo, CassandraVersion.FOURZERO) { @Override public BulkWriterContext toBulkWriterContext() @@ -73,13 +75,13 @@ void testCustomIBroadcastableClusterInfoReconstructIsCalled() { ClusterInfo expectedCluster = mock(ClusterInfo.class); IBroadcastableClusterInfo mockBroadcastable = mock(IBroadcastableClusterInfo.class); - when(mockBroadcastable.reconstruct()).thenReturn(expectedCluster); + when(mockBroadcastable.reconstruct(any(CassandraVersion.class))).thenReturn(expectedCluster); BulkSparkConf mockConf = mock(BulkSparkConf.class); BroadcastableJobInfo mockJobInfo = mock(BroadcastableJobInfo.class); BroadcastableSchemaInfo mockSchemaInfo = mock(BroadcastableSchemaInfo.class); - BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockBroadcastable, mockSchemaInfo, "4.0.0"); + BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockBroadcastable, mockSchemaInfo, CassandraVersion.FOURZERO); TestBulkWriterContext context = new TestBulkWriterContext(config); @@ -93,7 +95,7 @@ void testReconstructJobInfoOnExecutorCanBeOverridden() BulkSparkConf mockConf = mock(BulkSparkConf.class); BroadcastableJobInfo mockJobInfo = mock(BroadcastableJobInfo.class); BroadcastableSchemaInfo mockSchemaInfo = mock(BroadcastableSchemaInfo.class); - BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mock(IBroadcastableClusterInfo.class), mockSchemaInfo, "4.0.0"); + BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mock(IBroadcastableClusterInfo.class), mockSchemaInfo, CassandraVersion.FOURZERO); // Subclass that overrides reconstructJobInfoOnExecutor to return custom JobInfo TestBulkWriterContext context = new TestBulkWriterContext(config) @@ -120,7 
+122,19 @@ private static class TestBulkWriterContext extends AbstractBulkWriterContext } @Override - protected ClusterInfo buildClusterInfo() + protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) + { + throw new UnsupportedOperationException("Driver-only"); + } + + @Override + protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) + { + throw new UnsupportedOperationException("Driver-only"); + } + + @Override + protected java.util.Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) { throw new UnsupportedOperationException("Driver-only"); } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java index 5ed4c5221..498a2119b 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java @@ -28,6 +28,7 @@ import org.junit.jupiter.api.Test; import o.a.c.sidecar.client.shaded.common.response.TimeSkewResponse; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; import org.apache.cassandra.spark.exception.TimeSkewTooLargeException; @@ -81,7 +82,7 @@ private static class MockClusterInfoForTimeSkew extends CassandraClusterInfo MockClusterInfoForTimeSkew(int allowanceMinutes, Instant remoteNow) { - super((BulkSparkConf) null); + super((BulkSparkConf) null, CassandraVersion.FIVEZERO); mockCassandraContext(allowanceMinutes, remoteNow); } @@ -107,4 +108,6 @@ private void mockCassandraContext(int allowanceMinutes, Instant remoteNow) when(cassandraContext.sidecarPort()).thenReturn(9043); } } + } + diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java 
b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java index cb5564f0c..1b4a70bf6 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java @@ -43,6 +43,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CoordinatedWriteConf; import org.apache.cassandra.spark.bulkwriter.token.ConsistencyLevel; import org.apache.cassandra.spark.bulkwriter.token.ReplicaAwareFailureHandler; @@ -377,12 +378,26 @@ public void checkBulkWriterIsEnabledOrThrow() BulkFeatures.BULK_WRITER)); } - @Override public String getLowestCassandraVersion() { return cassandraVersion; } + public Set getSSTableVersionsOnCluster() + { + // Return SSTable versions based on Cassandra version for testing + CassandraVersion version = CassandraVersion.fromVersion(cassandraVersion) + .orElse(CassandraVersion.FIVEZERO); + return new HashSet<>(version.getNativeSStableVersions()); + } + + @Override + public CassandraVersion bridgeVersion() + { + return CassandraVersion.fromVersion(cassandraVersion) + .orElse(CassandraVersion.FIVEZERO); + } + private List buildCompleteBatchIds(List uuids) { return uuids.stream().map(uuid -> uuid + "-" + jobId).collect(Collectors.toList()); diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java index 2bd5b5634..fba73361e 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java +++ 
b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriterTest.java @@ -131,7 +131,7 @@ public void canCreateWriterForVersion(String version) throws IOException break; case 50: // Format is "oa--big" or "da--bti" - if ("big".equals(CassandraVersion.sstableFormat())) + if ("big".equals(CassandraVersion.configuredSSTableFormat())) { assertThat(baseFileName).matches("oa-\\d+-big"); } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java index 701ca337d..3401ae85d 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java @@ -36,6 +36,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.common.schema.ColumnType; import org.apache.cassandra.spark.data.CqlField; import org.apache.spark.sql.types.DataTypes; @@ -339,6 +340,8 @@ public TableSchema build() dataFrameSchema = dataFrameSchema.add(timestampOption.columnName(), DataTypes.LongType); updatedCqlColumns = addColumnToCqlColumns(updatedCqlColumns, timestampOption.columnName(), SqlToCqlTypeConverter.BIGINT); } + CassandraVersion bridgeVersion = CassandraVersion.fromVersion(cassandraVersion) + .orElseThrow(() -> new IllegalArgumentException("Unsupported Cassandra version: " + cassandraVersion)); MockTableInfoProvider tableInfoProvider = new MockTableInfoProvider(bridge, updatedCqlColumns, @@ -353,7 +356,7 @@ public TableSchema build() writeMode, ttlOption, timestampOption, - cassandraVersion, + bridgeVersion, quoteIdentifiers, skipSecondaryIndexCheck); } diff --git 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java index 4825c8dea..783f05968 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java @@ -34,6 +34,7 @@ import org.junit.jupiter.api.Test; import o.a.c.sidecar.client.shaded.common.response.TokenRangeReplicasResponse; +import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.BroadcastableClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.bulkwriter.CassandraClusterInfo; @@ -90,12 +91,10 @@ void testDelegationOfSingleCluster() () -> Partitioner.Murmur3Partitioner, RingInstance::new); when(clusterInfo.getTokenRangeMapping(anyBoolean())).thenReturn(expectedTokenRangeMapping); - when(clusterInfo.getLowestCassandraVersion()).thenReturn("lowestCassandraVersion"); when(clusterInfo.clusterWriteAvailability()).thenReturn(Collections.emptyMap()); CassandraClusterInfoGroup group = mockClusterGroup(1, index -> clusterInfo); // Since there is a single clusterInfo in the group. 
It behaves as a simple delegation to the sole clusterInfo assertThat(group.clusterWriteAvailability()).isSameAs(clusterInfo.clusterWriteAvailability()); - assertThat(group.getLowestCassandraVersion()).isSameAs(clusterInfo.getLowestCassandraVersion()); assertThat(group.getTokenRangeMapping(true)).isSameAs(clusterInfo.getTokenRangeMapping(true)); } @@ -138,30 +137,6 @@ void testAggregateWriteAvailability() .containsValues(WriteAvailability.AVAILABLE, WriteAvailability.UNAVAILABLE_DOWN); } - @Test - void testAggregateLowestCassandraVersion() - { - CassandraClusterInfoGroup goodGroup = mockClusterGroup(2, index -> { - CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); - when(clusterInfo.getLowestCassandraVersion()).thenReturn("4.0." + index); - return clusterInfo; - }); - assertThat(goodGroup.getLowestCassandraVersion()).isEqualTo("4.0.0"); - } - - @Test - void testAggregateLowestCassandraVersionFailDueToDifference() - { - CassandraClusterInfoGroup badGroup = mockClusterGroup(2, index -> { - CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); - when(clusterInfo.getLowestCassandraVersion()).thenReturn((4 + index) + ".0.0"); - return clusterInfo; - }); - assertThatThrownBy(badGroup::getLowestCassandraVersion) - .isExactlyInstanceOf(IllegalStateException.class) - .hasMessage("Cluster versions are not compatible. 
lowest=4.0.0 and highest=5.0.0"); - } - @Test void testCheckBulkWriterIsEnabledOrThrow() { @@ -240,7 +215,7 @@ void testTimeSkewTooLarge() void testCreateClusterInfoListFailsDueToAbsentConfiguration() { BulkSparkConf conf = mock(BulkSparkConf.class); - assertThatThrownBy(() -> fromBulkSparkConf(conf)) + assertThatThrownBy(() -> fromBulkSparkConf(conf, (CassandraVersion) null)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("In order to create an instance of CassandraCoordinatedBulkWriterContext, " + "you must provide the appropriate coordinated write configuration by " + @@ -252,7 +227,7 @@ void testCreateClusterInfoListFailsDueToEmptyConfiguration() { BulkSparkConf conf = mock(BulkSparkConf.class, RETURNS_DEEP_STUBS); when(conf.coordinatedWriteConf().clusters()).thenReturn(Collections.emptyMap()); - assertThatThrownBy(() -> fromBulkSparkConf(conf)) + assertThatThrownBy(() -> fromBulkSparkConf(conf, (CassandraVersion) null)) .isInstanceOf(IllegalStateException.class) .hasMessageContaining("No cluster info is built from"); } @@ -263,7 +238,7 @@ void testCreateClusterInfoListFailsDueToEmptyClusterId() BulkSparkConf conf = mock(BulkSparkConf.class); CoordinatedWriteConf.ClusterConf clusterConf = new CoordinatedWriteConf.SimpleClusterConf(Collections.singletonList("localhost:9043"), "localDc"); when(conf.coordinatedWriteConf()).thenReturn(new CoordinatedWriteConf(Collections.singletonMap("", clusterConf))); - assertThatThrownBy(() -> fromBulkSparkConf(conf)) + assertThatThrownBy(() -> fromBulkSparkConf(conf, (CassandraVersion) null)) .isInstanceOf(IllegalStateException.class) .describedAs("The exception message should include the original json to help spot the wrong configuration (empty clusterId)") .hasMessage("Found coordinatedWriteConf with empty or null clusterId. 
" + @@ -277,7 +252,6 @@ void testSerDeser() CassandraClusterInfoGroup originalGroup = mockClusterGroup(2, index -> { CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); when(clusterInfo.getPartitioner()).thenReturn(Partitioner.Murmur3Partitioner); - when(clusterInfo.getLowestCassandraVersion()).thenReturn("4.0.0"); return clusterInfo; }); @@ -300,10 +274,6 @@ void testSerDeser() .describedAs("Partitioner should be preserved after serialization") .isEqualTo(Partitioner.Murmur3Partitioner); - assertThat(deserializedBroadcastable.getLowestCassandraVersion()) - .describedAs("Lowest Cassandra version should be preserved after serialization") - .isEqualTo("4.0.0"); - assertThat(deserializedBroadcastable.size()) .describedAs("Number of clusters should be preserved after serialization") .isEqualTo(2); diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java new file mode 100644 index 000000000..68bf29ad6 --- /dev/null +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java @@ -0,0 +1,472 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.spark.data; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.clients.Sidecar; +import org.apache.cassandra.spark.data.partitioner.ConsistencyLevel; +import org.apache.cassandra.spark.data.partitioner.TokenPartitioner; +import org.apache.cassandra.spark.utils.TimeProvider; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * Unit tests for CassandraDataLayer validation methods + */ +public class CassandraDataLayerValidationTest +{ + @Test + void testValidateSStableVersionsWithAllSupportedVersions() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + Set sstableVersions = new HashSet<>(Arrays.asList("big-na", "big-nb")); + + assertThatNoException() + .describedAs("All versions are supported by FOURZERO") + .isThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FOURZERO, false)); + } + + @Test + void testValidateSStableVersionsWithUnsupportedVersion() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + // C* 4.0 cannot read C* 5.0 SSTable versions + Set sstableVersions = new HashSet<>(Arrays.asList("big-na", "big-oa")); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FOURZERO, false)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("Detected unsupported SSTable version(s)") + .hasMessageContaining("big-oa") + .hasMessageContaining("FOURZERO") + 
.hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); + } + + @Test + void testValidateSStableVersionsWithNullVersionsThrowsException() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(null, CassandraVersion.FOURZERO, false)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); + } + + @Test + void testValidateSStableVersionsWithEmptyVersionsThrowsException() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(Collections.emptySet(), CassandraVersion.FOURZERO, false)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); + } + + @Test + void testValidateSStableVersionsSkipsValidationWhenFallbackEnabled() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + + // Test with invalid versions - should not throw when fallback enabled + Set invalidVersions = new HashSet<>(List.of("invalid-version")); + assertThatNoException() + .describedAs("Validation should be skipped with invalid versions when fallback mode is enabled") + .isThrownBy(() -> dataLayer.validateSStableVersions(invalidVersions, CassandraVersion.FOURZERO, true)); + + // Test with null versions - should not throw when fallback enabled + assertThatNoException() + .describedAs("Validation should be skipped with null versions when fallback enabled") + .isThrownBy(() -> dataLayer.validateSStableVersions(null, CassandraVersion.FOURZERO, true)); + } + + @Test + void testValidateSStableVersionsForFiveZeroWithBackwardCompatibility() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + // C* 5.0 should be able to read C* 4.0 SSTable versions + Set sstableVersions = new HashSet<>(Arrays.asList("big-na", "big-nb", "big-oa")); + + assertThatNoException() + .describedAs("FIVEZERO 
should support reading FOURZERO versions") + .isThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FIVEZERO, false)); + } + + @Test + void testValidateSStableVersionsForFiveZeroWithBtiFormat() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + Set sstableVersions = new HashSet<>(Arrays.asList("big-oa", "bti-da")); + + assertThatNoException() + .describedAs("FIVEZERO should support both big and bti formats") + .isThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FIVEZERO, false)); + } + + @Test + void testValidateSStableVersionsErrorMessageIncludesAllDetails() + { + CassandraDataLayer dataLayer = createTestDataLayer(); + Set sstableVersions = new HashSet<>(List.of("big-oa")); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FOURZERO, false)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("Detected unsupported SSTable version(s)") + .hasMessageContaining("Supported versions:") + .hasMessageContaining("Observed SSTable versions in the cluster:") + .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); + } + + @Test + void testValidateSStableVersionsListWithValidVersions() + { + Set expectedVersions = new HashSet<>(Arrays.asList("big-na", "big-nb")); + CassandraDataLayer dataLayer = createTestDataLayerWithVersions(expectedVersions); + + SSTable ssTable1 = createMockSSTable("big", "na", "test1-big-na-Data.db"); + SSTable ssTable2 = createMockSSTable("big", "nb", "test2-big-nb-Data.db"); + List sstables = Arrays.asList(ssTable1, ssTable2); + + assertThatNoException() + .describedAs("All SSTables have expected versions") + .isThrownBy(() -> dataLayer.validateSStableVersions(sstables)); + } + + @Test + void testValidateSStableVersionsListWithUnexpectedVersion() + { + Set expectedVersions = new HashSet<>(List.of("big-na")); + CassandraDataLayer dataLayer = 
createTestDataLayerWithVersions(expectedVersions); + + SSTable ssTable1 = createMockSSTable("big", "na", "test1-big-na-Data.db"); + SSTable ssTable2 = createMockSSTable("big", "nb", "test2-big-nb-Data.db"); + List sstables = Arrays.asList(ssTable1, ssTable2); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("has version 'big-nb' which was not observed in cluster gossip info") + .hasMessageContaining("test2-big-nb-Data.db") + .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); + } + + @Test + void testValidateSStableVersionsListWithEmptyList() + { + Set expectedVersions = new HashSet<>(List.of("big-na")); + CassandraDataLayer dataLayer = createTestDataLayerWithVersions(expectedVersions); + + assertThatNoException() + .describedAs("Empty SSTable list should not throw exception") + .isThrownBy(() -> dataLayer.validateSStableVersions(Collections.emptyList())); + } + + @Test + void testValidateSStableVersionsListErrorMessageIncludesFileName() + { + Set expectedVersions = new HashSet<>(List.of("big-na")); + CassandraDataLayer dataLayer = createTestDataLayerWithVersions(expectedVersions); + + SSTable ssTable = createMockSSTable("big", "oa", "keyspace-table-big-oa-Data.db"); + List sstables = Collections.singletonList(ssTable); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("keyspace-table-big-oa-Data.db") + .hasMessageContaining("Expected versions from gossip:") + .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); + } + + @Test + void testValidateSStableVersionsListWithMixedBigAndBtiFormats() + { + Set expectedVersions = new HashSet<>(Arrays.asList("big-oa", "bti-da")); + CassandraDataLayer dataLayer = createTestDataLayerWithVersions(expectedVersions); + + SSTable ssTable1 = 
createMockSSTable("big", "oa", "test1-big-oa-Data.db"); + SSTable ssTable2 = createMockSSTable("bti", "da", "test2-bti-da-Data.db"); + List sstables = Arrays.asList(ssTable1, ssTable2); + + assertThatNoException() + .describedAs("Mixed big and bti format SSTables should be validated successfully") + .isThrownBy(() -> dataLayer.validateSStableVersions(sstables)); + } + + @Test + void testValidateSStableVersionsListWithUnexpectedBtiVersion() + { + Set expectedVersions = new HashSet<>(List.of("big-oa")); + CassandraDataLayer dataLayer = createTestDataLayerWithVersions(expectedVersions); + + SSTable ssTable1 = createMockSSTable("big", "oa", "test1-big-oa-Data.db"); + SSTable ssTable2 = createMockSSTable("bti", "da", "keyspace-table-bti-da-Data.db"); + List sstables = Arrays.asList(ssTable1, ssTable2); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("has version 'bti-da' which was not observed in cluster gossip info") + .hasMessageContaining("keyspace-table-bti-da-Data.db") + .hasMessageContaining("Expected versions from gossip:") + .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); + } + + @Test + void testValidateSStableVersionsListSkipsValidationWhenFeatureDisabled() + { + Set expectedVersions = new HashSet<>(List.of("big-na")); + // Create a special test instance that returns true for isSSTableVersionBasedBridgeDisabled + CassandraDataLayer dataLayer = new TestCassandraDataLayerWithFeatureDisabled(expectedVersions); + + // Create SSTable with unexpected version + SSTable ssTable = createMockSSTable("big", "oa", "test-big-oa-Data.db"); + List sstables = Collections.singletonList(ssTable); + + // Should NOT throw exception because feature is disabled + assertThatNoException() + .describedAs("Validation should be skipped when feature is disabled") + .isThrownBy(() -> dataLayer.validateSStableVersions(sstables)); + } + + 
@Test + void testValidateSStableVersionsListWithEmptyExpectedVersionsThrowsException() + { + CassandraDataLayer dataLayer = createTestDataLayerWithVersions(Collections.emptySet()); + + SSTable ssTable = createMockSSTable("big", "na", "test-big-na-Data.db"); + List sstables = Collections.singletonList(ssTable); + + assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to validate SSTable versions - no expected versions available from cluster"); + } + + // Tests for initializeSSTableVersionsAndBridgeVersion (called from initBulkReader) + + @Test + void testInitializeSSTableVersionsAndBridgeVersionWithFeatureEnabled() + { + Set mockVersions = new HashSet<>(Arrays.asList("big-na", "big-nb")); + CassandraDataLayer dataLayer = new TestCassandraDataLayerForInitTests(mockVersions, false); + + CassandraVersion result = dataLayer.initializeSSTableVersionsAndBridgeVersion("4.0.0"); + + // Should return FOURZERO bridge version + assertThat((Object) result).isEqualTo(CassandraVersion.FOURZERO); + + // Should set sstableVersionsOnCluster to the retrieved versions + assertThat((Object) dataLayer.sstableVersionsOnCluster) + .isNotNull(); + assertThat(dataLayer.sstableVersionsOnCluster) + .containsExactlyInAnyOrder("big-na", "big-nb"); + } + + @Test + void testInitializeSSTableVersionsAndBridgeVersionWithFeatureDisabled() + { + // Versions won't be retrieved when feature is disabled + CassandraDataLayer dataLayer = new TestCassandraDataLayerForInitTests(null, true); + + CassandraVersion result = dataLayer.initializeSSTableVersionsAndBridgeVersion("4.0.0"); + + // Should return FOURZERO bridge version (fallback to cassandra.version) + assertThat((Object) result).isEqualTo(CassandraVersion.FOURZERO); + + // Should set sstableVersionsOnCluster to null (skipped retrieval) + assertThat((Object) dataLayer.sstableVersionsOnCluster).isNull(); + } + + @Test + void
testInitializeSSTableVersionsAndBridgeVersionSelectsFiveZeroBridge() + { + Set mockVersions = new HashSet<>(Arrays.asList("big-oa", "bti-da")); + CassandraDataLayer dataLayer = new TestCassandraDataLayerForInitTests(mockVersions, false); + + CassandraVersion result = dataLayer.initializeSSTableVersionsAndBridgeVersion("5.0.0"); + + // Should return FIVEZERO bridge version based on highest SSTable version + assertThat((Object) result).isEqualTo(CassandraVersion.FIVEZERO); + + // Should set sstableVersionsOnCluster to the retrieved versions + assertThat((Object) dataLayer.sstableVersionsOnCluster) + .isNotNull(); + assertThat(dataLayer.sstableVersionsOnCluster) + .containsExactlyInAnyOrder("big-oa", "bti-da"); + } + + @Test + void testInitializeSSTableVersionsAndBridgeVersionWithMixedVersions() + { + // Cluster has both C* 4.0 and C* 5.0 versions (during rolling upgrade) + Set mockVersions = new HashSet<>(Arrays.asList("big-na", "big-oa")); + CassandraDataLayer dataLayer = new TestCassandraDataLayerForInitTests(mockVersions, false); + + CassandraVersion result = dataLayer.initializeSSTableVersionsAndBridgeVersion("5.0.0"); + + // Should return FIVEZERO bridge (highest version) + assertThat((Object) result).isEqualTo(CassandraVersion.FIVEZERO); + + // Should set sstableVersionsOnCluster to all retrieved versions + assertThat((Object) dataLayer.sstableVersionsOnCluster) + .isNotNull(); + assertThat(dataLayer.sstableVersionsOnCluster) + .containsExactlyInAnyOrder("big-na", "big-oa"); + } + + private CassandraDataLayer createTestDataLayer() + { + // Use TestCassandraDataLayer to avoid SparkContext initialization in unit tests + return new TestCassandraDataLayer(null); + } + + private CassandraDataLayer createTestDataLayerWithVersions(Set sstableVersions) + { + return new TestCassandraDataLayer(sstableVersions); + } + + private SSTable createMockSSTable(String format, String version, String fileName) + { + SSTable ssTable = mock(SSTable.class); + 
when(ssTable.getFormat()).thenReturn(format); + when(ssTable.getVersion()).thenReturn(version); + when(ssTable.getDataFileName()).thenReturn(fileName); + return ssTable; + } + + /** + * Test subclass to allow setting sstableVersionsOnCluster for testing + * This subclass uses the serialization constructor and overrides problematic methods + */ + private static class TestCassandraDataLayer extends CassandraDataLayer + { + TestCassandraDataLayer(Set sstableVersions) + { + // Use the serialization constructor which doesn't call initialize() + super("test_keyspace", // keyspace + "test_table", // table + false, // quoteIdentifiers + "", // snapshotName + null, // datacenter + Sidecar.ClientConfig.create(), // sidecarClientConfig + null, // sslConfig + mock(CqlTable.class), // cqlTable + mock(TokenPartitioner.class), // tokenPartitioner + CassandraVersion.FOURZERO, // bridgeVersion + ConsistencyLevel.LOCAL_QUORUM, // consistencyLevel + "127.0.0.1", // sidecarInstances + 9043, // sidecarPort + Collections.emptyMap(), // availabilityHints + Collections.emptyMap(), // bigNumberConfigMap + false, // enableStats + false, // readIndexOffset + false, // useIncrementalRepair + null, // lastModifiedTimestampField + Collections.emptyList(), // requestedFeatures + Collections.emptyMap(), // rfMap + mock(TimeProvider.class), // timeProvider + null, // sstableTimeRangeFilter + sstableVersions); // sstableVersionsOnCluster + this.sstableVersionsOnCluster = sstableVersions; + } + + @Override + public void startupValidate() + { + // Skip startup validation in tests to avoid SparkContext initialization + } + + @Override + protected void initSidecarClient() + { + // Skip sidecar client initialization in tests + } + + @Override + protected void initInstanceMap() + { + // Skip instance map initialization in tests + } + + @Override + protected boolean isSSTableVersionBasedBridgeDisabled() + { + // Override to avoid calling BulkSparkConf.getDisableSSTableVersionBasedBridge() + // which would 
try to initialize SparkContext in unit tests + // Always return false to test the validation logic + return false; + } + } + + /** + * Test subclass with feature disabled to test the skip validation path + */ + private static class TestCassandraDataLayerWithFeatureDisabled extends TestCassandraDataLayer + { + TestCassandraDataLayerWithFeatureDisabled(Set sstableVersions) + { + super(sstableVersions); + } + + @Override + protected boolean isSSTableVersionBasedBridgeDisabled() + { + // Return true to test the feature-disabled path + return true; + } + } + + /** + * Test subclass for testing initializeSSTableVersionsAndBridgeVersion method + * Mocks the Sidecar interaction to avoid actual cluster calls + */ + private static class TestCassandraDataLayerForInitTests extends TestCassandraDataLayer + { + private final Set mockSSTableVersions; + private final boolean featureDisabled; + + TestCassandraDataLayerForInitTests(Set mockSSTableVersions, boolean featureDisabled) + { + super(null); + this.mockSSTableVersions = mockSSTableVersions; + this.featureDisabled = featureDisabled; + } + + @Override + protected boolean isSSTableVersionBasedBridgeDisabled() + { + return featureDisabled; + } + + @Override + protected Set retrieveSSTableVersionsFromCluster() + { + // Mock the Sidecar call - return the mock versions instead of calling actual sidecar + return mockSSTableVersions; + } + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java new file mode 100644 index 000000000..6a601a3d2 --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.analytics; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.sidecar.testing.QualifiedName; +import org.apache.cassandra.testing.ClusterBuilderConfiguration; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import static org.apache.cassandra.testing.TestUtils.DC1_RF3; +import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Integration tests for SSTable version-based bridge selection in bulk reader (feature DISABLED) + * This test class has the feature flag disabled via SparkConf override. 
+ */ +class BulkReaderSSTableVersionBridgeDisabledTest extends SharedClusterSparkIntegrationTestBase +{ + static final List DATASET = Arrays.asList("alpha", "beta", "gamma", "delta", "epsilon"); + static final QualifiedName TABLE_DISABLED = new QualifiedName(TEST_KEYSPACE, "test_disabled"); + + @Override + protected ClusterBuilderConfiguration testClusterConfiguration() + { + return super.testClusterConfiguration() + .nodesPerDc(3); + } + + @Override + protected SparkConf getOrCreateSparkConf() + { + SparkConf conf = super.getOrCreateSparkConf(); + // Disable SSTable version-based bridge selection for this entire test class + conf.set("spark.cassandra_analytics.bridge.disable_sstable_version_based", "true"); + return conf; + } + + @Override + protected void initializeSchemaForTest() + { + createTestKeyspace(TEST_KEYSPACE, DC1_RF3); + + // Create table + createTestTable(TABLE_DISABLED, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};"); + + IInstance firstRunningInstance = cluster.getFirstRunningInstance(); + + // Insert data for TABLE_DISABLED + for (int i = 0; i < DATASET.size(); i++) + { + String query = String.format("INSERT INTO %s (id, value) VALUES (%d, '%s');", TABLE_DISABLED, i, DATASET.get(i)); + firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); + } + + // Flush to create SSTables + cluster.stream().forEach(instance -> { + instance.nodetool("flush", TEST_KEYSPACE, TABLE_DISABLED.table()); + }); + } + + @Test + void testBulkReadWithSSTableVersionBasedBridgeDisabled() + { + // Read with feature disabled (fallback to cassandra.version) + Dataset df = bulkReaderDataFrame(TABLE_DISABLED).load(); + + List rows = df.collectAsList() + .stream() + .sorted(Comparator.comparing(row -> row.getInt(0))) + .collect(Collectors.toList()); + + assertThat(rows).hasSize(DATASET.size()); + for (int i = 0; i < DATASET.size(); i++) + { + assertThat(rows.get(i).getInt(0)).isEqualTo(i); + 
assertThat(rows.get(i).getString(1)).isEqualTo(DATASET.get(i)); + } + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java new file mode 100644 index 000000000..295359fa6 --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.analytics; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.sidecar.testing.QualifiedName; +import org.apache.cassandra.testing.ClusterBuilderConfiguration; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import static org.apache.cassandra.testing.TestUtils.DC1_RF3; +import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Integration tests for SSTable version-based bridge selection in bulk reader (feature ENABLED) + * This test class has the feature explicitly enabled via SparkConf override. + */ +class BulkReaderSSTableVersionBridgeEnabledTest extends SharedClusterSparkIntegrationTestBase +{ + static final List DATASET = Arrays.asList("alpha", "beta", "gamma", "delta", "epsilon"); + static final QualifiedName TABLE_MULTIPLE = new QualifiedName(TEST_KEYSPACE, "test_multiple"); + static final QualifiedName TABLE_SNAPSHOT = new QualifiedName(TEST_KEYSPACE, "test_snapshot"); + + @Override + protected ClusterBuilderConfiguration testClusterConfiguration() + { + return super.testClusterConfiguration() + .nodesPerDc(3); + } + + @Override + protected SparkConf getOrCreateSparkConf() + { + SparkConf conf = super.getOrCreateSparkConf(); + // Explicitly enable SSTable version-based bridge selection for this entire test class + conf.set("spark.cassandra_analytics.bridge.disable_sstable_version_based", "false"); + return conf; + } + + @Override + protected void initializeSchemaForTest() + { + createTestKeyspace(TEST_KEYSPACE, DC1_RF3); + + // Create tables + createTestTable(TABLE_MULTIPLE, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH 
compression = {'enabled': false};"); + createTestTable(TABLE_SNAPSHOT, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};"); + + IInstance firstRunningInstance = cluster.getFirstRunningInstance(); + + // Insert data for TABLE_MULTIPLE (3 batches to create multiple SSTables) + for (int batch = 0; batch < 3; batch++) + { + for (int i = batch * 10; i < (batch + 1) * 10; i++) + { + String query = String.format("INSERT INTO %s (id, value) VALUES (%d, 'value_%d');", TABLE_MULTIPLE, i, i); + firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); + } + // Flush after each batch to create separate SSTables + cluster.stream().forEach(instance -> instance.nodetool("flush", TEST_KEYSPACE, TABLE_MULTIPLE.table())); + } + + // Insert data for TABLE_SNAPSHOT + for (int i = 0; i < DATASET.size(); i++) + { + String query = String.format("INSERT INTO %s (id, value) VALUES (%d, '%s');", TABLE_SNAPSHOT, i, DATASET.get(i)); + firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); + } + + // Flush TABLE_SNAPSHOT to create SSTables + cluster.stream().forEach(instance -> { + instance.nodetool("flush", TEST_KEYSPACE, TABLE_SNAPSHOT.table()); + }); + } + + @Test + void testBulkReadWithSSTableVersionBasedBridgeMultipleSSTables() + { + // Read with SSTable version-based bridge enabled + Dataset df = bulkReaderDataFrame(TABLE_MULTIPLE).load(); + + List rows = df.collectAsList(); + assertThat(rows).hasSize(30); // 3 batches * 10 rows each + + // Verify all rows are present + List ids = rows.stream() + .map(row -> row.getInt(0)) + .sorted() + .collect(Collectors.toList()); + for (int i = 0; i < 30; i++) + { + assertThat(ids.get(i)).isEqualTo(i); + } + } + + @Test + void testBulkReadWithSnapshot() + { + String snapshotName = "test_snapshot_" + System.currentTimeMillis(); + + // Create snapshot on all nodes + cluster.stream().forEach(instance -> { + instance.nodetool("snapshot", "-t", snapshotName, TEST_KEYSPACE); + }); + + 
try + { + // Read from snapshot with SSTable version-based bridge + Dataset df = bulkReaderDataFrame(TABLE_SNAPSHOT) + .option("snapshotName", snapshotName) + .load(); + + List rows = df.collectAsList() + .stream() + .sorted(Comparator.comparing(row -> row.getInt(0))) + .collect(Collectors.toList()); + + assertThat(rows).hasSize(DATASET.size()); + for (int i = 0; i < DATASET.size(); i++) + { + assertThat(rows.get(i).getInt(0)).isEqualTo(i); + assertThat(rows.get(i).getString(1)).isEqualTo(DATASET.get(i)); + } + } + finally + { + // Clean up snapshot + cluster.stream().forEach(instance -> { + instance.nodetool("clearsnapshot", "-t", snapshotName, TEST_KEYSPACE); + }); + } + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java new file mode 100644 index 000000000..5b97d60d1 --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.analytics; + +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.sidecar.testing.QualifiedName; +import org.apache.cassandra.testing.ClusterBuilderConfiguration; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import static org.apache.cassandra.testing.TestUtils.DC1_RF3; +import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Integration tests for SSTable version-based bridge selection in bulk writer, with feature DISABLED + */ +class BulkWriterSSTableVersionBridgeDisabledTest extends SharedClusterSparkIntegrationTestBase +{ + static final QualifiedName TABLE_DISABLED = new QualifiedName(TEST_KEYSPACE, "test_disabled"); + + @Override + protected ClusterBuilderConfiguration testClusterConfiguration() + { + return super.testClusterConfiguration() + .nodesPerDc(3); + } + + @Override + protected SparkConf getOrCreateSparkConf() + { + SparkConf conf = super.getOrCreateSparkConf(); + // Disable SSTable version-based bridge selection for this entire test class + conf.set("spark.cassandra_analytics.bridge.disable_sstable_version_based", "true"); + return conf; + } + + @Override + protected void initializeSchemaForTest() + { + createTestKeyspace(TEST_KEYSPACE, DC1_RF3); + cluster.schemaChangeIgnoringStoppedInstances( + "CREATE TABLE " + TABLE_DISABLED + " (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};" + ); + } + + @Test + void 
testBulkWriteWithSSTableVersionBasedBridgeDisabled() + { + SparkSession spark = getOrCreateSparkSession(); + + // Create test data + List data = Arrays.asList( + RowFactory.create(0, "x"), + RowFactory.create(1, "y"), + RowFactory.create(2, "z") + ); + + StructType schema = DataTypes.createStructType(new StructField[]{ + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false) + }); + + Dataset df = spark.createDataFrame(data, schema); + + // Write with feature disabled (fallback to cassandra.version) + bulkWriterDataFrameWriter(df, TABLE_DISABLED).save(); + + // Verify data was written + SimpleQueryResult result = cluster.coordinator(1) + .executeWithResult("SELECT id, value FROM " + TABLE_DISABLED, + ConsistencyLevel.ALL); + Object[][] rows = result.toObjectArrays(); + + assertThat(rows.length).isEqualTo(data.size()); + + // Verify all expected IDs are present + List actualIds = Arrays.stream(rows) + .map(row -> (Integer) row[0]) + .sorted() + .collect(java.util.stream.Collectors.toList()); + List expectedIds = data.stream() + .map(row -> row.getInt(0)) + .sorted() + .collect(java.util.stream.Collectors.toList()); + assertThat(actualIds).isEqualTo(expectedIds); + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java new file mode 100644 index 000000000..f73254d0a --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.analytics; + +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.sidecar.testing.QualifiedName; +import org.apache.cassandra.testing.ClusterBuilderConfiguration; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import static org.apache.cassandra.testing.TestUtils.DC1_RF3; +import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Integration tests for SSTable version-based bridge selection in bulk writer (feature ENABLED) + */ +class BulkWriterSSTableVersionBridgeEnabledTest extends SharedClusterSparkIntegrationTestBase +{ + static final QualifiedName TABLE_ENABLED = new QualifiedName(TEST_KEYSPACE, "test_enabled"); + + @Override + protected ClusterBuilderConfiguration testClusterConfiguration() + { + return super.testClusterConfiguration() + .nodesPerDc(3); + } + + @Override + protected void initializeSchemaForTest() + { + 
createTestKeyspace(TEST_KEYSPACE, DC1_RF3); + cluster.schemaChangeIgnoringStoppedInstances( + "CREATE TABLE " + TABLE_ENABLED + " (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};" + ); + } + + @Test + void testBulkWriteWithSSTableVersionBasedBridgeEnabled() + { + SparkSession spark = getOrCreateSparkSession(); + + // Create test data + List data = Arrays.asList( + RowFactory.create(0, "a"), + RowFactory.create(1, "b"), + RowFactory.create(2, "c"), + RowFactory.create(3, "d"), + RowFactory.create(4, "e") + ); + + StructType schema = DataTypes.createStructType(new StructField[]{ + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false) + }); + + Dataset df = spark.createDataFrame(data, schema); + + // Write with SSTable version-based bridge enabled (default) + bulkWriterDataFrameWriter(df, TABLE_ENABLED).save(); + + // Verify data was written + SimpleQueryResult result = cluster.coordinator(1) + .executeWithResult("SELECT id, value FROM " + TABLE_ENABLED, + ConsistencyLevel.ALL); + Object[][] rows = result.toObjectArrays(); + + assertThat(rows.length).isEqualTo(data.size()); + + // Verify all expected IDs are present + List actualIds = Arrays.stream(rows) + .map(row -> (Integer) row[0]) + .sorted() + .collect(java.util.stream.Collectors.toList()); + List expectedIds = data.stream() + .map(row -> row.getInt(0)) + .sorted() + .collect(java.util.stream.Collectors.toList()); + assertThat(actualIds).isEqualTo(expectedIds); + } +} diff --git a/cassandra-analytics-sidecar-client/build.gradle b/cassandra-analytics-sidecar-client/build.gradle index cad4d9593..d0d23f53c 100644 --- a/cassandra-analytics-sidecar-client/build.gradle +++ b/cassandra-analytics-sidecar-client/build.gradle @@ -38,4 +38,23 @@ dependencies { implementation "org.slf4j:slf4j-api:${slf4jApiVersion}" api(project(path: ':analytics-sidecar-vertx-client-shaded', configuration: 'shadow')) + + // Test 
dependencies + testImplementation("org.junit.jupiter:junit-jupiter-api:${project.junitVersion}") + testImplementation("org.junit.jupiter:junit-jupiter-params:${project.junitVersion}") + testImplementation("org.junit.jupiter:junit-jupiter-engine:${project.junitVersion}") + testImplementation("org.assertj:assertj-core:${assertjCoreVersion}") + testImplementation(group: 'org.mockito', name: 'mockito-core', version: "${project.rootProject.mockitoVersion}") + testImplementation(group: 'org.mockito', name: 'mockito-inline', version: "${project.rootProject.mockitoVersion}") +} + +test { + useJUnitPlatform() + testLogging { + events "passed", "skipped", "failed" + showExceptions true + exceptionFormat "full" + showCauses true + showStackTraces true + } } diff --git a/cassandra-analytics-sidecar-client/src/main/java/org/apache/cassandra/clients/Sidecar.java b/cassandra-analytics-sidecar-client/src/main/java/org/apache/cassandra/clients/Sidecar.java index d5922c2d3..a12c9bb23 100644 --- a/cassandra-analytics-sidecar-client/src/main/java/org/apache/cassandra/clients/Sidecar.java +++ b/cassandra-analytics-sidecar-client/src/main/java/org/apache/cassandra/clients/Sidecar.java @@ -20,15 +20,18 @@ package org.apache.cassandra.clients; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import o.a.c.sidecar.client.shaded.common.response.GossipInfoResponse; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -50,6 +53,7 @@ import org.apache.cassandra.spark.common.model.CassandraInstance; import org.apache.cassandra.spark.data.FileType; import org.apache.cassandra.spark.utils.BuildInfo; +import org.apache.cassandra.spark.utils.FutureUtils; import org.apache.cassandra.spark.utils.MapUtils; import 
org.apache.cassandra.spark.validation.KeyStoreValidation; import org.apache.cassandra.spark.validation.StartupValidator; @@ -149,6 +153,79 @@ public static List> allNodeSettings(SidecarClien .collect(Collectors.toList()); } + /** + * Retrieve gossip info from all nodes on the cluster + * @param client Sidecar client + * @param instances all Sidecar instances + * @return completable futures with GossipInfoResponse; a future completes with null when the request to its instance fails + */ + public static List> gossipInfoFromAllNodes(SidecarClient client, + Set instances) + { + return instances.stream() + .map(instance -> client + .gossipInfo(instance) + .exceptionally(throwable -> { + LOGGER.warn(String.format("Failed to retrieve gossipinfo from instance=%s", + instance), throwable); + return null; + })) + .collect(Collectors.toList()); + } + + /** + * Retrieves SSTable versions from all nodes in the cluster via gossip info. + * This method fetches gossip information from all Sidecar instances and extracts + * the SSTable versions running on each node. + * + * @param client Sidecar client + * @param instances all Sidecar instances in the cluster + * @param maxRetryDelayMillis maximum delay in milliseconds between retries + * @param maxRetries maximum number of retry attempts + * @return a set of SSTable versions across all nodes in the cluster; an empty set if gossip + * info could not be retrieved from any node (a warning is logged instead of failing) + */ + public static Set getSSTableVersionsFromCluster(SidecarClient client, + Set instances, + long maxRetryDelayMillis, + int maxRetries) + { + LOGGER.debug("Retrieving SSTable versions from cluster via gossip..."); + + List> gossipInfoFutures = + gossipInfoFromAllNodes(client, instances); + + // Calculate total timeout + final long totalTimeout = maxRetryDelayMillis * maxRetries * gossipInfoFutures.size(); + + List gossipInfoResponses = + FutureUtils.bestEffortGet(gossipInfoFutures, totalTimeout, TimeUnit.MILLISECONDS); + + if (gossipInfoResponses.isEmpty()) + { + LOGGER.warn("Unable to retrieve gossip info from any
nodes. 0/{} instances available.", + gossipInfoFutures.size()); + // do not fail here, bridge determination logic checks for feature flag and proceeds accordingly + return Collections.emptySet(); + } + else if (gossipInfoResponses.size() < gossipInfoFutures.size()) + { + LOGGER.warn("{}/{} instances were used to retrieve gossip info and determine SSTable versions", + gossipInfoResponses.size(), gossipInfoFutures.size()); + } + + // Extract and collect SSTable versions from all gossip info responses + Set sstableVersions = gossipInfoResponses.stream() + .flatMap(response -> response.values().stream()) + .map(GossipInfoResponse.GossipInfo::sstableVersions) + .filter(Objects::nonNull) + .flatMap(List::stream) + .collect(Collectors.toSet()); + + LOGGER.info("Detected SSTable versions on cluster: {}", sstableVersions); + return sstableVersions; + } + public static SidecarInstance toSidecarInstance(CassandraInstance instance, int sidecarPort) { return new SidecarInstanceImpl(instance.nodeName(), sidecarPort); diff --git a/cassandra-analytics-sidecar-client/src/test/java/org/apache/cassandra/clients/SidecarTest.java b/cassandra-analytics-sidecar-client/src/test/java/org/apache/cassandra/clients/SidecarTest.java new file mode 100644 index 000000000..82b454868 --- /dev/null +++ b/cassandra-analytics-sidecar-client/src/test/java/org/apache/cassandra/clients/SidecarTest.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.clients; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.CompletableFuture; + +import org.junit.jupiter.api.Test; + +import o.a.c.sidecar.client.shaded.client.SidecarClient; +import o.a.c.sidecar.client.shaded.client.SidecarInstance; +import o.a.c.sidecar.client.shaded.client.SidecarInstanceImpl; +import o.a.c.sidecar.client.shaded.common.response.GossipInfoResponse; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * Unit tests for Sidecar utility methods + */ +public class SidecarTest +{ + @Test + void testGetSSTableVersionsFromClusterWithSingleNode() + { + SidecarClient client = mock(SidecarClient.class); + SidecarInstance instance = new SidecarInstanceImpl("localhost", 9043); + Set instances = Collections.singleton(instance); + + // Mock gossip response + GossipInfoResponse response = new GossipInfoResponse(); + GossipInfoResponse.GossipInfo info = createGossipInfo(Arrays.asList("big-na", "big-nb")); + response.put("localhost", info); + + when(client.gossipInfo(any(SidecarInstance.class))) + .thenReturn(CompletableFuture.completedFuture(response)); + + Set versions = Sidecar.getSSTableVersionsFromCluster(client, instances, 1000L, 3); + + assertThat(versions) + .describedAs("Should extract SSTable versions from gossip info") + 
.containsExactlyInAnyOrder("big-na", "big-nb"); + } + + @Test + void testGetSSTableVersionsFromClusterWithMultipleNodesAndMixedVersions() + { + SidecarClient client = mock(SidecarClient.class); + SidecarInstance instance1 = new SidecarInstanceImpl("node1", 9043); + SidecarInstance instance2 = new SidecarInstanceImpl("node2", 9043); + Set instances = new HashSet<>(Arrays.asList(instance1, instance2)); + + // Node1 has C* 4.0 versions (big-na, big-nb) + GossipInfoResponse response1 = new GossipInfoResponse(); + GossipInfoResponse.GossipInfo info1 = createGossipInfo(Arrays.asList("big-na", "big-nb")); + response1.put("node1", info1); + + // Node2 has mixed versions: one C* 4.0 (big-nb) and C* 5.0 versions (big-oa, bti-da) + GossipInfoResponse response2 = new GossipInfoResponse(); + GossipInfoResponse.GossipInfo info2 = createGossipInfo(Arrays.asList("big-nb", "big-oa", "bti-da")); + response2.put("node2", info2); + + when(client.gossipInfo(instance1)) + .thenReturn(CompletableFuture.completedFuture(response1)); + when(client.gossipInfo(instance2)) + .thenReturn(CompletableFuture.completedFuture(response2)); + + Set versions = Sidecar.getSSTableVersionsFromCluster(client, instances, 1000L, 3); + + assertThat(versions) + .describedAs("Should aggregate and deduplicate SSTable versions from multiple nodes with mixed Cassandra versions") + .containsExactlyInAnyOrder("big-na", "big-nb", "big-oa", "bti-da"); + } + + @Test + void testGetSSTableVersionsFromClusterWithFailedNode() + { + SidecarClient client = mock(SidecarClient.class); + SidecarInstance instance1 = new SidecarInstanceImpl("node1", 9043); + SidecarInstance instance2 = new SidecarInstanceImpl("node2", 9043); + Set instances = new HashSet<>(Arrays.asList(instance1, instance2)); + + // Node1 succeeds + GossipInfoResponse response1 = new GossipInfoResponse(); + GossipInfoResponse.GossipInfo info1 = createGossipInfo(Arrays.asList("big-na", "big-nb")); + response1.put("node1", info1); + + 
when(client.gossipInfo(instance1)) + .thenReturn(CompletableFuture.completedFuture(response1)); + + // Node2 fails + when(client.gossipInfo(instance2)) + .thenReturn(CompletableFuture.failedFuture(new RuntimeException("Connection failed"))); + + Set versions = Sidecar.getSSTableVersionsFromCluster(client, instances, 1000L, 3); + + assertThat(versions) + .describedAs("Should return versions from successful nodes even when some fail") + .containsExactlyInAnyOrder("big-na", "big-nb"); + } + + @Test + void testGetSSTableVersionsFromClusterWhenAllNodesFail() + { + SidecarClient client = mock(SidecarClient.class); + SidecarInstance instance1 = new SidecarInstanceImpl("node1", 9043); + SidecarInstance instance2 = new SidecarInstanceImpl("node2", 9043); + Set instances = new HashSet<>(Arrays.asList(instance1, instance2)); + + // Both nodes fail + when(client.gossipInfo(any(SidecarInstance.class))) + .thenReturn(CompletableFuture.failedFuture(new RuntimeException("Connection failed"))); + + Set versions = Sidecar.getSSTableVersionsFromCluster(client, instances, 1000L, 3); + + assertThat(versions) + .describedAs("Should return empty set when all nodes fail") + .isEmpty(); + } + + @Test + void testGetSSTableVersionsFromClusterWithNullSSTableVersions() + { + SidecarClient client = mock(SidecarClient.class); + SidecarInstance instance = new SidecarInstanceImpl("localhost", 9043); + Set instances = Collections.singleton(instance); + + // Mock gossip response with null SSTable versions + GossipInfoResponse response = new GossipInfoResponse(); + GossipInfoResponse.GossipInfo info = createGossipInfo(null); + response.put("localhost", info); + + when(client.gossipInfo(any(SidecarInstance.class))) + .thenReturn(CompletableFuture.completedFuture(response)); + + Set versions = Sidecar.getSSTableVersionsFromCluster(client, instances, 1000L, 3); + + assertThat(versions) + .describedAs("Should return empty set when SSTable versions are null") + .isEmpty(); + } + + @Test + void 
testGetSSTableVersionsFromClusterWithEmptyInstancesSet() + { + SidecarClient client = mock(SidecarClient.class); + Set instances = Collections.emptySet(); + + Set versions = Sidecar.getSSTableVersionsFromCluster(client, instances, 1000L, 3); + + assertThat(versions) + .describedAs("Should return empty set when no instances provided") + .isEmpty(); + } + + private GossipInfoResponse.GossipInfo createGossipInfo(List sstableVersions) + { + GossipInfoResponse.GossipInfo info = new GossipInfoResponse.GossipInfo(); + info.put("status", "NORMAL"); + info.put("rack", "RACK1"); + info.put("datacenter", "DC1"); + info.put("releaseVersion", "4.0.0"); + info.put("rpcAddress", "127.0.0.1"); + if (sstableVersions != null && !sstableVersions.isEmpty()) + { + info.put("sstableVersions", String.join(",", sstableVersions)); + } + return info; + } +} diff --git a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/io/sstable/format/bti/BtiReaderUtilsTest.java b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/io/sstable/format/bti/BtiReaderUtilsTest.java index 8539b3ff2..11236cda8 100644 --- a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/io/sstable/format/bti/BtiReaderUtilsTest.java +++ b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/io/sstable/format/bti/BtiReaderUtilsTest.java @@ -61,7 +61,7 @@ public class BtiReaderUtilsTest public void testPartitionIndexTokenRange(boolean compression) { // Only test BTI format for Cassandra 5.0+ - assumeThat(CassandraVersion.sstableFormat()).isEqualTo("bti"); + assumeThat(CassandraVersion.configuredSSTableFormat()).isEqualTo("bti"); qt().forAll(arbitrary().enumValues(Partitioner.class)) .checkAssert(partitioner -> { @@ -103,7 +103,7 @@ public void testPartitionIndexTokenRange(boolean compression) public void testPrimaryIndexContainsAnyKey(boolean compression) { // Only test BTI format for Cassandra 5.0+ - assumeThat(CassandraVersion.sstableFormat()).isEqualTo("bti"); + 
assumeThat(CassandraVersion.configuredSSTableFormat()).isEqualTo("bti"); qt().forAll(arbitrary().enumValues(Partitioner.class)) .checkAssert(partitioner -> { @@ -162,7 +162,7 @@ public void testPrimaryIndexContainsAnyKey(boolean compression) public void testReadPrimaryIndex(boolean compression) { // Only test BTI format for Cassandra 5.0+ - assumeThat(CassandraVersion.sstableFormat()).isEqualTo("bti"); + assumeThat(CassandraVersion.configuredSSTableFormat()).isEqualTo("bti"); qt().forAll(arbitrary().enumValues(Partitioner.class)) .checkAssert(partitioner -> { @@ -221,7 +221,7 @@ public void testReadPrimaryIndex(boolean compression) public void testConsumePrimaryIndex(boolean compression) { // Only test BTI format for Cassandra 5.0+ - assumeThat(CassandraVersion.sstableFormat()).isEqualTo("bti"); + assumeThat(CassandraVersion.configuredSSTableFormat()).isEqualTo("bti"); qt().forAll(arbitrary().enumValues(Partitioner.class)) .checkAssert(partitioner -> { diff --git a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SummaryDbTests.java b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SummaryDbTests.java index b16330e9b..17621e346 100644 --- a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SummaryDbTests.java +++ b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/spark/reader/SummaryDbTests.java @@ -86,7 +86,7 @@ public BigInteger tokenAt(int index) public void testSearchSummary() { // Summary.db file is present only in BIG sstables - assumeThat(CassandraVersion.sstableFormat()).isEqualTo("big"); + assumeThat(CassandraVersion.configuredSSTableFormat()).isEqualTo("big"); qt().forAll(arbitrary().enumValues(Partitioner.class)) .checkAssert(partitioner -> { @@ -149,7 +149,7 @@ public void testSearchSummary() public void testSummaryBinarySearch() { // Summary.db file is present only in BIG sstables - assumeThat(CassandraVersion.sstableFormat()).isEqualTo("big"); + 
assumeThat(CassandraVersion.configuredSSTableFormat()).isEqualTo("big"); SummaryDbUtils.TokenList list = new ArrayTokenList(LongStream.range(5, 10000).boxed().toArray(Long[]::new)); assertThat(SummaryDbUtils.binarySearchSummary(list, BigInteger.valueOf(154L))).isEqualTo(148); @@ -169,7 +169,7 @@ public void testSummaryBinarySearch() public void testSummaryBinarySearchSparse() { // Summary.db file is present only in BIG sstables - assumeThat(CassandraVersion.sstableFormat()).isEqualTo("big"); + assumeThat(CassandraVersion.configuredSSTableFormat()).isEqualTo("big"); SummaryDbUtils.TokenList list = new ArrayTokenList(5L, 10L, 15L, 20L, 25L); assertThat(SummaryDbUtils.binarySearchSummary(list, BigInteger.valueOf(-500L))).isEqualTo(0); diff --git a/cassandra-five-zero-types/src/main/java/org/apache/cassandra/bridge/CassandraTypesImplementation.java b/cassandra-five-zero-types/src/main/java/org/apache/cassandra/bridge/CassandraTypesImplementation.java index a13071c11..6d5804cbf 100644 --- a/cassandra-five-zero-types/src/main/java/org/apache/cassandra/bridge/CassandraTypesImplementation.java +++ b/cassandra-five-zero-types/src/main/java/org/apache/cassandra/bridge/CassandraTypesImplementation.java @@ -54,7 +54,7 @@ public static synchronized void setup(BridgeInitializationParameters params) config.diagnostic_events_enabled = false; config.max_mutation_size = new DataStorageSpec.IntKibibytesBound(config.commitlog_segment_size.toKibibytes() / 2); config.concurrent_compactors = 4; - config.sstable.selected_format = params.getSstableFormat(); + config.sstable.selected_format = params.getConfiguredSSTableFormat(); Path tempDirectory; try { From 422fca0985cd14303c953b40a7c80b8d727f88ec Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Tue, 16 Jun 2026 13:24:16 +0100 Subject: [PATCH 2/9] load all kryo versions --- .../apache/cassandra/spark/KryoRegister.java | 33 ++++++++----- .../CassandraClusterInfoGroup.java | 1 - .../cassandra/spark/KryoRegisterTest.java | 49 
+++++++++++++++++++ .../BulkWriterConfigExtensibilityTest.java | 3 +- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java index 054de0c7d..b22b671be 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java @@ -22,7 +22,10 @@ import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; @@ -31,7 +34,6 @@ import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.Serializer; -import org.apache.cassandra.bridge.BaseCassandraBridgeFactory; import org.apache.cassandra.bridge.BigNumberConfigImpl; import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; @@ -142,26 +144,33 @@ public static void setup(@NotNull SparkConf configuration) LOGGER.info("Setting up Kryo"); configuration.set(SPARK_SERIALIZER, "org.apache.spark.serializer.KryoSerializer"); - // Add KryoRegister to SparkConf serialization if not already there + // Preserve any pre-existing (e.g. user-supplied) registrators and keep them first. + // LinkedHashSet gives a stable, predictable registration order on the driver; the same + // resulting spark.kryo.registrator string is then propagated to all executors via SparkConf. Set registratorsSet = Arrays.stream(configuration.get(SPARK_REGISTRATORS, "").split(",")) .filter(string -> string != null && !string.isEmpty()) - .collect(Collectors.toSet()); - - // TODO: Find a better way to initialize Kryo serializer, instead of relaying - // on Cassandra version specified as parameter of Spark job. 
Can we get Cassandra version from Sidecar? - CassandraVersion cassandraVersion = BaseCassandraBridgeFactory.getCassandraVersion(configuration.get(CASSANDRA_VERSION, "4.0.0")); - Class registratorClass = KRYO_REGISTRATORS.get(cassandraVersion); - if (registratorClass == null) + .collect(Collectors.toCollection(LinkedHashSet::new)); + + // SSTable based bridge selection feature selects the bridge version, which may differ + // from cassandra.version; registering every loadable bridge's registrator ensures Spark + // can serialize objects for whichever bridge is chosen. Only implemented (bundled) versions + // are used, so we never attempt to load a bridge JAR that is not available. + List> registratorClasses = Arrays.stream(CassandraVersion.implementedVersions()) + .map(KRYO_REGISTRATORS::get) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + if (registratorClasses.isEmpty()) { - throw new IllegalArgumentException("Kryo registrator not configured for Cassandra version: " + cassandraVersion); + throw new IllegalStateException("No Kryo registrators configured for implemented Cassandra versions: " + + Arrays.toString(CassandraVersion.implementedVersions())); } - registratorsSet.add(registratorClass.getName()); + registratorClasses.forEach(registratorClass -> registratorsSet.add(registratorClass.getName())); String registratorsString = String.join(",", registratorsSet); LOGGER.info("Setting kryo registrators: " + registratorsString); configuration.set(SPARK_REGISTRATORS, registratorsString); - configuration.registerKryoClasses(new Class[]{registratorClass}); + configuration.registerKryoClasses(registratorClasses.toArray(new Class[0])); } public static class V40 extends KryoRegister diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java 
index 365cef5a0..39535b01e 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -34,7 +34,6 @@ import java.util.function.Function; import java.util.stream.Collectors; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Range; import org.apache.commons.lang3.StringUtils; diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java index a4e55bf3e..7113ae61e 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java @@ -19,9 +19,14 @@ package org.apache.cassandra.spark; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + import org.junit.jupiter.api.Test; import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.spark.SparkConf; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatNoException; @@ -92,4 +97,48 @@ void testValidateWithNullCassandraVersionString() .describedAs("Validation should work with null cassandraVersion string") .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FOURZERO, null)); } + + @Test + void testSetupRegistersAllImplementedVersions() + { + SparkConf conf = new SparkConf(); + KryoRegister.setup(conf); + + List expected = Arrays.stream(CassandraVersion.implementedVersions()) + .map(KryoRegister.KRYO_REGISTRATORS::get) + .map(Class::getName) + .collect(Collectors.toList()); + + // setup() must register a registrator for every 
implemented (bundled) bridge version, + // independent of spark.cassandra_analytics.cassandra.version, so serialization works for + // whichever bridge the SSTable-version analyzer selects at runtime. + assertThat(expected).isNotEmpty(); + List registrators = Arrays.asList(conf.get("spark.kryo.registrator").split(",")); + assertThat(registrators).containsAll(expected); + } + + @Test + void testSetupDoesNotDependOnCassandraVersionConfig() + { + // Even with a cassandra.version that differs from the bridge that may be selected, + // setup() registers all implemented versions and never throws based on that config. + SparkConf conf = new SparkConf() + .set("spark.cassandra_analytics.cassandra.version", "5.0.0"); + assertThatNoException().isThrownBy(() -> KryoRegister.setup(conf)); + assertThat(conf.get("spark.serializer")).isEqualTo("org.apache.spark.serializer.KryoSerializer"); + } + + @Test + void testSetupPreservesExistingRegistrators() + { + SparkConf conf = new SparkConf() + .set("spark.kryo.registrator", "com.example.CustomRegistrator"); + KryoRegister.setup(conf); + + List registrators = Arrays.asList(conf.get("spark.kryo.registrator").split(",")); + assertThat(registrators).contains("com.example.CustomRegistrator"); + assertThat(registrators.get(0)) + .describedAs("pre-existing registrators should be kept first (deterministic order)") + .isEqualTo("com.example.CustomRegistrator"); + } } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java index 25d867fa1..15cf046d4 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java @@ -95,7 +95,8 @@ void 
testReconstructJobInfoOnExecutorCanBeOverridden() BulkSparkConf mockConf = mock(BulkSparkConf.class); BroadcastableJobInfo mockJobInfo = mock(BroadcastableJobInfo.class); BroadcastableSchemaInfo mockSchemaInfo = mock(BroadcastableSchemaInfo.class); - BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mock(IBroadcastableClusterInfo.class), mockSchemaInfo, CassandraVersion.FOURZERO); + BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mock(IBroadcastableClusterInfo.class), + mockSchemaInfo, CassandraVersion.FOURZERO); // Subclass that overrides reconstructJobInfoOnExecutor to return custom JobInfo TestBulkWriterContext context = new TestBulkWriterContext(config) From 0d9f186af3726c5b8e1e0d790e98a34b41a51fc9 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Tue, 16 Jun 2026 15:38:10 +0100 Subject: [PATCH 3/9] minor --- .../bridge/SSTableVersionAnalyzer.java | 15 ++++++ .../apache/cassandra/spark/KryoRegister.java | 4 +- .../spark/bulkwriter/BulkSparkConf.java | 12 +++-- .../bulkwriter/CassandraClusterInfo.java | 7 ++- .../CassandraClusterInfoGroup.java | 2 +- .../spark/data/CassandraDataLayer.java | 35 ++++++-------- .../cassandra/spark/KryoRegisterTest.java | 2 +- .../bulkwriter/CassandraClusterInfoTest.java | 2 - .../CassandraDataLayerValidationTest.java | 46 +++---------------- 9 files changed, 55 insertions(+), 70 deletions(-) diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java index a53fe8688..f565c3f17 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java @@ -196,6 +196,21 @@ public static String findHighestSSTableVersion(Set versions) throw new IllegalStateException("SSTable versions set cannot be 
empty"); } + // The Comparator below is never invoked for a single-element set, so an unknown version + // would otherwise slip through. Validate the sole element here; multi-element sets are + // fully validated by the Comparator (every element participates in at least one comparison). + if (versions.size() == 1) + { + String only = versions.iterator().next(); + if (!CassandraVersion.fromSSTableVersion(only).isPresent()) + { + throw new IllegalStateException( + String.format("Unknown SSTable version: %s. Cannot determine Cassandra version. " + + "To retry the job using a fallback Cassandra version, " + + "set %s=true", only, DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + } + } + Comparator sstableVersionComparator = (v1, v2) -> { // Find which CassandraVersion each SSTable version belongs to Optional v1Opt = CassandraVersion.fromSSTableVersion(v1); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java index b22b671be..087530be8 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java @@ -92,7 +92,7 @@ public static void addSerializer(@NotNull Class type, @NotNull Serializer * * @param bridgeVersion the CassandraVersion to validate * @param clusterCassandraVersion optional Cassandra version string for additional context in error messages - * @throws IllegalStateException if no Kryo registrator is configured for this version + * @throws IllegalStateException if no Kryo registrator is registered for this version */ public static void validateKryoRegistratorExists(@NotNull CassandraVersion bridgeVersion, String clusterCassandraVersion) @@ -106,7 +106,7 @@ public static void validateKryoRegistratorExists(@NotNull CassandraVersion bridg .collect(Collectors.joining(", ")); throw new IllegalStateException( - 
String.format("No Kryo registrator configured for bridge version %s (%s). " + + String.format("No Kryo registrator registered for bridge version %s (%s). " + "Cluster Cassandra version: %s. " + "Available Kryo registrators: %s. " + "Cannot proceed with job as Spark serialization will fail. " + diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java index 555f321e5..f088e0547 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java @@ -50,6 +50,7 @@ import org.apache.cassandra.spark.utils.BuildInfo; import org.apache.cassandra.spark.utils.MapUtils; import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -746,14 +747,19 @@ public boolean isSSTableVersionBasedBridgeDisabled() * Utility method to retrieve the disable SSTable version-based bridge flag * from Spark configuration. This can be called from contexts where a BulkSparkConf * instance is not available. + *

+ * NOTE: must only be called on the driver. It relies on {@code SparkContext.getOrCreate()}, + * which has no active context on executors and would attempt to build a new (master-less) + * context there. The reader propagates the resolved decision to executors via the + * serialized {@code sstableVersionsOnCluster} set (empty == disabled). * * @return true if SSTable version-based bridge selection should be disabled */ public static boolean getDisableSSTableVersionBasedBridge() { - return org.apache.spark.SparkContext.getOrCreate() - .getConf() - .getBoolean(DISABLE_SSTABLE_VERSION_BASED_BRIDGE, false); + return SparkContext.getOrCreate() + .getConf() + .getBoolean(DISABLE_SSTABLE_VERSION_BASED_BRIDGE, false); } public StorageClientConfig getStorageClientConfig() diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java index 207bbc96b..ddda0fc4a 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java @@ -71,7 +71,7 @@ * and includes the result in the {@link BulkWriterConfig} that gets broadcast. *

* On executors, a new instance is reconstructed from {@link BroadcastableClusterInfo} - * using {@link #CassandraClusterInfo(BroadcastableClusterInfo)}, reusing broadcast-safe + * using {@link #CassandraClusterInfo(BroadcastableClusterInfo, CassandraVersion)}, reusing broadcast-safe * fields and fetching other data fresh from Sidecar. * * @see BroadcastableClusterInfo for the broadcast-safe subset of fields @@ -437,6 +437,11 @@ private synchronized List resolveAllNodeSettings() return resolvedNodeSettings; } + if (allNodeSettingFutures == null) + { + throw new IllegalStateException("allNodeSettingFutures is null"); + } + final long totalTimeout = conf.getSidecarRequestMaxRetryDelayMillis() * conf.getSidecarRequestRetries() * allNodeSettingFutures.size(); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java index 39535b01e..1b3987834 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -115,7 +115,7 @@ public static CassandraClusterInfoGroup from(BroadcastableClusterInfoGroup broad } /** - * Similar to {@link #fromBulkSparkConf(BulkSparkConf, CassandraVersion)} (BulkSparkConf)} but takes additional function to create {@link ClusterInfo} + * Similar to {@link #fromBulkSparkConf(BulkSparkConf, CassandraVersion)} but takes additional function to create {@link ClusterInfo} */ public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf, Function clusterInfoFactory) { diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java index ee4b47720..7df728b1d 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java @@ -359,10 +359,13 @@ protected CassandraVersion initializeSSTableVersionsAndBridgeVersion(String cass boolean isSSTableVersionBasedBridgeDisabled = isSSTableVersionBasedBridgeDisabled(); // Get SSTable versions from cluster only if SSTable version-based selection is enabled - // If disabled, skip retrieval to allow job to proceed even when SSTable version detection fails + // If disabled, use an empty set (never null) so downstream code - including executor-side + // validation and serialization - needs no null handling. On executors an empty set is the + // signal that the feature was disabled on the driver. if (isSSTableVersionBasedBridgeDisabled) { - this.sstableVersionsOnCluster = null; + //Use a HashSet, because Kryo serializer reads back via kryo.readObject(in, HashSet.class) + this.sstableVersionsOnCluster = new HashSet<>(); } else { @@ -945,7 +948,7 @@ void validateSStableVersions(Set sstableVersionsOnCluster, // Skip validation when fallback mode is enabled if (isSSTableVersionBasedBridgeDisabled) { - LOGGER.debug("Skipping SSTable version validation on driver - fallback mode enabled", + LOGGER.debug("Skipping SSTable version validation on driver - fallback mode enabled via {}=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); return; } @@ -1000,27 +1003,17 @@ void validateSStableVersions(Set sstableVersionsOnCluster, @VisibleForTesting void validateSStableVersions(List sstables) { - // Check if user has explicitly disabled SSTable version-based selection - boolean isSSTableVersionBasedBridgeDisabled = isSSTableVersionBasedBridgeDisabled(); - - // Skip validation when fallback mode is enabled - if 
(isSSTableVersionBasedBridgeDisabled) - { - LOGGER.debug("Skipping SSTable version validation on executor - " + - "fallback mode enabled via {}=true", - BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); - return; - } - Set expectedVersions = this.sstableVersionsOnCluster; + // An empty (or null) expected set means SSTable version-based bridge selection was disabled + // on the driver. In enabled mode the driver fails fast when no versions are observed, so + // executors never see an empty set in that mode; hence empty here == disabled -> skip. + // This avoids reading Spark configuration on executors (no SparkContext is available there). if (expectedVersions == null || expectedVersions.isEmpty()) { - // Fail fast with helpful error message - throw new IllegalStateException( - "Unable to validate SSTable versions - no expected versions available from cluster. " + - "This is required for SSTable version-based bridge selection. " + - "If you want to bypass this check and use cassandra.version for bridge selection, " + - "set " + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE + "=true"); + LOGGER.debug("Skipping SSTable version validation on executor - no expected versions; " + + "SSTable version-based bridge selection is disabled (set {}=true)", + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); + return; } for (SSTable ssTable : sstables) diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java index 7113ae61e..b6ab06dbd 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java @@ -66,7 +66,7 @@ void testValidateKryoRegistratorMissingForThreeZero() { assertThatThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.THREEZERO, "3.0.0")) 
.isInstanceOf(IllegalStateException.class) - .hasMessageContaining("No Kryo registrator configured for bridge version THREEZERO") + .hasMessageContaining("No Kryo registrator registered for bridge version THREEZERO") .hasMessageContaining("Cluster Cassandra version: 3.0.0") .hasMessageContaining("Available Kryo registrators:") .hasMessageContaining("FOURZERO") diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java index 498a2119b..7c848a4f7 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java @@ -108,6 +108,4 @@ private void mockCassandraContext(int allowanceMinutes, Instant remoteNow) when(cassandraContext.sidecarPort()).thenReturn(9043); } } - } - diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java index 68bf29ad6..5a04971f3 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java @@ -239,33 +239,19 @@ void testValidateSStableVersionsListWithUnexpectedBtiVersion() @Test void testValidateSStableVersionsListSkipsValidationWhenFeatureDisabled() { - Set expectedVersions = new HashSet<>(List.of("big-na")); - // Create a special test instance that returns true for isSSTableVersionBasedBridgeDisabled - CassandraDataLayer dataLayer = new TestCassandraDataLayerWithFeatureDisabled(expectedVersions); + // When the feature is disabled the driver leaves 
sstableVersionsOnCluster empty; on the executor + // an empty expected set is the signal that the feature is disabled, so validation is skipped. + CassandraDataLayer dataLayer = createTestDataLayerWithVersions(Collections.emptySet()); - // Create SSTable with unexpected version + // SSTable has a version that would be rejected if validation actually ran SSTable ssTable = createMockSSTable("big", "oa", "test-big-oa-Data.db"); List sstables = Collections.singletonList(ssTable); - // Should NOT throw exception because feature is disabled assertThatNoException() - .describedAs("Validation should be skipped when feature is disabled") + .describedAs("Validation should be skipped when feature is disabled (empty expected versions)") .isThrownBy(() -> dataLayer.validateSStableVersions(sstables)); } - @Test - void testValidateSStableVersionsListWithEmptyExpectedVersionsThrowsException() - { - CassandraDataLayer dataLayer = createTestDataLayerWithVersions(Collections.emptySet()); - - SSTable ssTable = createMockSSTable("big", "na", "test-big-na-Data.db"); - List sstables = Collections.singletonList(ssTable); - - assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("Unable to validate SSTable versions - no expected versions available from cluster"); - } - // Tests for initializeSSTableVersionsAndBridgeVersion (called from initBulkReader - lines 283-313) @Test @@ -297,8 +283,8 @@ void testInitializeSSTableVersionsAndBridgeVersionWithFeatureDisabled() // Should return FOURZERO bridge version (fallback to cassandra.version) assertThat((Object) result).isEqualTo(CassandraVersion.FOURZERO); - // Should set sstableVersionsOnCluster to null (skipped retrieval) - assertThat((Object) dataLayer.sstableVersionsOnCluster).isNull(); + // Should set sstableVersionsOnCluster to an empty set (skipped retrieval, never null) + assertThat(dataLayer.sstableVersionsOnCluster).isEmpty(); } @Test @@ -422,24 +408,6 
@@ protected boolean isSSTableVersionBasedBridgeDisabled() } } - /** - * Test subclass with feature disabled to test the skip validation path - */ - private static class TestCassandraDataLayerWithFeatureDisabled extends TestCassandraDataLayer - { - TestCassandraDataLayerWithFeatureDisabled(Set sstableVersions) - { - super(sstableVersions); - } - - @Override - protected boolean isSSTableVersionBasedBridgeDisabled() - { - // Return true to test the feature-disabled path - return true; - } - } - /** * Test subclass for testing initializeSSTableVersionsAndBridgeVersion method * Mocks the Sidecar interaction to avoid actual cluster calls From b5e13479cc3c0107b77bfbe329525ce90c97b9be Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Wed, 17 Jun 2026 20:11:02 +0100 Subject: [PATCH 4/9] sstable format verification --- ...eaderSSTableVersionBridgeDisabledTest.java | 107 ------- ...ReaderSSTableVersionBridgeEnabledTest.java | 160 ----------- ...ipBigSSTableVersionBridgeDisabledTest.java | 33 +++ ...ripBigSSTableVersionBridgeEnabledTest.java | 33 +++ ...ipBtiSSTableVersionBridgeDisabledTest.java | 39 +++ ...ripBtiSSTableVersionBridgeEnabledTest.java | 39 +++ ...RoundtripSSTableVersionBridgeTestBase.java | 268 ++++++++++++++++++ ...riterSSTableVersionBridgeDisabledTest.java | 117 -------- ...WriterSSTableVersionBridgeEnabledTest.java | 109 ------- ...SharedClusterSparkIntegrationTestBase.java | 74 +++++ 10 files changed, 486 insertions(+), 493 deletions(-) delete mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java delete mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java create mode 100644 
cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeEnabledTest.java create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeEnabledTest.java create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java delete mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java delete mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java deleted file mode 100644 index 6a601a3d2..000000000 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeDisabledTest.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.cassandra.analytics; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.stream.Collectors; - -import org.junit.jupiter.api.Test; - -import org.apache.cassandra.distributed.api.ConsistencyLevel; -import org.apache.cassandra.distributed.api.IInstance; -import org.apache.cassandra.sidecar.testing.QualifiedName; -import org.apache.cassandra.testing.ClusterBuilderConfiguration; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; - -import static org.apache.cassandra.testing.TestUtils.DC1_RF3; -import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; -import static org.assertj.core.api.Assertions.assertThat; - -/** - * Integration tests for SSTable version-based bridge selection in bulk reader (feature DISABLED) - * This test class has the feature flag disabled via SparkConf override. 
- */ -class BulkReaderSSTableVersionBridgeDisabledTest extends SharedClusterSparkIntegrationTestBase -{ - static final List DATASET = Arrays.asList("alpha", "beta", "gamma", "delta", "epsilon"); - static final QualifiedName TABLE_DISABLED = new QualifiedName(TEST_KEYSPACE, "test_disabled"); - - @Override - protected ClusterBuilderConfiguration testClusterConfiguration() - { - return super.testClusterConfiguration() - .nodesPerDc(3); - } - - @Override - protected SparkConf getOrCreateSparkConf() - { - SparkConf conf = super.getOrCreateSparkConf(); - // Disable SSTable version-based bridge selection for this entire test class - conf.set("spark.cassandra_analytics.bridge.disable_sstable_version_based", "true"); - return conf; - } - - @Override - protected void initializeSchemaForTest() - { - createTestKeyspace(TEST_KEYSPACE, DC1_RF3); - - // Create table - createTestTable(TABLE_DISABLED, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};"); - - IInstance firstRunningInstance = cluster.getFirstRunningInstance(); - - // Insert data for TABLE_DISABLED - for (int i = 0; i < DATASET.size(); i++) - { - String query = String.format("INSERT INTO %s (id, value) VALUES (%d, '%s');", TABLE_DISABLED, i, DATASET.get(i)); - firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); - } - - // Flush to create SSTables - cluster.stream().forEach(instance -> { - instance.nodetool("flush", TEST_KEYSPACE, TABLE_DISABLED.table()); - }); - } - - @Test - void testBulkReadWithSSTableVersionBasedBridgeDisabled() - { - // Read with feature disabled (fallback to cassandra.version) - Dataset df = bulkReaderDataFrame(TABLE_DISABLED).load(); - - List rows = df.collectAsList() - .stream() - .sorted(Comparator.comparing(row -> row.getInt(0))) - .collect(Collectors.toList()); - - assertThat(rows).hasSize(DATASET.size()); - for (int i = 0; i < DATASET.size(); i++) - { - assertThat(rows.get(i).getInt(0)).isEqualTo(i); - 
assertThat(rows.get(i).getString(1)).isEqualTo(DATASET.get(i)); - } - } -} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java deleted file mode 100644 index 295359fa6..000000000 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkReaderSSTableVersionBridgeEnabledTest.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.cassandra.analytics; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.stream.Collectors; - -import org.junit.jupiter.api.Test; - -import org.apache.cassandra.distributed.api.ConsistencyLevel; -import org.apache.cassandra.distributed.api.IInstance; -import org.apache.cassandra.sidecar.testing.QualifiedName; -import org.apache.cassandra.testing.ClusterBuilderConfiguration; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; - -import static org.apache.cassandra.testing.TestUtils.DC1_RF3; -import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; -import static org.assertj.core.api.Assertions.assertThat; - -/** - * Integration tests for SSTable version-based bridge selection in bulk reader (feature ENABLED) - * This test class has the feature explicitly enabled via SparkConf override. - */ -class BulkReaderSSTableVersionBridgeEnabledTest extends SharedClusterSparkIntegrationTestBase -{ - static final List DATASET = Arrays.asList("alpha", "beta", "gamma", "delta", "epsilon"); - static final QualifiedName TABLE_MULTIPLE = new QualifiedName(TEST_KEYSPACE, "test_multiple"); - static final QualifiedName TABLE_SNAPSHOT = new QualifiedName(TEST_KEYSPACE, "test_snapshot"); - - @Override - protected ClusterBuilderConfiguration testClusterConfiguration() - { - return super.testClusterConfiguration() - .nodesPerDc(3); - } - - @Override - protected SparkConf getOrCreateSparkConf() - { - SparkConf conf = super.getOrCreateSparkConf(); - // Explicitly enable SSTable version-based bridge selection for this entire test class - conf.set("spark.cassandra_analytics.bridge.disable_sstable_version_based", "false"); - return conf; - } - - @Override - protected void initializeSchemaForTest() - { - createTestKeyspace(TEST_KEYSPACE, DC1_RF3); - - // Create tables - createTestTable(TABLE_MULTIPLE, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH 
compression = {'enabled': false};"); - createTestTable(TABLE_SNAPSHOT, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};"); - - IInstance firstRunningInstance = cluster.getFirstRunningInstance(); - - // Insert data for TABLE_MULTIPLE (3 batches to create multiple SSTables) - for (int batch = 0; batch < 3; batch++) - { - for (int i = batch * 10; i < (batch + 1) * 10; i++) - { - String query = String.format("INSERT INTO %s (id, value) VALUES (%d, 'value_%d');", TABLE_MULTIPLE, i, i); - firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); - } - // Flush after each batch to create separate SSTables - cluster.stream().forEach(instance -> instance.nodetool("flush", TEST_KEYSPACE, TABLE_MULTIPLE.table())); - } - - // Insert data for TABLE_SNAPSHOT - for (int i = 0; i < DATASET.size(); i++) - { - String query = String.format("INSERT INTO %s (id, value) VALUES (%d, '%s');", TABLE_SNAPSHOT, i, DATASET.get(i)); - firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); - } - - // Flush TABLE_SNAPSHOT to create SSTables - cluster.stream().forEach(instance -> { - instance.nodetool("flush", TEST_KEYSPACE, TABLE_SNAPSHOT.table()); - }); - } - - @Test - void testBulkReadWithSSTableVersionBasedBridgeMultipleSSTables() - { - // Read with SSTable version-based bridge enabled - Dataset df = bulkReaderDataFrame(TABLE_MULTIPLE).load(); - - List rows = df.collectAsList(); - assertThat(rows).hasSize(30); // 3 batches * 10 rows each - - // Verify all rows are present - List ids = rows.stream() - .map(row -> row.getInt(0)) - .sorted() - .collect(Collectors.toList()); - for (int i = 0; i < 30; i++) - { - assertThat(ids.get(i)).isEqualTo(i); - } - } - - @Test - void testBulkReadWithSnapshot() - { - String snapshotName = "test_snapshot_" + System.currentTimeMillis(); - - // Create snapshot on all nodes - cluster.stream().forEach(instance -> { - instance.nodetool("snapshot", "-t", snapshotName, TEST_KEYSPACE); - }); - - 
try - { - // Read from snapshot with SSTable version-based bridge - Dataset df = bulkReaderDataFrame(TABLE_SNAPSHOT) - .option("snapshotName", snapshotName) - .load(); - - List rows = df.collectAsList() - .stream() - .sorted(Comparator.comparing(row -> row.getInt(0))) - .collect(Collectors.toList()); - - assertThat(rows).hasSize(DATASET.size()); - for (int i = 0; i < DATASET.size(); i++) - { - assertThat(rows.get(i).getInt(0)).isEqualTo(i); - assertThat(rows.get(i).getString(1)).isEqualTo(DATASET.get(i)); - } - } - finally - { - // Clean up snapshot - cluster.stream().forEach(instance -> { - instance.nodetool("clearsnapshot", "-t", snapshotName, TEST_KEYSPACE); - }); - } - } -} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java new file mode 100644 index 000000000..1e26dda5f --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.analytics; + +/** + * Runs the {@link BulkRoundtripSSTableVersionBridgeTestBase} roundtrip suite with the BIG (big-oa/big-nb) SSTable + * format and SSTable version-based bridge selection DISABLED (fallback to {@code cassandra.version}). + */ +class BulkRoundtripBigSSTableVersionBridgeDisabledTest extends BulkRoundtripSSTableVersionBridgeTestBase +{ + @Override + protected boolean sstableVersionBridgeEnabled() + { + return false; + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeEnabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeEnabledTest.java new file mode 100644 index 000000000..1de367750 --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeEnabledTest.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.analytics; + +/** + * Runs the {@link BulkRoundtripSSTableVersionBridgeTestBase} roundtrip suite with the BIG (big-oa/big-nb) SSTable + * format and SSTable version-based bridge selection ENABLED. + */ +class BulkRoundtripBigSSTableVersionBridgeEnabledTest extends BulkRoundtripSSTableVersionBridgeTestBase +{ + @Override + protected boolean sstableVersionBridgeEnabled() + { + return true; + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java new file mode 100644 index 000000000..f34af9fec --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.analytics; + +/** + * Runs the {@link BulkRoundtripSSTableVersionBridgeTestBase} roundtrip suite with the BTI (bti-da) SSTable format + * and SSTable version-based bridge selection DISABLED (fallback to {@code cassandra.version}). + */ +class BulkRoundtripBtiSSTableVersionBridgeDisabledTest extends BulkRoundtripSSTableVersionBridgeTestBase +{ + @Override + protected boolean sstableVersionBridgeEnabled() + { + return false; + } + + @Override + protected String sstableFormat() + { + return "bti"; + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeEnabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeEnabledTest.java new file mode 100644 index 000000000..b4f204ba9 --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeEnabledTest.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.analytics; + +/** + * Runs the {@link BulkRoundtripSSTableVersionBridgeTestBase} roundtrip suite with the BTI (bti-da) SSTable format + * and SSTable version-based bridge selection ENABLED. + */ +class BulkRoundtripBtiSSTableVersionBridgeEnabledTest extends BulkRoundtripSSTableVersionBridgeTestBase +{ + @Override + protected boolean sstableVersionBridgeEnabled() + { + return true; + } + + @Override + protected String sstableFormat() + { + return "bti"; + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java new file mode 100644 index 000000000..85d5e906f --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.analytics; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.sidecar.testing.QualifiedName; +import org.apache.cassandra.testing.ClusterBuilderConfiguration; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import static org.apache.cassandra.testing.TestUtils.DC1_RF3; +import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +/** + * Shared roundtrip integration tests for SSTable version-based bridge selection. The same test logic runs + * for both the feature-enabled and feature-disabled cases. + * + *

<p>Each test exercises a full write/read cycle and asserts the on-disk SSTable files use the format and + * version expected for the Cassandra version under test.</p>
+ * <ul>
+ *   <li>{@link #testBulkWriteReadRoundtrip()} writes via the bulk writer, then reads back via the bulk reader.</li>
+ *   <li>{@link #testBulkReadAcrossMultipleSSTables()} writes via CQL across several flushed SSTables, then bulk reads.</li>
+ *   <li>{@link #testBulkReadFromSnapshot()} writes via CQL, snapshots, then bulk reads from the snapshot.</li>
+ * </ul>
+ */ +abstract class BulkRoundtripSSTableVersionBridgeTestBase extends SharedClusterSparkIntegrationTestBase +{ + static final List DATASET = Arrays.asList("alpha", "beta", "gamma", "delta", "epsilon"); + static final QualifiedName TABLE_ROUNDTRIP = new QualifiedName(TEST_KEYSPACE, "test_roundtrip"); + static final QualifiedName TABLE_MULTIPLE = new QualifiedName(TEST_KEYSPACE, "test_multiple"); + static final QualifiedName TABLE_SNAPSHOT = new QualifiedName(TEST_KEYSPACE, "test_snapshot"); + + static final StructType SCHEMA = DataTypes.createStructType(new StructField[]{ + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false) + }); + + /** + * @return whether SSTable version-based bridge selection should be enabled for this test class + */ + protected abstract boolean sstableVersionBridgeEnabled(); + + /** + * @return the SSTable format the analytics bulk writer should produce ({@code big} by default; + * overridden to {@code bti} by the BTI variant). + */ + protected String sstableFormat() + { + return "big"; + } + + @Override + protected ClusterBuilderConfiguration testClusterConfiguration() + { + ClusterBuilderConfiguration conf = super.testClusterConfiguration() + .nodesPerDc(3); + // Preserve the base instance config (e.g. storage_compatibility_mode) and pin the node SSTable format. + Map instanceConfig = new HashMap<>(); + if (conf.additionalInstanceConfig != null) + { + instanceConfig.putAll(conf.additionalInstanceConfig); + } + instanceConfig.put("sstable.selected_format", sstableFormat()); + return conf.additionalInstanceConfig(instanceConfig); + } + + @Override + protected void beforeClusterProvisioning() + { + // Set before CassandraVersion class initialization (which reads this property once, statically) so the bulk + // writer generates SSTables in the configured format. 
This runs at the start of cluster provisioning, before + // any analytics bridge usage, and relies on forkEvery = 1 (the integration-test default) for a fresh JVM per + // test class. + System.setProperty("cassandra.analytics.bridges.sstable_format", sstableFormat()); + if ("bti".equals(sstableFormat())) + { + // BTI (bti-da) is a Cassandra 5.0+ format; skip on older versions. + assumeTrue(testVersion.version().startsWith("5."), + "BTI format (bti-da) requires Cassandra 5.0+, but test version is " + testVersion.version()); + } + } + + @Override + protected SparkConf getOrCreateSparkConf() + { + SparkConf conf = super.getOrCreateSparkConf(); + // The feature is toggled via the "disable" flag, so it is the inverse of the enabled state. + conf.set("spark.cassandra_analytics.bridge.disable_sstable_version_based", + String.valueOf(!sstableVersionBridgeEnabled())); + return conf; + } + + @Override + protected void initializeSchemaForTest() + { + createTestKeyspace(TEST_KEYSPACE, DC1_RF3); + createTestTable(TABLE_ROUNDTRIP, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};"); + createTestTable(TABLE_MULTIPLE, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};"); + createTestTable(TABLE_SNAPSHOT, "CREATE TABLE %s (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};"); + } + + @Test + void testBulkWriteReadRoundtrip() + { + SparkSession spark = getOrCreateSparkSession(); + + List data = Arrays.asList( + RowFactory.create(0, "a"), + RowFactory.create(1, "b"), + RowFactory.create(2, "c"), + RowFactory.create(3, "d"), + RowFactory.create(4, "e") + ); + Dataset dfWrite = spark.createDataFrame(data, SCHEMA); + + // Write through the bulk writer + bulkWriterDataFrameWriter(dfWrite, TABLE_ROUNDTRIP).save(); + flushKeyspace(); + + // The bulk writer should have produced SSTables in the expected format/version for this Cassandra version + assertExpectedSSTableFormat(TABLE_ROUNDTRIP); + + 
// Read the data back through the bulk reader and confirm the roundtrip + Dataset dfRead = bulkReaderDataFrame(TABLE_ROUNDTRIP).load(); + checkSmallDataFrameEquality(dfWrite, dfRead); + } + + @Test + void testBulkReadAcrossMultipleSSTables() + { + IInstance firstRunningInstance = cluster.getFirstRunningInstance(); + + // Insert in 3 batches, flushing between each to create multiple SSTables + for (int batch = 0; batch < 3; batch++) + { + for (int i = batch * 10; i < (batch + 1) * 10; i++) + { + String query = String.format("INSERT INTO %s (id, value) VALUES (%d, 'value_%d');", TABLE_MULTIPLE, i, i); + firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); + } + cluster.stream().forEach(instance -> instance.nodetool("flush", TEST_KEYSPACE, TABLE_MULTIPLE.table())); + } + + assertExpectedSSTableFormat(TABLE_MULTIPLE); + + Dataset df = bulkReaderDataFrame(TABLE_MULTIPLE).load(); + List ids = df.collectAsList() + .stream() + .map(row -> row.getInt(0)) + .sorted() + .collect(Collectors.toList()); + assertThat(ids).hasSize(30); // 3 batches * 10 rows each + for (int i = 0; i < 30; i++) + { + assertThat(ids.get(i)).isEqualTo(i); + } + } + + @Test + void testBulkReadFromSnapshot() + { + IInstance firstRunningInstance = cluster.getFirstRunningInstance(); + for (int i = 0; i < DATASET.size(); i++) + { + String query = String.format("INSERT INTO %s (id, value) VALUES (%d, '%s');", TABLE_SNAPSHOT, i, DATASET.get(i)); + firstRunningInstance.coordinator().execute(query, ConsistencyLevel.ALL); + } + cluster.stream().forEach(instance -> instance.nodetool("flush", TEST_KEYSPACE, TABLE_SNAPSHOT.table())); + + assertExpectedSSTableFormat(TABLE_SNAPSHOT); + + String snapshotName = "test_snapshot_" + System.currentTimeMillis(); + cluster.stream().forEach(instance -> instance.nodetool("snapshot", "-t", snapshotName, TEST_KEYSPACE)); + try + { + Dataset df = bulkReaderDataFrame(TABLE_SNAPSHOT) + .option("snapshotName", snapshotName) + .load(); + + List rows = 
df.collectAsList() + .stream() + .sorted(Comparator.comparing(row -> row.getInt(0))) + .collect(Collectors.toList()); + + assertThat(rows).hasSize(DATASET.size()); + for (int i = 0; i < DATASET.size(); i++) + { + assertThat(rows.get(i).getInt(0)).isEqualTo(i); + assertThat(rows.get(i).getString(1)).isEqualTo(DATASET.get(i)); + } + } + finally + { + cluster.stream().forEach(instance -> instance.nodetool("clearsnapshot", "-t", snapshotName, TEST_KEYSPACE)); + } + } + + private void flushKeyspace() + { + cluster.stream().forEach(instance -> instance.nodetool("flush", TEST_KEYSPACE)); + } + + /** + * Asserts the on-disk SSTables for the given table match the format produced by this test class: + * {@code bti-da} for the BTI variant (5.x only), otherwise {@code big} with the version expected for the + * Cassandra version under test ({@code oa} for 5.x, {@code nb} for 4.x). + */ + private void assertExpectedSSTableFormat(QualifiedName table) + { + if ("bti".equals(sstableFormat())) + { + assertSSTableFormatOnDisk(table, "bti", "da"); + } + else + { + String version = testVersion.version(); + String expectedSSTableVersion; + if (version.startsWith("5.")) + { + expectedSSTableVersion = "oa"; + } + else if (version.startsWith("4.")) + { + expectedSSTableVersion = "nb"; + } + else + { + throw new IllegalStateException("Unsupported Cassandra version for SSTable format assertion: " + version); + } + assertSSTableFormatOnDisk(table, "big", expectedSSTableVersion); + } + } +} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java deleted file mode 100644 index 5b97d60d1..000000000 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeDisabledTest.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * 
Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.cassandra.analytics; - -import java.util.Arrays; -import java.util.List; - -import org.junit.jupiter.api.Test; - -import org.apache.cassandra.distributed.api.ConsistencyLevel; -import org.apache.cassandra.distributed.api.SimpleQueryResult; -import org.apache.cassandra.sidecar.testing.QualifiedName; -import org.apache.cassandra.testing.ClusterBuilderConfiguration; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -import static org.apache.cassandra.testing.TestUtils.DC1_RF3; -import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; -import static org.assertj.core.api.Assertions.assertThat; - -/** - * Integration tests for SSTable version-based bridge selection in bulk writer, with feature DISABLED - */ -class BulkWriterSSTableVersionBridgeDisabledTest extends SharedClusterSparkIntegrationTestBase -{ - static final QualifiedName TABLE_DISABLED = new 
QualifiedName(TEST_KEYSPACE, "test_disabled"); - - @Override - protected ClusterBuilderConfiguration testClusterConfiguration() - { - return super.testClusterConfiguration() - .nodesPerDc(3); - } - - @Override - protected SparkConf getOrCreateSparkConf() - { - SparkConf conf = super.getOrCreateSparkConf(); - // Disable SSTable version-based bridge selection for this entire test class - conf.set("spark.cassandra_analytics.bridge.disable_sstable_version_based", "true"); - return conf; - } - - @Override - protected void initializeSchemaForTest() - { - createTestKeyspace(TEST_KEYSPACE, DC1_RF3); - cluster.schemaChangeIgnoringStoppedInstances( - "CREATE TABLE " + TABLE_DISABLED + " (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};" - ); - } - - @Test - void testBulkWriteWithSSTableVersionBasedBridgeDisabled() - { - SparkSession spark = getOrCreateSparkSession(); - - // Create test data - List data = Arrays.asList( - RowFactory.create(0, "x"), - RowFactory.create(1, "y"), - RowFactory.create(2, "z") - ); - - StructType schema = DataTypes.createStructType(new StructField[]{ - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false) - }); - - Dataset df = spark.createDataFrame(data, schema); - - // Write with feature disabled (fallback to cassandra.version) - bulkWriterDataFrameWriter(df, TABLE_DISABLED).save(); - - // Verify data was written - SimpleQueryResult result = cluster.coordinator(1) - .executeWithResult("SELECT id, value FROM " + TABLE_DISABLED, - ConsistencyLevel.ALL); - Object[][] rows = result.toObjectArrays(); - - assertThat(rows.length).isEqualTo(data.size()); - - // Verify all expected IDs are present - List actualIds = Arrays.stream(rows) - .map(row -> (Integer) row[0]) - .sorted() - .collect(java.util.stream.Collectors.toList()); - List expectedIds = data.stream() - .map(row -> row.getInt(0)) - .sorted() - .collect(java.util.stream.Collectors.toList()); - 
assertThat(actualIds).isEqualTo(expectedIds); - } -} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java deleted file mode 100644 index f73254d0a..000000000 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriterSSTableVersionBridgeEnabledTest.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.cassandra.analytics; - -import java.util.Arrays; -import java.util.List; - -import org.junit.jupiter.api.Test; - -import org.apache.cassandra.distributed.api.ConsistencyLevel; -import org.apache.cassandra.distributed.api.SimpleQueryResult; -import org.apache.cassandra.sidecar.testing.QualifiedName; -import org.apache.cassandra.testing.ClusterBuilderConfiguration; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -import static org.apache.cassandra.testing.TestUtils.DC1_RF3; -import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; -import static org.assertj.core.api.Assertions.assertThat; - -/** - * Integration tests for SSTable version-based bridge selection in bulk writer (feature ENABLED) - */ -class BulkWriterSSTableVersionBridgeEnabledTest extends SharedClusterSparkIntegrationTestBase -{ - static final QualifiedName TABLE_ENABLED = new QualifiedName(TEST_KEYSPACE, "test_enabled"); - - @Override - protected ClusterBuilderConfiguration testClusterConfiguration() - { - return super.testClusterConfiguration() - .nodesPerDc(3); - } - - @Override - protected void initializeSchemaForTest() - { - createTestKeyspace(TEST_KEYSPACE, DC1_RF3); - cluster.schemaChangeIgnoringStoppedInstances( - "CREATE TABLE " + TABLE_ENABLED + " (id int PRIMARY KEY, value text) WITH compression = {'enabled': false};" - ); - } - - @Test - void testBulkWriteWithSSTableVersionBasedBridgeEnabled() - { - SparkSession spark = getOrCreateSparkSession(); - - // Create test data - List data = Arrays.asList( - RowFactory.create(0, "a"), - RowFactory.create(1, "b"), - RowFactory.create(2, "c"), - RowFactory.create(3, "d"), - RowFactory.create(4, "e") - ); - - StructType schema = DataTypes.createStructType(new 
StructField[]{ - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false) - }); - - Dataset df = spark.createDataFrame(data, schema); - - // Write with SSTable version-based bridge enabled (default) - bulkWriterDataFrameWriter(df, TABLE_ENABLED).save(); - - // Verify data was written - SimpleQueryResult result = cluster.coordinator(1) - .executeWithResult("SELECT id, value FROM " + TABLE_ENABLED, - ConsistencyLevel.ALL); - Object[][] rows = result.toObjectArrays(); - - assertThat(rows.length).isEqualTo(data.size()); - - // Verify all expected IDs are present - List actualIds = Arrays.stream(rows) - .map(row -> (Integer) row[0]) - .sorted() - .collect(java.util.stream.Collectors.toList()); - List expectedIds = data.stream() - .map(row -> row.getInt(0)) - .sorted() - .collect(java.util.stream.Collectors.toList()); - assertThat(actualIds).isEqualTo(expectedIds); - } -} diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SharedClusterSparkIntegrationTestBase.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SharedClusterSparkIntegrationTestBase.java index c6a98ca73..9c4d9e8fa 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SharedClusterSparkIntegrationTestBase.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/SharedClusterSparkIntegrationTestBase.java @@ -19,6 +19,10 @@ package org.apache.cassandra.analytics; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -26,6 +30,7 @@ import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.junit.jupiter.api.TestInstance; import 
org.junit.jupiter.api.TestInstance.Lifecycle; @@ -36,6 +41,7 @@ import io.vertx.junit5.VertxExtension; import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.distributed.api.IInstance; import org.apache.cassandra.sidecar.testing.QualifiedName; import org.apache.cassandra.sidecar.testing.SharedClusterIntegrationTestBase; import org.apache.spark.SparkConf; @@ -190,6 +196,74 @@ public void checkSmallDataFrameEquality(Dataset expected, Dataset actu } } + /** + * Asserts that every on-disk SSTable data file for the given table matches the expected SSTable format and + * version across all running nodes. Data file names follow the pattern + * {@code <version>-<generation>-<format>-Data.db} (e.g. {@code oa-1-big-Data.db}). The generation component is + * matched loosely since it may be sequence- or UUID-based depending on cluster configuration. + * + * @param table the table whose on-disk SSTables are inspected + * @param format the expected SSTable format (e.g. {@code big}) + * @param expectedVersion the expected SSTable version (e.g.
{@code oa} or {@code nb}) + */ + protected void assertSSTableFormatOnDisk(QualifiedName table, String format, String expectedVersion) + { + String dataFileRegex = expectedVersion + "-[^-]+-" + format + "-Data\\.db"; + boolean foundDataFiles = false; + for (int i = 1; i <= cluster.size(); i++) + { + IInstance instance = cluster.get(i); + if (instance.isShutdown()) + { + continue; + } + for (String fileName : findSSTableDataFiles(instance, table)) + { + foundDataFiles = true; + assertThat(fileName) + .as("SSTable data file for %s on node %d should be in %s format with version %s: %s", + table, i, format, expectedVersion, fileName) + .matches(dataFileRegex); + } + } + assertThat(foundDataFiles) + .as("Expected to find at least one SSTable data file for %s on a running node", table) + .isTrue(); + } + + /** + * Finds the names of all SSTable {@code *-Data.db} files belonging to the given table on a single node, + * scanning every configured data directory and scoping to the table's own data subdirectory. 
+ */ + private Set findSSTableDataFiles(IInstance instance, QualifiedName table) + { + String[] dataDirs = (String[]) instance.config().getParams().get("data_file_directories"); + Set dataFileNames = new HashSet<>(); + String tableDirPrefix = table.table() + "-"; + for (String dataDir : dataDirs) + { + Path keyspacePath = Paths.get(dataDir, table.keyspace()); + if (!Files.exists(keyspacePath)) + { + continue; + } + try (Stream walkStream = Files.walk(keyspacePath)) + { + walkStream.filter(Files::isRegularFile) + .filter(path -> path.getFileName().toString().endsWith("-Data.db")) + .filter(path -> path.getParent() != null + && path.getParent().getFileName().toString().startsWith(tableDirPrefix)) + .forEach(path -> dataFileNames.add(path.getFileName().toString())); + } + catch (IOException e) + { + throw new RuntimeException("Failed to list SSTable data files for " + table, e); + } + } + + return dataFileNames; + } + public void validateWritesWithDriverResultSet(List sparkData, ResultSet driverData, Function driverRowFormatter) { From f2c8e4397d9a9b74d0bfc744d57f9e819ccd265f Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Tue, 23 Jun 2026 14:26:46 +0100 Subject: [PATCH 5/9] resolve comments --- .../apache/cassandra/bridge/SSTableVersionAnalyzer.java | 8 ++++++-- .../spark/bulkwriter/AbstractBulkWriterContext.java | 5 ----- .../coordinated/CassandraClusterInfoGroup.java | 9 +++++---- .../CassandraCoordinatedBulkWriterContext.java | 2 +- .../apache/cassandra/spark/data/CassandraDataLayer.java | 5 ++++- .../coordinated/CassandraClusterInfoGroupTest.java | 7 +++---- .../BulkRoundtripSSTableVersionBridgeTestBase.java | 4 +++- 7 files changed, 22 insertions(+), 18 deletions(-) diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java index f565c3f17..98d5e272c 100644 --- 
a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java @@ -82,6 +82,9 @@ public static CassandraVersion determineBridgeVersionForWrite(Set sstabl if (supportsRequestedFormat) { + LOGGER.info("Determined bridge version: {} for write based on SSTable versions on the cluster: {}, " + + "requested SSTable format: '{}', lowest Cassandra version on the cluster: {}", + highestCassandraVersion.versionName(), sstableVersionsOnCluster, requestedFormat, cassandraVersion); return highestCassandraVersion; } else @@ -121,8 +124,9 @@ public static CassandraVersion determineBridgeVersionForRead(Set sstable // Find highest Cassandra version based on SSTable versions CassandraVersion bridgeVersion = findHighestCassandraVersion(sstableVersionsOnCluster); - LOGGER.debug("Determined bridge version {} for read based on SSTable versions on cluster: {}", - bridgeVersion.versionName(), sstableVersionsOnCluster); + LOGGER.info("Determined bridge version: {} for read based on SSTable versions on the cluster: {}, " + + "lowest Cassandra version on the cluster: {}", + bridgeVersion.versionName(), sstableVersionsOnCluster, cassandraVersion); return bridgeVersion; } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java index b78d140cf..d8e62b047 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java @@ -120,11 +120,6 @@ protected AbstractBulkWriterContext(@NotNull BulkSparkConf conf, conf.isSSTableVersionBasedBridgeDisabled() ); - logger.info("Selected bridge version: {}, lowest Cassandra version: {}, 
SSTable versions: {}", - this.bridgeVersion.versionName(), - lowestCassandraVersion, - sstableVersionsOnCluster); - // Validate that Kryo registrator exists for this bridge version KryoRegister.validateKryoRegistratorExists(this.bridgeVersion, lowestCassandraVersion); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java index 1b3987834..2846c26f7 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -90,12 +90,13 @@ public class CassandraClusterInfoGroup implements ClusterInfo, MultiClusterSuppo * Creates {@link CassandraClusterInfoGroup} with the list of {@link ClusterInfo} from {@link BulkSparkConf} and validation * The validation ensures non-empty list of {@link ClusterInfo}, where all objects have non-empty and unique clusterId * @param conf bulk write conf - * @param bridgeVersion bridge version (nullable for preliminary construction) * @return new {@link CassandraClusterInfoGroup} instance */ - public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf, CassandraVersion bridgeVersion) + public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf) { - return fromBulkSparkConf(conf, clusterId -> new CassandraClusterInfo(conf, clusterId, bridgeVersion)); + // bridgeVersion is null at construction time; the real version is applied later via setBridgeVersion(), + // once the cluster's SSTable versions have been read from the preliminary group. 
+ return fromBulkSparkConf(conf, clusterId -> new CassandraClusterInfo(conf, clusterId, null)); } @@ -115,7 +116,7 @@ public static CassandraClusterInfoGroup from(BroadcastableClusterInfoGroup broad } /** - * Similar to {@link #fromBulkSparkConf(BulkSparkConf, CassandraVersion)} but takes additional function to create {@link ClusterInfo} + * Similar to {@link #fromBulkSparkConf(BulkSparkConf)} but takes additional function to create {@link ClusterInfo} */ public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf, Function clusterInfoFactory) { diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java index 410a2ab27..a94eb1643 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java @@ -111,7 +111,7 @@ private CassandraClusterInfoGroup getOrCreatePreliminaryGroup(BulkSparkConf conf { if (preliminaryGroup == null) { - preliminaryGroup = CassandraClusterInfoGroup.fromBulkSparkConf(conf, (CassandraVersion) null); + preliminaryGroup = CassandraClusterInfoGroup.fromBulkSparkConf(conf); } return preliminaryGroup; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java index 7df728b1d..51c80517e 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java @@ -369,7 +369,10 
@@ protected CassandraVersion initializeSSTableVersionsAndBridgeVersion(String cass } else { - this.sstableVersionsOnCluster = retrieveSSTableVersionsFromCluster(); + // Wrap in a HashSet: retrieveSSTableVersionsFromCluster() may return Collections.emptySet() + // or an unspecified Set type from Collectors.toSet(), but Kryo reads this field back via + // kryo.readObject(in, HashSet.class), so the concrete type must be HashSet. + this.sstableVersionsOnCluster = new HashSet<>(retrieveSSTableVersionsFromCluster()); } // Determine bridge version diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java index 783f05968..796e6f098 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java @@ -34,7 +34,6 @@ import org.junit.jupiter.api.Test; import o.a.c.sidecar.client.shaded.common.response.TokenRangeReplicasResponse; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.BroadcastableClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.bulkwriter.CassandraClusterInfo; @@ -215,7 +214,7 @@ void testTimeSkewTooLarge() void testCreateClusterInfoListFailsDueToAbsentConfiguration() { BulkSparkConf conf = mock(BulkSparkConf.class); - assertThatThrownBy(() -> fromBulkSparkConf(conf, (CassandraVersion) null)) + assertThatThrownBy(() -> fromBulkSparkConf(conf)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("In order to create an instance of CassandraCoordinatedBulkWriterContext, " + "you must provide the 
appropriate coordinated write configuration by " + @@ -227,7 +226,7 @@ void testCreateClusterInfoListFailsDueToEmptyConfiguration() { BulkSparkConf conf = mock(BulkSparkConf.class, RETURNS_DEEP_STUBS); when(conf.coordinatedWriteConf().clusters()).thenReturn(Collections.emptyMap()); - assertThatThrownBy(() -> fromBulkSparkConf(conf, (CassandraVersion) null)) + assertThatThrownBy(() -> fromBulkSparkConf(conf)) .isInstanceOf(IllegalStateException.class) .hasMessageContaining("No cluster info is built from"); } @@ -238,7 +237,7 @@ void testCreateClusterInfoListFailsDueToEmptyClusterId() BulkSparkConf conf = mock(BulkSparkConf.class); CoordinatedWriteConf.ClusterConf clusterConf = new CoordinatedWriteConf.SimpleClusterConf(Collections.singletonList("localhost:9043"), "localDc"); when(conf.coordinatedWriteConf()).thenReturn(new CoordinatedWriteConf(Collections.singletonMap("", clusterConf))); - assertThatThrownBy(() -> fromBulkSparkConf(conf, (CassandraVersion) null)) + assertThatThrownBy(() -> fromBulkSparkConf(conf)) .isInstanceOf(IllegalStateException.class) .describedAs("The exception message should include the original json to help spot the wrong configuration (empty clusterId)") .hasMessage("Found coordinatedWriteConf with empty or null clusterId. 
" + diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java index 85d5e906f..20a773d79 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripSSTableVersionBridgeTestBase.java @@ -26,6 +26,7 @@ import java.util.Map; import java.util.stream.Collectors; +import com.vdurmont.semver4j.Semver; import org.junit.jupiter.api.Test; import org.apache.cassandra.distributed.api.ConsistencyLevel; @@ -110,7 +111,8 @@ protected void beforeClusterProvisioning() if ("bti".equals(sstableFormat())) { // BTI (bti-da) is a Cassandra 5.0+ format; skip on older versions. - assumeTrue(testVersion.version().startsWith("5."), + Semver version = new Semver(testVersion.version(), Semver.SemverType.LOOSE); + assumeTrue(version.isGreaterThanOrEqualTo(new Semver("5.0", Semver.SemverType.LOOSE)), "BTI format (bti-da) requires Cassandra 5.0+, but test version is " + testVersion.version()); } } From ccaebfa8b338a0cc1069663536c4d0a5eb9bc959 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Thu, 25 Jun 2026 17:55:55 +0100 Subject: [PATCH 6/9] resolve comments --- .../bridge/SSTableVersionAnalyzer.java | 46 +++---- .../apache/cassandra/spark/KryoRegister.java | 40 ------ .../bulkwriter/AbstractBulkWriterContext.java | 4 - .../spark/bulkwriter/BulkSparkConf.java | 2 +- .../bulkwriter/CassandraClusterInfo.java | 19 +++ .../CassandraClusterInfoGroup.java | 6 + .../spark/data/CassandraDataLayer.java | 14 +- .../bridge/SSTableVersionAnalyzerTest.java | 12 +- .../cassandra/spark/KryoRegisterTest.java | 49 ------- .../spark/bulkwriter/BulkSparkConfTest.java | 6 +- .../VersionFeatureLegacyBridgeTest.java | 
130 ++++++++++++++++++ .../CassandraDataLayerValidationTest.java | 12 +- ...ipBigSSTableVersionBridgeDisabledTest.java | 2 +- ...ipBtiSSTableVersionBridgeDisabledTest.java | 2 +- 14 files changed, 201 insertions(+), 143 deletions(-) rename {cassandra-analytics-common => cassandra-analytics-core}/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java (85%) rename {cassandra-analytics-common => cassandra-analytics-core}/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java (94%) create mode 100644 cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java similarity index 85% rename from cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java rename to cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java index 98d5e272c..bb55b64e5 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java @@ -26,6 +26,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; + /** * Analyzes SSTable versions on a cluster to determine the appropriate * Cassandra bridge to load for bulk write/read operations. 
@@ -36,8 +38,6 @@ public final class SSTableVersionAnalyzer { private static final Logger LOGGER = LoggerFactory.getLogger(SSTableVersionAnalyzer.class); - static final String DISABLE_SSTABLE_VERSION_BASED_BRIDGE = - "spark.cassandra_analytics.bridge.disable_sstable_version_based"; private SSTableVersionAnalyzer() { @@ -51,7 +51,7 @@ private SSTableVersionAnalyzer() * * @param sstableVersionsOnCluster Set of SSTable versions found on cluster nodes * @param requestedFormat User's requested format, example: "big" or "bti" - * @param cassandraVersion Cassandra version string for fallback + * @param cassandraVersion Cassandra version string for legacy version based bridge selection * @param isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination * @return CassandraVersion enum indicating which bridge to load * @throws UnsupportedOperationException if cluster doesn't support requested format @@ -62,11 +62,11 @@ public static CassandraVersion determineBridgeVersionForWrite(Set sstabl String cassandraVersion, boolean isSSTableVersionBasedBridgeDisabled) { - // Check for fallback mode - Optional fallback = resolveFallbackVersion(cassandraVersion, isSSTableVersionBasedBridgeDisabled); - if (fallback.isPresent()) + // Use legacy cassandra.version based bridge selection when the user has opted in via configuration + Optional version = resolveLegacyVersionBasedBridge(cassandraVersion, isSSTableVersionBasedBridgeDisabled); + if (version.isPresent()) { - return fallback.get(); + return version.get(); } // Validate SSTable versions are present @@ -102,7 +102,7 @@ public static CassandraVersion determineBridgeVersionForWrite(Set sstabl * - Highest SSTable version detected on cluster * * @param sstableVersionsOnCluster Set of SSTable versions found on cluster nodes - * @param cassandraVersion Cassandra version string for fallback + * @param cassandraVersion Cassandra version string for legacy version based bridge selection * @param 
isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination * @return CassandraVersion enum indicating which bridge to load * @throws IllegalStateException if SSTable versions are empty/unknown @@ -111,11 +111,11 @@ public static CassandraVersion determineBridgeVersionForRead(Set sstable String cassandraVersion, boolean isSSTableVersionBasedBridgeDisabled) { - // Check for fallback mode - Optional fallback = resolveFallbackVersion(cassandraVersion, isSSTableVersionBasedBridgeDisabled); - if (fallback.isPresent()) + // Use legacy cassandra.version based bridge selection when the user has opted in via configuration + Optional version = resolveLegacyVersionBasedBridge(cassandraVersion, isSSTableVersionBasedBridgeDisabled); + if (version.isPresent()) { - return fallback.get(); + return version.get(); } // Validate SSTable versions are present @@ -131,15 +131,15 @@ public static CassandraVersion determineBridgeVersionForRead(Set sstable return bridgeVersion; } - private static Optional resolveFallbackVersion(String cassandraVersion, - boolean isSSTableVersionBasedBridgeDisabled) + private static Optional resolveLegacyVersionBasedBridge(String cassandraVersion, + boolean isSSTableVersionBasedBridgeDisabled) { if (!isSSTableVersionBasedBridgeDisabled) { return Optional.empty(); } - LOGGER.info("SSTable version-based bridge selection is disabled via configuration. " + + LOGGER.info("SSTable version based bridge selection is disabled via configuration. " + "Using cassandra.version for bridge selection: {}", cassandraVersion); return Optional.of(CassandraVersion.fromVersion(cassandraVersion) .orElseThrow(() -> new UnsupportedOperationException( @@ -158,9 +158,9 @@ private static void ensureSSTableVersionsNotEmpty(Set sstableVersionsOnC { throw new IllegalStateException(String.format( "Unable to retrieve SSTable versions from cluster. " + - "This is required for SSTable version-based bridge selection. 
" + + "This is required for SSTable version based bridge selection. " + "If you want to bypass this check and use cassandra.version for bridge selection, " + - "set %s=true", DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + "set %s=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); } } @@ -178,10 +178,10 @@ private static CassandraVersion findHighestCassandraVersion(Set sstableV .orElseThrow(() -> new IllegalStateException( String.format("Unknown SSTable version: %s. Cannot determine bridge version. " + "SSTable versions on cluster: %s. " + - "To retry the job using a fallback Cassandra version, " + + "To retry using cassandra.version based bridge selection, " + "set %s=true", highestSSTableVersion, sstableVersionsOnCluster, - DISABLE_SSTABLE_VERSION_BASED_BRIDGE))); + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE))); } /** @@ -210,8 +210,8 @@ public static String findHighestSSTableVersion(Set versions) { throw new IllegalStateException( String.format("Unknown SSTable version: %s. Cannot determine Cassandra version. " + - "To retry the job using a fallback Cassandra version, " + - "set %s=true", only, DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + "To retry using cassandra.version based bridge selection, " + + "set %s=true", only, BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); } } @@ -225,8 +225,8 @@ public static String findHighestSSTableVersion(Set versions) String unknownVersion = !v1Opt.isPresent() ? v1 : v2; throw new IllegalStateException( String.format("Unknown SSTable version: %s. Cannot determine Cassandra version. 
" + - "To retry the job using a fallback Cassandra version, " + - "set %s=true", unknownVersion, DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + "To retry using cassandra.version based bridge selection, " + + "set %s=true", unknownVersion, BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); } CassandraVersion cv1 = v1Opt.get(); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java index 087530be8..a0cdff760 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/KryoRegister.java @@ -49,8 +49,6 @@ import org.apache.spark.serializer.KryoRegistrator; import org.jetbrains.annotations.NotNull; -import static org.apache.cassandra.spark.bulkwriter.BulkSparkConf.CASSANDRA_VERSION; - /** * Helper class to register classes for Kryo serialization */ @@ -85,44 +83,6 @@ public static void addSerializer(@NotNull Class type, @NotNull Serializer KRYO_SERIALIZERS.put(type, serializer); } - /** - * Validates that a Kryo registrator exists for the given Cassandra version. - * This should be called after bridge version determination to ensure that - * Spark can properly serialize objects for the selected bridge. 
- * - * @param bridgeVersion the CassandraVersion to validate - * @param clusterCassandraVersion optional Cassandra version string for additional context in error messages - * @throws IllegalStateException if no Kryo registrator is registered for this version - */ - public static void validateKryoRegistratorExists(@NotNull CassandraVersion bridgeVersion, - String clusterCassandraVersion) - { - Class registratorClass = KRYO_REGISTRATORS.get(bridgeVersion); - if (registratorClass == null) - { - // Build available versions list for suggestion - String availableKryoVersions = KRYO_REGISTRATORS.keySet().stream() - .map(v -> v.name() + " (" + v.versionName() + ")") - .collect(Collectors.joining(", ")); - - throw new IllegalStateException( - String.format("No Kryo registrator registered for bridge version %s (%s). " + - "Cluster Cassandra version: %s. " + - "Available Kryo registrators: %s. " + - "Cannot proceed with job as Spark serialization will fail. " + - "To resolve this issue, update the '%s' configuration property to match the bridge version: %s", - bridgeVersion.name(), - bridgeVersion.versionName(), - clusterCassandraVersion, - availableKryoVersions, - CASSANDRA_VERSION, - bridgeVersion.versionName())); - } - - LOGGER.debug("Validated Kryo registrator exists for bridge version {}: {}", - bridgeVersion.versionName(), registratorClass.getName()); - } - private final CassandraVersion cassandraVersion; protected KryoRegister(CassandraVersion cassandraVersion) diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java index d8e62b047..7645f2987 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java @@ -34,7 +34,6 @@ 
import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.bridge.SSTableVersionAnalyzer; -import org.apache.cassandra.spark.KryoRegister; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; @@ -120,9 +119,6 @@ protected AbstractBulkWriterContext(@NotNull BulkSparkConf conf, conf.isSSTableVersionBasedBridgeDisabled() ); - // Validate that Kryo registrator exists for this bridge version - KryoRegister.validateKryoRegistratorExists(this.bridgeVersion, lowestCassandraVersion); - // Build cluster info with determined bridge version this.clusterInfo = buildClusterInfo(this.bridgeVersion); this.clusterInfo.startupValidate(); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java index f088e0547..b1e44e830 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java @@ -120,7 +120,7 @@ public class BulkSparkConf implements Serializable public static final String CASSANDRA_VERSION = SETTING_PREFIX + "cassandra.version"; // Disable SSTable version-based bridge determination. When true, falls back to using cassandra.version for bridge selection. // This provides a safety fallback mechanism if SSTable version detection fails or encounters issues. 
- public static final String DISABLE_SSTABLE_VERSION_BASED_BRIDGE = SETTING_PREFIX + "bridge.disable_sstable_version_based"; + public static final String DISABLE_SSTABLE_VERSION_BASED_BRIDGE = SETTING_PREFIX + "bridge.disable_sstable_version_based"; public static final String HTTP_MAX_CONNECTIONS = SETTING_PREFIX + "request.max_connections"; public static final String HTTP_RESPONSE_TIMEOUT = SETTING_PREFIX + "request.response_timeout"; public static final String HTTP_CONNECTION_TIMEOUT = SETTING_PREFIX + "request.connection_timeout"; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java index ddda0fc4a..419800fe8 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java @@ -464,14 +464,33 @@ else if (allNodeSettings.size() < allNodeSettingFutures.size()) return resolvedNodeSettings; } + /** + * Allows a subclass to override the Cassandra version via a feature flag. + * Returns {@code null} by default, meaning the version is determined from the cluster. + * + * @return overridden Cassandra version, or {@code null} to determine it from the cluster + */ + public String getVersionFromFeature() + { + return null; + } + /** * Retrieves the lowest Cassandra version using the already-fired allNodeSettingFutures. * Reuses the existing CassandraContext instead of creating a separate one. + * If a version override is provided via {@link #getVersionFromFeature()}, that value is used instead. 
* * @return lowest Cassandra version string */ public String getLowestCassandraVersion() { + String versionFromFeature = getVersionFromFeature(); + if (versionFromFeature != null) + { + // Forcing writer to use a particular version + return versionFromFeature; + } + List allNodeSettings = resolveAllNodeSettings(); NodeSettings ns = allNodeSettings diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java index 2846c26f7..306e4634d 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -389,6 +389,12 @@ public void setBridgeVersion(CassandraVersion bridgeVersion) */ public String getLowestCassandraVersion() { + // Single cluster: return its version directly without aggregation + if (clusterInfos.size() == 1) + { + return ((CassandraClusterInfo) clusterInfos.get(0)).getLowestCassandraVersion(); + } + Map clusterVersions = new HashMap<>(); for (ClusterInfo ci : clusterInfos) { diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java index 51c80517e..bcc9d39af 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java @@ -78,7 +78,6 @@ import o.a.c.sidecar.client.shaded.client.SidecarInstanceImpl; import o.a.c.sidecar.client.shaded.client.SimpleSidecarInstancesProvider; import 
o.a.c.sidecar.client.shaded.client.exception.RetriesExhaustedException; -import org.apache.cassandra.spark.KryoRegister; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.common.SidecarInstanceFactory; import org.apache.cassandra.spark.common.SizingFactory; @@ -384,9 +383,6 @@ protected CassandraVersion initializeSSTableVersionsAndBridgeVersion(String cass // Validate SSTable versions validateSStableVersions(this.sstableVersionsOnCluster, bridgeVersion, isSSTableVersionBasedBridgeDisabled); - // Validate that Kryo registrator exists for this bridge version - KryoRegister.validateKryoRegistratorExists(bridgeVersion, cassandraVersion); - return bridgeVersion; } @@ -948,10 +944,10 @@ void validateSStableVersions(Set sstableVersionsOnCluster, CassandraVersion cassandraVersion, boolean isSSTableVersionBasedBridgeDisabled) { - // Skip validation when fallback mode is enabled + // Skip validation when legacy version-based bridge selection is enabled if (isSSTableVersionBasedBridgeDisabled) { - LOGGER.debug("Skipping SSTable version validation on driver - fallback mode enabled via {}=true", + LOGGER.debug("Skipping SSTable version validation on driver - legacy version-based bridge selection enabled via {}=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); return; } @@ -980,7 +976,7 @@ void validateSStableVersions(Set sstableVersionsOnCluster, "Detected unsupported SSTable version(s) %s for Cassandra version %s. " + "Supported versions: %s. " + "Observed SSTable versions in the cluster: %s. 
" + - "To retry the job using a fallback Cassandra version, " + + "To retry using cassandra.version based bridge selection, " + "set %s=true", unsupportedVersions, cassandraVersion, @@ -1014,7 +1010,7 @@ void validateSStableVersions(List sstables) if (expectedVersions == null || expectedVersions.isEmpty()) { LOGGER.debug("Skipping SSTable version validation on executor - no expected versions; " - + "SSTable version-based bridge selection is disabled (set {}=true)", + + "SSTable version based bridge selection is disabled (set {}=true)", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); return; } @@ -1030,7 +1026,7 @@ void validateSStableVersions(List sstables) String errorMessage = String.format( "SSTable '%s' has version '%s' which was not observed in cluster gossip info. " + "Expected versions from gossip: %s. " + - "To retry the job using a fallback Cassandra version, " + + "To retry using cassandra.version based bridge selection, " + "set %s=true", ssTableFileName, ssTableVersion, diff --git a/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java similarity index 94% rename from cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java rename to cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java index f517ce079..1ac88d138 100644 --- a/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java @@ -40,7 +40,7 @@ public class SSTableVersionAnalyzerTest { // --- determineBridgeVersionForWrite success cases (parameterized) --- - static Stream writeFallbackDisabledSuccessCases() + static Stream writeLegacyDisabledSuccessCases() { return Stream.of( 
Arguments.of(Collections.singleton("big-oa"), "big", "5.0.0", CassandraVersion.FIVEZERO), @@ -50,8 +50,8 @@ static Stream writeFallbackDisabledSuccessCases() } @ParameterizedTest - @MethodSource("writeFallbackDisabledSuccessCases") - void testDetermineBridgeVersionForWriteFallbackDisabled(Set versions, + @MethodSource("writeLegacyDisabledSuccessCases") + void testDetermineBridgeVersionForWriteLegacyDisabled(Set versions, String format, String cassandraVersion, CassandraVersion expected) @@ -85,7 +85,7 @@ void testDetermineBridgeVersionForWriteNullOrEmptyThrowsException(Set ve // --- determineBridgeVersionForWrite standalone tests --- @Test - void testDetermineBridgeVersionForWriteFallbackEnabled() + void testDetermineBridgeVersionForWriteLegacyEnabled() { CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForWrite( null, "big", "5.0.0", true @@ -106,7 +106,7 @@ void testDetermineBridgeVersionForWriteUnsupportedFormat() // --- determineBridgeVersionForRead standalone tests --- @Test - void testDetermineBridgeVersionForReadFallbackDisabled() + void testDetermineBridgeVersionForReadLegacyDisabled() { Set sstableVersions = Collections.singleton("big-oa"); CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForRead( @@ -116,7 +116,7 @@ void testDetermineBridgeVersionForReadFallbackDisabled() } @Test - void testDetermineBridgeVersionForReadFallbackEnabled() + void testDetermineBridgeVersionForReadLegacyEnabled() { CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForRead( null, "4.0.0", true diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java index b6ab06dbd..396e28703 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/KryoRegisterTest.java @@ -30,52 
+30,12 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatNoException; -import static org.assertj.core.api.Assertions.assertThatThrownBy; /** * Unit tests for KryoRegister */ public class KryoRegisterTest { - @Test - void testValidateKryoRegistratorExistsForFourZero() - { - assertThatNoException() - .describedAs("FOURZERO should have a Kryo registrator") - .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FOURZERO, "4.0.0")); - } - - @Test - void testValidateKryoRegistratorExistsForFourOne() - { - assertThatNoException() - .describedAs("FOURONE should have a Kryo registrator") - .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FOURONE, "4.1.0")); - } - - @Test - void testValidateKryoRegistratorExistsForFiveZero() - { - assertThatNoException() - .describedAs("FIVEZERO should have a Kryo registrator") - .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FIVEZERO, "5.0.0")); - } - - @Test - void testValidateKryoRegistratorMissingForThreeZero() - { - assertThatThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.THREEZERO, "3.0.0")) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("No Kryo registrator registered for bridge version THREEZERO") - .hasMessageContaining("Cluster Cassandra version: 3.0.0") - .hasMessageContaining("Available Kryo registrators:") - .hasMessageContaining("FOURZERO") - .hasMessageContaining("FOURONE") - .hasMessageContaining("FIVEZERO") - // should mention config param to update - .hasMessageContaining("spark.cassandra_analytics.cassandra.version"); - } - @Test void testKryoRegistratorClassesAreCorrect() { @@ -89,15 +49,6 @@ void testKryoRegistratorClassesAreCorrect() .isEqualTo(KryoRegister.V50.class); } - @Test - void testValidateWithNullCassandraVersionString() - { - // Should not throw - clusterCassandraVersion is optional for error message context - 
assertThatNoException() - .describedAs("Validation should work with null cassandraVersion string") - .isThrownBy(() -> KryoRegister.validateKryoRegistratorExists(CassandraVersion.FOURZERO, null)); - } - @Test void testSetupRegistersAllImplementedVersions() { diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java index 048718501..239f15410 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java @@ -401,7 +401,7 @@ void testDefaultDisableSSTableVersionBasedBridge() Map options = copyDefaultOptions(); BulkSparkConf conf = new BulkSparkConf(sparkConf, options); assertThat(conf.isSSTableVersionBasedBridgeDisabled()) - .describedAs("By default, SSTable version-based bridge should be enabled (false)") + .describedAs("By default, SSTable version based bridge should be enabled (false)") .isFalse(); } @@ -412,7 +412,7 @@ void testSSTableVersionBasedBridgeDisabledViaSparkConf() .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "true"); BulkSparkConf bulkConf = new BulkSparkConf(conf, defaultOptions); assertThat(bulkConf.isSSTableVersionBasedBridgeDisabled()) - .describedAs("SSTable version-based bridge should be disabled when configured") + .describedAs("SSTable version based bridge should be disabled when configured") .isTrue(); } @@ -423,7 +423,7 @@ void testSSTableVersionBasedBridgeEnabledExplicitly() .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "false"); BulkSparkConf bulkConf = new BulkSparkConf(conf, defaultOptions); assertThat(bulkConf.isSSTableVersionBasedBridgeDisabled()) - .describedAs("SSTable version-based bridge should be enabled") + .describedAs("SSTable version based bridge should be enabled") .isFalse(); } diff --git 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java new file mode 100644 index 000000000..69a25f5b9 --- /dev/null +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.spark.bulkwriter; + +import java.util.Collections; +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import o.a.c.sidecar.client.shaded.client.SidecarClient; +import o.a.c.sidecar.client.shaded.client.SidecarInstance; +import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.bridge.SSTableVersionAnalyzer; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit test that verifies the {@link CassandraClusterInfo#getVersionFromFeature()} override is honored in the + * legacy version-based bridge selection path (when SSTable version-based bridge selection is disabled). + * + *

<p>{@link CassandraClusterInfo#getLowestCassandraVersion()} returns the override when it is non-null, + * short-circuiting any Sidecar/node-settings lookup. In legacy mode that value is what + * {@link SSTableVersionAnalyzer#determineBridgeVersionForWrite} uses to select the bridge + * (see {@code SSTableVersionAnalyzerTest} for the full version-to-bridge coverage). + * + * <p>
The override is a subclass extension hook rather than a config flag, so it is exercised here via a + * test subclass. No mocking framework is used: the cluster info is backed by a lightweight in-memory + * {@link CassandraContext} (empty cluster, no Sidecar client). + */ +public class VersionFeatureLegacyBridgeTest +{ + @Test + void testFeatureOverrideDrivesLowestCassandraVersionWithoutQueryingCluster() + { + // The in-memory context has no nodes; if the override were ignored, getLowestCassandraVersion() would + // attempt to resolve node settings and fail, so returning the override proves it short-circuits. + CassandraClusterInfo clusterInfo = new FeatureOverrideClusterInfo("4.0.0"); + + assertThat(clusterInfo.getLowestCassandraVersion()) + .describedAs("getVersionFromFeature() override must take precedence over cluster-derived version") + .isEqualTo("4.0.0"); + } + + @Test + void testFeatureOverrideSelectsLegacyBridge() + { + // bridgeVersion field is intentionally FIVEZERO (see the subclass); the legacy bridge must instead + // come from the feature override (4.0.0 -> FOURZERO), proving the override is what drives selection. + CassandraClusterInfo clusterInfo = new FeatureOverrideClusterInfo("4.0.0"); + + // Mirror AbstractBulkWriterContext's legacy flow: sstableVersionsOnCluster is null when disabled, + // and the bridge is derived solely from getLowestCassandraVersion(). + CassandraVersion bridge = SSTableVersionAnalyzer.determineBridgeVersionForWrite( + null, "big", clusterInfo.getLowestCassandraVersion(), /* isSSTableVersionBasedBridgeDisabled */ true); + + assertThat(bridge) + .describedAs("In legacy mode the bridge must come from the feature-overridden version") + .isEqualTo(CassandraVersion.FOURZERO); + } + + /** + * {@link CassandraClusterInfo} that supplies a fixed {@link #getVersionFromFeature()} value and is backed + * by an in-memory {@link CassandraContext} so no real cluster connectivity is required. 
+ */ + private static final class FeatureOverrideClusterInfo extends CassandraClusterInfo + { + private final String featureVersion; + + FeatureOverrideClusterInfo(String featureVersion) + { + // bridgeVersion deliberately differs from the override version under test + super((BulkSparkConf) null, CassandraVersion.FIVEZERO); + this.featureVersion = featureVersion; + } + + @Override + protected CassandraContext buildCassandraContext() + { + return new InMemoryCassandraContext(); + } + + @Override + public String getVersionFromFeature() + { + return featureVersion; + } + } + + /** + * A {@link CassandraContext} with no real Sidecar connectivity: an empty cluster and a null client. + * {@code Sidecar.allNodeSettings} is invoked with an empty instance set during construction, so the + * null client is never dereferenced. + */ + private static final class InMemoryCassandraContext extends CassandraContext + { + InMemoryCassandraContext() + { + super(null, null); + } + + @Override + protected Set createClusterConfig() + { + return Collections.emptySet(); + } + + @Override + protected SidecarClient initializeSidecarClient(BulkSparkConf conf) + { + return null; + } + } +} diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java index 5a04971f3..41f5b73c8 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java @@ -91,19 +91,19 @@ void testValidateSStableVersionsWithEmptyVersionsThrowsException() } @Test - void testValidateSStableVersionsSkipsValidationWhenFallbackEnabled() + void testValidateSStableVersionsSkipsValidationWhenLegacyEnabled() { CassandraDataLayer dataLayer = createTestDataLayer(); - // Test with invalid 
versions - should not throw when fallback enabled + // Test with invalid versions - should not throw when legacy version-based bridge selection is enabled Set invalidVersions = new HashSet<>(List.of("invalid-version")); assertThatNoException() - .describedAs("Validation should be skipped with invalid versions when fallback mode is enabled") + .describedAs("Validation should be skipped with invalid versions when legacy version-based bridge selection is enabled") .isThrownBy(() -> dataLayer.validateSStableVersions(invalidVersions, CassandraVersion.FOURZERO, true)); - // Test with null versions - should not throw when fallback enabled + // Test with null versions - should not throw when legacy version-based bridge selection is enabled assertThatNoException() - .describedAs("Validation should be skipped with null versions when fallback enabled") + .describedAs("Validation should be skipped with null versions when legacy version-based bridge selection is enabled") .isThrownBy(() -> dataLayer.validateSStableVersions(null, CassandraVersion.FOURZERO, true)); } @@ -280,7 +280,7 @@ void testInitializeSSTableVersionsAndBridgeVersionWithFeatureDisabled() CassandraVersion result = dataLayer.initializeSSTableVersionsAndBridgeVersion("4.0.0"); - // Should return FOURZERO bridge version (fallback to cassandra.version) + // Should return FOURZERO bridge version (legacy cassandra.version-based bridge selection) assertThat((Object) result).isEqualTo(CassandraVersion.FOURZERO); // Should set sstableVersionsOnCluster to an empty set (skipped retrieval, never null) diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java index 1e26dda5f..c4c0bca5d 100644 --- 
a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBigSSTableVersionBridgeDisabledTest.java @@ -21,7 +21,7 @@ /** * Runs the {@link BulkRoundtripSSTableVersionBridgeTestBase} roundtrip suite with the BIG (big-oa/big-nb) SSTable - * format and SSTable version-based bridge selection DISABLED (fallback to {@code cassandra.version}). + * format and SSTable version-based bridge selection DISABLED (legacy {@code cassandra.version}-based bridge selection). */ class BulkRoundtripBigSSTableVersionBridgeDisabledTest extends BulkRoundtripSSTableVersionBridgeTestBase { diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java index f34af9fec..b08213f5e 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkRoundtripBtiSSTableVersionBridgeDisabledTest.java @@ -21,7 +21,7 @@ /** * Runs the {@link BulkRoundtripSSTableVersionBridgeTestBase} roundtrip suite with the BTI (bti-da) SSTable format - * and SSTable version-based bridge selection DISABLED (fallback to {@code cassandra.version}). + * and SSTable version-based bridge selection DISABLED (legacy {@code cassandra.version}-based bridge selection). 
*/ class BulkRoundtripBtiSSTableVersionBridgeDisabledTest extends BulkRoundtripSSTableVersionBridgeTestBase { From c0abd8eb6ea768a6d338ece2fe97bfd56a497563 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Fri, 26 Jun 2026 15:05:32 +0100 Subject: [PATCH 7/9] lowest version for write --- .../cassandra/bridge/CassandraVersion.java | 58 +---- .../bridge/CassandraVersionTest.java | 14 - .../bridge/SSTableVersionAnalyzer.java | 243 +++++------------- .../bulkwriter/AbstractBulkWriterContext.java | 39 +-- .../CassandraBulkWriterContext.java | 11 +- .../bulkwriter/CassandraClusterInfo.java | 45 +++- .../CassandraClusterInfoGroup.java | 60 ++--- ...CassandraCoordinatedBulkWriterContext.java | 11 +- .../spark/data/CassandraDataLayer.java | 97 ++----- .../bridge/SSTableVersionAnalyzerTest.java | 148 ++++------- .../AbstractBulkWriterContextTest.java | 63 +---- .../BulkWriterConfigExtensibilityTest.java | 8 +- ...CassandraClusterInfoBridgeVersionTest.java | 177 +++++++++++++ .../VersionFeatureLegacyBridgeTest.java | 130 ---------- .../CassandraClusterInfoGroupTest.java | 48 ++++ .../CassandraDataLayerValidationTest.java | 112 +------- 16 files changed, 457 insertions(+), 807 deletions(-) create mode 100644 cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java delete mode 100644 cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java index 1ef30dc0f..b668ee631 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java @@ -56,38 +56,42 @@ public enum CassandraVersion "big-md", "big-me", "big-mf" - }), + }, 30), 
FOURZERO(40, "4.0", "four-zero", new String[]{"big"}, new String[]{ // Cassandra 4.0 native sstable versions "big-na", "big-nb", - }), + }, 30), FOURONE(41, "4.1", "four-zero", new String[]{"big"}, new String[]{ // Cassandra 4.1 did not introduce new native SSTable versions - }), + }, 30), FIVEZERO(50, "5.0", "five-zero", new String[]{"big", "bti"}, new String[]{ // Cassandra 5.0 native sstable versions "big-oa", "bti-da", - }); + }, 40); private final int number; private final String name; private final String jarBaseName; // Must match shadowJar.archiveFileName from Gradle configuration (without extension) private final Set sstableFormats; private final List nativeSStableVersions; + // Lowest Cassandra version number whose SSTables this version can read (inclusive). + private final int lowestCompatibleVersionNumber; - CassandraVersion(int number, String name, String jarBaseName, String[] sstableFormats, String[] nativeSStableVersions) + CassandraVersion(int number, String name, String jarBaseName, String[] sstableFormats, String[] nativeSStableVersions, + int lowestCompatibleVersionNumber) { this.number = number; this.name = name; this.jarBaseName = jarBaseName; this.sstableFormats = new HashSet<>(Arrays.asList(sstableFormats)); this.nativeSStableVersions = List.of(nativeSStableVersions); + this.lowestCompatibleVersionNumber = lowestCompatibleVersionNumber; } public int versionNumber() @@ -127,29 +131,18 @@ public List getNativeSStableVersions() /** * Get the set of SSTable version strings that this Cassandra version can read. 
- * This includes: - * - Native versions for this Cassandra version - * - All SSTable versions from the previous major version (including all minor versions) - * For example, Cassandra 5.0 can read: - * - 5.0 native versions (big-oa, bti-da) - * - 4.0 versions (big-na, big-nb) - * - 4.1 versions (if any) - * But NOT 3.0 versions + * This includes the native versions of every Cassandra version from {@link #lowestCompatibleVersionNumber} + * (inclusive) up to this version (inclusive). For example, Cassandra 5.0 can read 4.0, 4.1 and 5.0 + * SSTables (lowestCompatible=40) but NOT 3.x. * * @return Set of full SSTable version strings that can be read */ public Set getSupportedSStableVersionsForRead() { - Set readableVersions = new HashSet<>(this.nativeSStableVersions); - - int previousMajor = getPreviousMajorVersion(); - - // Add all SSTable versions from the previous major version and its minors - // E.g., C* 5.0 (version 50) can read C* 4.0 (40) and C* 4.1 (41) SSTables, but not C* 3.x (30) + Set readableVersions = new HashSet<>(); for (CassandraVersion version : CassandraVersion.values()) { - // Include versions from the previous major version family (e.g., 40-49 for C* 5.0) - if (version.versionNumber() >= previousMajor && version.versionNumber() < this.number) + if (version.number >= lowestCompatibleVersionNumber && version.number <= this.number) { readableVersions.addAll(version.nativeSStableVersions); } @@ -158,29 +151,6 @@ public Set getSupportedSStableVersionsForRead() return Collections.unmodifiableSet(readableVersions); } - /** - * Get the previous major version number for this Cassandra version. 
- * Calculates dynamically using: (majorVersion - 1) * 10 - * For example: - * - C5.0 (50) returns 40 (C4.x) - * - C4.1 (41) returns 30 (C3.x) - * - C4.0 (40) returns 30 (C3.x) - * - C3.0 (30) returns 20 (C2.x - which doesn't exist) - * - C10.0 (100) returns 90 (C9.x) - * - * @return previous major version number - */ - @VisibleForTesting - int getPreviousMajorVersion() - { - // Get major version: 50 -> 5, 41 -> 4, 40 -> 4, 30 -> 3 - int majorVersion = this.number / 10; - - // Calculate previous major version: (majorVersion - 1) * 10 - // E.g., 5 -> 40, 4 -> 30, 3 -> 20 - return (majorVersion - 1) * 10; - } - private static final String configuredSSTableFormat; private static final CassandraVersion[] implementedVersions; private static final String[] supportedVersions; diff --git a/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java b/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java index 1f8c1a107..2158cb3fa 100644 --- a/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java +++ b/cassandra-analytics-common/src/test/java/org/apache/cassandra/bridge/CassandraVersionTest.java @@ -138,20 +138,6 @@ void testGetNativeSStableVersionsFourOneEmpty() assertThat(nativeVersions).isEmpty(); } - @ParameterizedTest - @CsvSource({ - "FIVEZERO, 40", - "FOURONE, 30", - "FOURZERO, 30", - "THREEZERO, 20" - }) - void testGetPreviousMajorVersion(String versionName, int expectedPrevious) - { - CassandraVersion version = CassandraVersion.valueOf(versionName); - int previous = version.getPreviousMajorVersion(); - assertThat(previous).isEqualTo(expectedPrevious); - } - @Test void testConfiguredSSTableFormatDefault() { diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java index bb55b64e5..d79ef1446 100644 --- 
a/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java @@ -20,232 +20,115 @@ package org.apache.cassandra.bridge; import java.util.Comparator; -import java.util.Optional; +import java.util.List; import java.util.Set; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; +import java.util.stream.Collectors; +import java.util.stream.Stream; /** - * Analyzes SSTable versions on a cluster to determine the appropriate - * Cassandra bridge to load for bulk write/read operations. - * - *

<p>This class provides logic to select Cassandra bridge based on the highest SSTable - * version detected on the cluster and the user's requested format preference.</p>

+ * Determines which Cassandra bridge to load from the SSTable versions present on a cluster. */ public final class SSTableVersionAnalyzer { - private static final Logger LOGGER = LoggerFactory.getLogger(SSTableVersionAnalyzer.class); - private SSTableVersionAnalyzer() { // Utility class } /** - * Determines which CassandraVersion bridge to load based on: - * - Highest SSTable version detected on cluster - * - User's format preference + * Determines the bridge version for bulk write from the SSTable versions present on the cluster. + * Picks the lowest Cassandra version present so the produced SSTables can be imported by every node. * - * @param sstableVersionsOnCluster Set of SSTable versions found on cluster nodes - * @param requestedFormat User's requested format, example: "big" or "bti" - * @param cassandraVersion Cassandra version string for legacy version based bridge selection - * @param isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination - * @return CassandraVersion enum indicating which bridge to load - * @throws UnsupportedOperationException if cluster doesn't support requested format - * @throws IllegalStateException if SSTable versions are empty/unknown + * @param sstableVersionsOnCluster set of SSTable versions found on cluster nodes + * @param requestedFormat user's requested format, e.g. 
"big" or "bti" + * @return the CassandraVersion bridge to load for write + * @throws IllegalStateException if the versions are empty/null or contain an unknown version + * @throws UnsupportedOperationException if the versions are mutually incompatible, or the chosen + * bridge does not support the requested format */ - public static CassandraVersion determineBridgeVersionForWrite(Set sstableVersionsOnCluster, - String requestedFormat, - String cassandraVersion, - boolean isSSTableVersionBasedBridgeDisabled) + public static CassandraVersion determineBridgeVersionForWrite(Set sstableVersionsOnCluster, String requestedFormat) { - // Use legacy cassandra.version based bridge selection when the user has opted in via configuration - Optional version = resolveLegacyVersionBasedBridge(cassandraVersion, isSSTableVersionBasedBridgeDisabled); - if (version.isPresent()) - { - return version.get(); - } - - // Validate SSTable versions are present - ensureSSTableVersionsNotEmpty(sstableVersionsOnCluster); - - // Find highest Cassandra version based on SSTable versions - CassandraVersion highestCassandraVersion = findHighestCassandraVersion(sstableVersionsOnCluster); + // Every observed version must be mutually compatible (readable by the highest version present). 
+ ensureMutuallyCompatibleVersions(sstableVersionsOnCluster, highestCompatibleVersion(sstableVersionsOnCluster)); - // Check if highestCassandraVersion supports the requested format - boolean supportsRequestedFormat = highestCassandraVersion.getNativeSStableVersions() - .stream() - .anyMatch(v -> v.startsWith(requestedFormat + "-")); + CassandraVersion bridgeVersion = cassandraVersionsFromSSTableVersions(sstableVersionsOnCluster) + .min(Comparator.comparingInt(CassandraVersion::versionNumber)) + .orElseThrow(() -> new IllegalStateException("Unable to determine the lowest SSTable version")); - if (supportsRequestedFormat) - { - LOGGER.info("Determined bridge version: {} for write based on SSTable versions on the cluster: {}, " - + "requested SSTable format: '{}', lowest Cassandra version on the cluster: {}", - highestCassandraVersion.versionName(), sstableVersionsOnCluster, requestedFormat, cassandraVersion); - return highestCassandraVersion; - } - else + if (!bridgeVersion.sstableFormats().contains(requestedFormat)) { throw new UnsupportedOperationException(String.format( - "Cluster does not support requested SSTable format '%s'. " + - "Bridge version determined is %s, which only supports formats: %s", - requestedFormat, highestCassandraVersion.versionName(), - highestCassandraVersion.sstableFormats())); + "Cluster does not support requested SSTable format '%s'. Bridge version determined is %s, " + + "which only supports formats: %s", + requestedFormat, bridgeVersion.versionName(), bridgeVersion.sstableFormats())); } + return bridgeVersion; } /** - * Determines which CassandraVersion bridge to load for read operations based on: - * - Highest SSTable version detected on cluster + * Determines the bridge version for bulk read from the SSTable versions present on the cluster. + * Picks the highest Cassandra version present, which can read all (older) versions present. 
* - * @param sstableVersionsOnCluster Set of SSTable versions found on cluster nodes - * @param cassandraVersion Cassandra version string for legacy version based bridge selection - * @param isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination - * @return CassandraVersion enum indicating which bridge to load - * @throws IllegalStateException if SSTable versions are empty/unknown + * @param sstableVersionsOnCluster set of SSTable versions found on cluster nodes + * @return the CassandraVersion bridge to load for read + * @throws IllegalStateException if the versions are empty/null or contain an unknown version + * @throws UnsupportedOperationException if the versions are mutually incompatible */ - public static CassandraVersion determineBridgeVersionForRead(Set sstableVersionsOnCluster, - String cassandraVersion, - boolean isSSTableVersionBasedBridgeDisabled) + public static CassandraVersion determineBridgeVersionForRead(Set sstableVersionsOnCluster) { - // Use legacy cassandra.version based bridge selection when the user has opted in via configuration - Optional version = resolveLegacyVersionBasedBridge(cassandraVersion, isSSTableVersionBasedBridgeDisabled); - if (version.isPresent()) - { - return version.get(); - } - - // Validate SSTable versions are present - ensureSSTableVersionsNotEmpty(sstableVersionsOnCluster); - - // Find highest Cassandra version based on SSTable versions - CassandraVersion bridgeVersion = findHighestCassandraVersion(sstableVersionsOnCluster); - - LOGGER.info("Determined bridge version: {} for read based on SSTable versions on the cluster: {}, " - + "lowest Cassandra version on the cluster: {}", - bridgeVersion.versionName(), sstableVersionsOnCluster, cassandraVersion); - - return bridgeVersion; - } - - private static Optional resolveLegacyVersionBasedBridge(String cassandraVersion, - boolean isSSTableVersionBasedBridgeDisabled) - { - if (!isSSTableVersionBasedBridgeDisabled) - { - return 
Optional.empty(); - } - - LOGGER.info("SSTable version based bridge selection is disabled via configuration. " + - "Using cassandra.version for bridge selection: {}", cassandraVersion); - return Optional.of(CassandraVersion.fromVersion(cassandraVersion) - .orElseThrow(() -> new UnsupportedOperationException( - String.format("Unsupported Cassandra version: %s", cassandraVersion)))); + CassandraVersion highest = highestCompatibleVersion(sstableVersionsOnCluster); + ensureMutuallyCompatibleVersions(sstableVersionsOnCluster, highest); + return highest; } /** - * Ensures that SSTable versions from cluster are not null or empty. + * Verifies that every observed SSTable version is readable by the given highest version present; otherwise + * the cluster spans incompatible Cassandra majors and the job cannot proceed. * - * @param sstableVersionsOnCluster Set of SSTable versions to validate - * @throws IllegalStateException if versions are null or empty + * @param sstableVersionsOnCluster the observed SSTable versions + * @param highest the highest Cassandra version present (see {@link #highestCompatibleVersion(Set)}) + * @throws UnsupportedOperationException if any observed version cannot be read by {@code highest} */ - private static void ensureSSTableVersionsNotEmpty(Set sstableVersionsOnCluster) + private static void ensureMutuallyCompatibleVersions(Set sstableVersionsOnCluster, CassandraVersion highest) { - if (sstableVersionsOnCluster == null || sstableVersionsOnCluster.isEmpty()) + Set readable = highest.getSupportedSStableVersionsForRead(); + List incompatible = sstableVersionsOnCluster.stream() + .filter(version -> !readable.contains(version)) + .collect(Collectors.toList()); + if (!incompatible.isEmpty()) { - throw new IllegalStateException(String.format( - "Unable to retrieve SSTable versions from cluster. " + - "This is required for SSTable version based bridge selection. 
" + - "If you want to bypass this check and use cassandra.version for bridge selection, " + - "set %s=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + throw new UnsupportedOperationException(String.format( + "SSTable versions on the cluster are not mutually compatible: %s cannot be read by the highest " + + "version present (%s, which reads %s). Observed SSTable versions: %s", + incompatible, highest.versionName(), readable, sstableVersionsOnCluster)); } } /** - * Finds the highest Cassandra version based on SSTable versions found on cluster. + * Returns the highest Cassandra version among the observed SSTable versions. * - * @param sstableVersionsOnCluster Set of SSTable versions found on cluster - * @return CassandraVersion corresponding to the highest SSTable version - * @throws IllegalStateException if highest version is unknown + * @throws IllegalStateException if the versions are empty/null or contain an unknown version */ - private static CassandraVersion findHighestCassandraVersion(Set sstableVersionsOnCluster) + private static CassandraVersion highestCompatibleVersion(Set sstableVersionsOnCluster) { - String highestSSTableVersion = findHighestSSTableVersion(sstableVersionsOnCluster); - return CassandraVersion.fromSSTableVersion(highestSSTableVersion) - .orElseThrow(() -> new IllegalStateException( - String.format("Unknown SSTable version: %s. Cannot determine bridge version. " + - "SSTable versions on cluster: %s. " + - "To retry using cassandra.version based bridge selection, " + - "set %s=true", - highestSSTableVersion, sstableVersionsOnCluster, - BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE))); + return cassandraVersionsFromSSTableVersions(sstableVersionsOnCluster) + .max(Comparator.comparingInt(CassandraVersion::versionNumber)) + .orElseThrow(() -> new IllegalStateException("Unable to determine the highest SSTable version")); } - /** - * Finds the highest SSTable version from the set using CassandraVersion mappings. 
- * Ordering is based on CassandraVersion number (e.g., 5.0 > 4.0 > 3.0). - * Versions within the same CassandraVersion are considered equal. - * - * @param versions Set of SSTable version strings - * @return Highest SSTable version string - * @throws IllegalStateException if versions is empty, contains null values, or contains unknown versions - */ - public static String findHighestSSTableVersion(Set versions) + private static Stream cassandraVersionsFromSSTableVersions(Set sstableVersionsOnCluster) { - if (versions == null || versions.isEmpty()) - { - throw new IllegalStateException("SSTable versions set cannot be empty"); - } - - // The Comparator below is never invoked for a single-element set, so an unknown version - // would otherwise slip through. Validate the sole element here; multi-element sets are - // fully validated by the Comparator (every element participates in at least one comparison). - if (versions.size() == 1) + if (sstableVersionsOnCluster == null || sstableVersionsOnCluster.isEmpty()) { - String only = versions.iterator().next(); - if (!CassandraVersion.fromSSTableVersion(only).isPresent()) - { - throw new IllegalStateException( - String.format("Unknown SSTable version: %s. Cannot determine Cassandra version. " + - "To retry using cassandra.version based bridge selection, " + - "set %s=true", only, BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); - } + throw new IllegalStateException("Unable to determine Cassandra versions: no SSTable versions found on the cluster"); } - Comparator sstableVersionComparator = (v1, v2) -> { - // Find which CassandraVersion each SSTable version belongs to - Optional v1Opt = CassandraVersion.fromSSTableVersion(v1); - Optional v2Opt = CassandraVersion.fromSSTableVersion(v2); - - if (!v1Opt.isPresent() || !v2Opt.isPresent()) - { - String unknownVersion = !v1Opt.isPresent() ? v1 : v2; - throw new IllegalStateException( - String.format("Unknown SSTable version: %s. Cannot determine Cassandra version. 
" + - "To retry using cassandra.version based bridge selection, " + - "set %s=true", unknownVersion, BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); - } - - CassandraVersion cv1 = v1Opt.get(); - CassandraVersion cv2 = v2Opt.get(); - - // First, compare by CassandraVersion number - // FIVEZERO (50) > FOURONE (41) > FOURZERO (40) > THREEZERO (30) - int versionComparison = Integer.compare(cv1.versionNumber(), cv2.versionNumber()); - if (versionComparison != 0) - { - return versionComparison; - } - - // Same CassandraVersion - versions are considered equal - return 0; - }; + return sstableVersionsOnCluster.stream().map(SSTableVersionAnalyzer::toCassandraVersion); + } - return versions.stream() - .max(sstableVersionComparator) - .orElseThrow(() -> new IllegalStateException("Unable to find highest SSTable version")); + private static CassandraVersion toCassandraVersion(String sstableVersion) + { + return CassandraVersion.fromSSTableVersion(sstableVersion) + .orElseThrow(() -> new IllegalStateException("Unknown SSTable version: " + sstableVersion)); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java index 7645f2987..c7f72ef1a 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java @@ -33,7 +33,6 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; -import org.apache.cassandra.bridge.SSTableVersionAnalyzer; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; 
import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; @@ -100,24 +99,8 @@ protected AbstractBulkWriterContext(@NotNull BulkSparkConf conf, this.conf = conf; this.sparkDefaultParallelism = sparkDefaultParallelism; - // Retrieve lowest Cassandra version without building full ClusterInfo - String lowestCassandraVersion = getLowestCassandraVersion(conf); - Set sstableVersionsOnCluster = null; - - // Get SSTable versions from cluster only if SSTable version-based selection is enabled - // If disabled, skip retrieval to allow job to proceed even when SSTable version detection fails - if (!conf.isSSTableVersionBasedBridgeDisabled()) - { - sstableVersionsOnCluster = getSSTableVersionsOnCluster(conf); - } - - // Determine bridge version - this.bridgeVersion = SSTableVersionAnalyzer.determineBridgeVersionForWrite( - sstableVersionsOnCluster, - CassandraVersion.configuredSSTableFormat(), - lowestCassandraVersion, - conf.isSSTableVersionBasedBridgeDisabled() - ); + // Determine the bridge version + this.bridgeVersion = getBridgeVersion(); // Build cluster info with determined bridge version this.clusterInfo = buildClusterInfo(this.bridgeVersion); @@ -178,23 +161,15 @@ public CassandraVersion bridgeVersion() return bridgeVersion; } - /*--- Methods to build required fields ---*/ - /** - * Retrieves the lowest Cassandra version from the cluster(s). + * Determines the Cassandra bridge version to use for this write. Implementations apply any version + * override, SSTable-version-based selection, or the legacy cassandra.version fallback as appropriate. * - * @param conf Bulk Spark configuration - * @return lowest Cassandra version string + * @return the determined Cassandra bridge version */ - protected abstract String getLowestCassandraVersion(@NotNull BulkSparkConf conf); + protected abstract CassandraVersion getBridgeVersion(); - /** - * Retrieves SSTable versions from the cluster(s). 
- * - * @param conf Bulk Spark configuration - * @return set of SSTable version strings present on the cluster(s) - */ - protected abstract Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf); + /*--- Methods to build required fields ---*/ /** * Builds the ClusterInfo with the determined bridge version. diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java index 14ab2958b..8f7b8f517 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java @@ -19,7 +19,6 @@ package org.apache.cassandra.spark.bulkwriter; -import java.util.Set; import java.util.UUID; import com.google.common.base.Preconditions; @@ -66,15 +65,9 @@ protected CassandraBulkWriterContext(@NotNull BulkWriterConfig config) } @Override - protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) + protected CassandraVersion getBridgeVersion() { - return getOrCreatePreliminaryClusterInfo(conf).getLowestCassandraVersion(); - } - - @Override - protected Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) - { - return getOrCreatePreliminaryClusterInfo(conf).getSSTableVersionsOnCluster(); + return getOrCreatePreliminaryClusterInfo(bulkSparkConf()).getBridgeVersion(); } @Override diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java index 419800fe8..792627728 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java +++ 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java @@ -49,6 +49,7 @@ import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.bridge.CassandraVersionFeatures; +import org.apache.cassandra.bridge.SSTableVersionAnalyzer; import org.apache.cassandra.clients.Sidecar; import o.a.c.sidecar.client.shaded.client.SidecarInstance; import o.a.c.sidecar.client.shaded.client.SidecarInstanceImpl; @@ -476,21 +477,47 @@ public String getVersionFromFeature() } /** - * Retrieves the lowest Cassandra version using the already-fired allNodeSettingFutures. - * Reuses the existing CassandraContext instead of creating a separate one. - * If a version override is provided via {@link #getVersionFromFeature()}, that value is used instead. + * Determines the Cassandra bridge version for this cluster, in priority order: + *
<ol> + * <li>an explicit version override from {@link #getVersionFromFeature()} (an operator escape hatch) — + * takes precedence even when SSTable-version-based selection is enabled;</li>
+ * <li>otherwise, when SSTable version-based selection is enabled, the lowest version derived from the + * SSTable versions present, so the produced SSTables remain importable by every node;</li>
+ * <li>otherwise (feature disabled), the lowest Cassandra release version reported by the cluster.</li>
+ * </ol>
* - * @return lowest Cassandra version string + * @return the determined Cassandra bridge version */ - public String getLowestCassandraVersion() + public CassandraVersion getBridgeVersion() { - String versionFromFeature = getVersionFromFeature(); - if (versionFromFeature != null) + String versionOverride = getVersionFromFeature(); + if (versionOverride != null) { - // Forcing writer to use a particular version - return versionFromFeature; + return CassandraVersion.fromVersion(versionOverride) + .orElseThrow(() -> new UnsupportedOperationException( + "Unsupported Cassandra version override: " + versionOverride)); } + if (!conf.isSSTableVersionBasedBridgeDisabled()) + { + return SSTableVersionAnalyzer.determineBridgeVersionForWrite(getSSTableVersionsOnCluster(), + CassandraVersion.configuredSSTableFormat()); + } + + String lowestCassandraVersion = getLowestCassandraVersion(); + return CassandraVersion.fromVersion(lowestCassandraVersion) + .orElseThrow(() -> new UnsupportedOperationException( + "Unsupported Cassandra version: " + lowestCassandraVersion)); + } + + /** + * Retrieves the lowest Cassandra version reported by the cluster, using the already-fired + * allNodeSettingFutures. Reuses the existing CassandraContext instead of creating a separate one. 
+ * + * @return lowest Cassandra version string + */ + public String getLowestCassandraVersion() + { List allNodeSettings = resolveAllNodeSettings(); NodeSettings ns = allNodeSettings diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java index 306e4634d..4245e7a8b 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -22,9 +22,9 @@ import java.math.BigInteger; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.EnumSet; import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -41,7 +41,6 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.bridge.CassandraVersion; -import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.bulkwriter.CassandraClusterInfo; import org.apache.cassandra.spark.bulkwriter.CassandraContext; @@ -383,54 +382,27 @@ public void setBridgeVersion(CassandraVersion bridgeVersion) } /** - * Retrieves the lowest Cassandra version from all contained clusters. + * Determines the Cassandra bridge version for a coordinated write across all clusters. * - * @return lowest Cassandra version string across all clusters + *

<p>Each cluster determines its own bridge version (see {@link CassandraClusterInfo#getBridgeVersion()}), + * which is the lowest mutually-compatible SSTable version present on that cluster. Across clusters the + * lowest of those is chosen so the produced SSTables remain importable by every cluster (a node can import + * its own and older SSTable versions, but not newer ones). + * + * @return the determined Cassandra bridge version */ - public String getLowestCassandraVersion() + public CassandraVersion getBridgeVersion() { - // Single cluster: return its version directly without aggregation + // Single cluster: use its bridge version directly without aggregation if (clusterInfos.size() == 1) { - return ((CassandraClusterInfo) clusterInfos.get(0)).getLowestCassandraVersion(); - } - - Map clusterVersions = new HashMap<>(); - for (ClusterInfo ci : clusterInfos) - { - CassandraClusterInfo cci = (CassandraClusterInfo) ci; - clusterVersions.put(ci.clusterId(), cci.getLowestCassandraVersion()); + return ((CassandraClusterInfo) clusterInfos.get(0)).getBridgeVersion(); } - // Find the lowest version across all clusters - List versions = clusterVersions.values() - .stream() - .map(CassandraVersionFeatures::cassandraVersionFeaturesFromCassandraVersion) - .sorted() - .collect(Collectors.toList()); - - CassandraVersionFeatures first = versions.get(0); - CassandraVersionFeatures last = versions.get(versions.size() - 1); - Preconditions.checkState(first.getMajorVersion() == last.getMajorVersion(), - "Cluster versions are not compatible. lowest=%s and highest=%s", - first.getRawVersionString(), last.getRawVersionString()); - - return first.getRawVersionString(); - } - - /** - * Retrieves aggregated SSTable versions from all contained clusters.
- * - * @return set of SSTable versions present across all clusters - */ - public Set getSSTableVersionsOnCluster() - { - Set aggregatedSSTableVersions = new HashSet<>(); - for (ClusterInfo ci : clusterInfos) - { - CassandraClusterInfo cci = (CassandraClusterInfo) ci; - aggregatedSSTableVersions.addAll(cci.getSSTableVersionsOnCluster()); - } - return aggregatedSSTableVersions; + // Write at the lowest so every cluster can import the produced SSTables + return clusterInfos.stream() + .map(ci -> ((CassandraClusterInfo) ci).getBridgeVersion()) + .min(Comparator.comparingInt(CassandraVersion::versionNumber)) + .orElseThrow(() -> new IllegalStateException("No cluster bridge versions available")); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java index a94eb1643..eb101c699 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java @@ -19,7 +19,6 @@ package org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated; -import java.util.Set; import java.util.UUID; import com.google.common.base.Preconditions; @@ -86,15 +85,9 @@ private void validateConfiguration(BulkSparkConf conf) } @Override - protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) + protected CassandraVersion getBridgeVersion() { - return getOrCreatePreliminaryGroup(conf).getLowestCassandraVersion(); - } - - @Override - protected Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) - { - return getOrCreatePreliminaryGroup(conf).getSSTableVersionsOnCluster(); + return 
getOrCreatePreliminaryGroup(bulkSparkConf()).getBridgeVersion(); } @Override diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java index bcc9d39af..63c9b9a7d 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java @@ -357,32 +357,38 @@ protected CassandraVersion initializeSSTableVersionsAndBridgeVersion(String cass // Check if user has explicitly disabled SSTable version-based selection via Spark configuration boolean isSSTableVersionBasedBridgeDisabled = isSSTableVersionBasedBridgeDisabled(); - // Get SSTable versions from cluster only if SSTable version-based selection is enabled - // If disabled, use an empty set (never null) so downstream code - including executor-side - // validation and serialization - needs no null handling. On executors an empty set is the - // signal that the feature was disabled on the driver. + CassandraVersion bridgeVersion; if (isSSTableVersionBasedBridgeDisabled) { - //Use a HashSet, because Kryo serializer reads back via kryo.readObject(in, HashSet.class) + // Disabled: skip cluster SSTable-version retrieval and fall back to the configured cassandra.version. + // Use an empty set (never null) so downstream code - including executor-side validation and + // serialization - needs no null handling; on executors an empty set signals the feature was disabled + // on the driver. HashSet specifically, because Kryo reads this field back via kryo.readObject(in, HashSet.class). 
this.sstableVersionsOnCluster = new HashSet<>(); + bridgeVersion = CassandraVersion.fromVersion(cassandraVersion) + .orElseThrow(() -> new UnsupportedOperationException( + "Unsupported Cassandra version: " + cassandraVersion)); } else { + Set retrieved = retrieveSSTableVersionsFromCluster(); + // Fail fast with an actionable hint when no SSTable versions could be retrieved while the + // feature is enabled (in disabled mode this branch is not taken, so empty here is unexpected). + if (retrieved.isEmpty()) + { + throw new IllegalStateException(String.format( + "Unable to retrieve SSTable versions from cluster. This is required for SSTable " + + "version-based bridge selection. To bypass this check and use cassandra.version for " + + "bridge selection, set %s=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + } // Wrap in a HashSet: retrieveSSTableVersionsFromCluster() may return Collections.emptySet() // or an unspecified Set type from Collectors.toSet(), but Kryo reads this field back via // kryo.readObject(in, HashSet.class), so the concrete type must be HashSet. - this.sstableVersionsOnCluster = new HashSet<>(retrieveSSTableVersionsFromCluster()); + this.sstableVersionsOnCluster = new HashSet<>(retrieved); + // Pick the highest (mutually-compatible) version present on the cluster. + bridgeVersion = SSTableVersionAnalyzer.determineBridgeVersionForRead(sstableVersionsOnCluster); } - // Determine bridge version - CassandraVersion bridgeVersion = SSTableVersionAnalyzer.determineBridgeVersionForRead( - sstableVersionsOnCluster, - cassandraVersion, - isSSTableVersionBasedBridgeDisabled); - - // Validate SSTable versions - validateSStableVersions(this.sstableVersionsOnCluster, bridgeVersion, isSSTableVersionBasedBridgeDisabled); - return bridgeVersion; } @@ -930,67 +936,6 @@ private static String readNullable(ObjectInputStream in) throws IOException return null; } - /** - * Validates that all SSTable versions are supported for the given Cassandra version. 
- * This validation runs on the Spark driver. - * - * @param sstableVersionsOnCluster set of SSTable versions across all nodes in the cluster - * @param cassandraVersion the Cassandra version - * @param isSSTableVersionBasedBridgeDisabled flag to disable sstable version based bridge determination - * @throws UnsupportedOperationException if any unsupported SSTable version is detected - */ - @VisibleForTesting - void validateSStableVersions(Set sstableVersionsOnCluster, - CassandraVersion cassandraVersion, - boolean isSSTableVersionBasedBridgeDisabled) - { - // Skip validation when legacy version-based bridge selection is enabled - if (isSSTableVersionBasedBridgeDisabled) - { - LOGGER.debug("Skipping SSTable version validation on driver - legacy version-based bridge selection enabled via {}=true", - BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); - return; - } - - if (sstableVersionsOnCluster == null || sstableVersionsOnCluster.isEmpty()) - { - // Fail fast with helpful error message - throw new IllegalStateException(String.format( - "Unable to retrieve SSTable versions from cluster. " + - "This is required for SSTable version-based bridge selection. " + - "If you want to bypass this check and use cassandra.version for bridge selection, " + - "set %s=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); - } - - // Get SSTable versions that can be read by this Cassandra version - Set supportedVersions = cassandraVersion.getSupportedSStableVersionsForRead(); - - // Find any unsupported versions - Set unsupportedVersions = sstableVersionsOnCluster.stream() - .filter(version -> !supportedVersions.contains(version)) - .collect(Collectors.toSet()); - - if (!unsupportedVersions.isEmpty()) - { - String errorMessage = String.format( - "Detected unsupported SSTable version(s) %s for Cassandra version %s. " + - "Supported versions: %s. " + - "Observed SSTable versions in the cluster: %s. 
" + - "To retry using cassandra.version based bridge selection, " + - "set %s=true", - unsupportedVersions, - cassandraVersion, - supportedVersions, - sstableVersionsOnCluster, - BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); - LOGGER.error(errorMessage); - throw new UnsupportedOperationException(errorMessage); - } - - LOGGER.debug("SSTable version validation successful. All observed versions {} are supported for Cassandra version {}", - sstableVersionsOnCluster, cassandraVersion); - } - /** * Validates that all SSTables being read have versions that were observed in gossip info. * This catches cases where SSTables have unexpected versions that weren't seen during driver initialization. diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java index 1ac88d138..068a68378 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java @@ -38,149 +38,99 @@ */ public class SSTableVersionAnalyzerTest { - // --- determineBridgeVersionForWrite success cases (parameterized) --- + // --- determineBridgeVersionForWrite: picks the LOWEST mutually-compatible version --- - static Stream writeLegacyDisabledSuccessCases() + static Stream writeLowestVersionCases() { return Stream.of( - Arguments.of(Collections.singleton("big-oa"), "big", "5.0.0", CassandraVersion.FIVEZERO), - Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-nb")), "big", "4.0.0", CassandraVersion.FOURZERO), - Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-oa")), "big", "5.0.0", CassandraVersion.FIVEZERO) + Arguments.of(Collections.singleton("big-oa"), "big", CassandraVersion.FIVEZERO), + Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-nb")), "big", CassandraVersion.FOURZERO), + // 
mixed 4.0 + 5.0 sstables: lowest that every node can import is 4.0 + Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-oa")), "big", CassandraVersion.FOURZERO), + Arguments.of(Collections.singleton("bti-da"), "bti", CassandraVersion.FIVEZERO) ); } @ParameterizedTest - @MethodSource("writeLegacyDisabledSuccessCases") - void testDetermineBridgeVersionForWriteLegacyDisabled(Set versions, - String format, - String cassandraVersion, - CassandraVersion expected) + @MethodSource("writeLowestVersionCases") + void testDetermineBridgeVersionForWritePicksLowest(Set versions, String format, CassandraVersion expected) { - CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForWrite( - versions, format, cassandraVersion, false - ); - assertThat(result).isEqualTo(expected); - } - - // --- determineBridgeVersionForWrite null/empty exception cases (parameterized) --- - - static Stream writeNullOrEmptyVersionsCases() - { - return Stream.of( - Arguments.of(Collections.emptySet()), - Arguments.of((Set) null) - ); - } - - @ParameterizedTest - @MethodSource("writeNullOrEmptyVersionsCases") - void testDetermineBridgeVersionForWriteNullOrEmptyThrowsException(Set versions) - { - assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite( - versions, "big", "5.0.0", false - )).isInstanceOf(IllegalStateException.class) - .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); + assertThat(SSTableVersionAnalyzer.determineBridgeVersionForWrite(versions, format)).isEqualTo(expected); } - // --- determineBridgeVersionForWrite standalone tests --- - @Test - void testDetermineBridgeVersionForWriteLegacyEnabled() + void testDetermineBridgeVersionForWriteRejectsUnsupportedFormat() { - CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForWrite( - null, "big", "5.0.0", true - ); - assertThat(result).isEqualTo(CassandraVersion.FIVEZERO); + // lowest version is FOURZERO, which does not support bti + 
assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite( + new HashSet<>(Arrays.asList("big-na", "big-oa")), "bti")) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("does not support requested SSTable format 'bti'"); } @Test - void testDetermineBridgeVersionForWriteUnsupportedFormat() + void testDetermineBridgeVersionForWriteRejectsIncompatibleVersions() { - Set sstableVersions = Collections.singleton("big-na"); + // big-mf (3.0) cannot be read by the highest version present (5.0) assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite( - sstableVersions, "bti", "4.0.0", false - )).isInstanceOf(UnsupportedOperationException.class) - .hasMessageContaining("Cluster does not support requested SSTable format 'bti'"); + new HashSet<>(Arrays.asList("big-mf", "big-oa")), "big")) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("not mutually compatible"); } - // --- determineBridgeVersionForRead standalone tests --- - - @Test - void testDetermineBridgeVersionForReadLegacyDisabled() + static Stream nullOrEmptyCases() { - Set sstableVersions = Collections.singleton("big-oa"); - CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForRead( - sstableVersions, "5.0.0", false - ); - assertThat(result).isEqualTo(CassandraVersion.FIVEZERO); + return Stream.of(Arguments.of(Collections.emptySet()), Arguments.of((Set) null)); } - @Test - void testDetermineBridgeVersionForReadLegacyEnabled() + @ParameterizedTest + @MethodSource("nullOrEmptyCases") + void testDetermineBridgeVersionForWriteNullOrEmptyThrows(Set versions) { - CassandraVersion result = SSTableVersionAnalyzer.determineBridgeVersionForRead( - null, "4.0.0", true - ); - assertThat(result).isEqualTo(CassandraVersion.FOURZERO); + assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite(versions, "big")) + .isInstanceOf(IllegalStateException.class); } @Test - void 
testDetermineBridgeVersionForReadEmptyVersionsThrowsException() + void testDetermineBridgeVersionForWriteUnknownVersionThrows() { - assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForRead( - Collections.emptySet(), "5.0.0", false - )).isInstanceOf(IllegalStateException.class) - .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); + assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite(Collections.singleton("unknown-xx"), "big")) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unknown SSTable version"); } - // --- findHighestSSTableVersion success cases (parameterized) --- + // --- determineBridgeVersionForRead: picks the HIGHEST mutually-compatible version --- - static Stream findHighestSuccessCases() + static Stream readHighestVersionCases() { return Stream.of( - Arguments.of(Collections.singleton("big-na"), CassandraVersion.FOURZERO), + Arguments.of(Collections.singleton("big-oa"), CassandraVersion.FIVEZERO), Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-nb")), CassandraVersion.FOURZERO), - Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-oa")), CassandraVersion.FIVEZERO), - Arguments.of(new HashSet<>(Arrays.asList("big-oa", "bti-da")), CassandraVersion.FIVEZERO) + Arguments.of(new HashSet<>(Arrays.asList("big-na", "big-oa")), CassandraVersion.FIVEZERO) ); } @ParameterizedTest - @MethodSource("findHighestSuccessCases") - void testFindHighestSSTableVersion(Set versions, CassandraVersion expectedCassandraVersion) + @MethodSource("readHighestVersionCases") + void testDetermineBridgeVersionForReadPicksHighest(Set versions, CassandraVersion expected) { - String result = SSTableVersionAnalyzer.findHighestSSTableVersion(versions); - assertThat(CassandraVersion.fromSSTableVersion(result)).hasValue(expectedCassandraVersion); + assertThat(SSTableVersionAnalyzer.determineBridgeVersionForRead(versions)).isEqualTo(expected); } - // --- findHighestSSTableVersion 
null/empty exception cases (parameterized) --- - - static Stream findHighestNullOrEmptyCases() + @Test + void testDetermineBridgeVersionForReadRejectsIncompatibleVersions() { - return Stream.of( - Arguments.of(Collections.emptySet()), - Arguments.of((Set) null) - ); + assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForRead( + new HashSet<>(Arrays.asList("big-mf", "big-oa")))) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("not mutually compatible"); } @ParameterizedTest - @MethodSource("findHighestNullOrEmptyCases") - void testFindHighestSSTableVersionNullOrEmptyThrowsException(Set versions) - { - assertThatThrownBy(() -> SSTableVersionAnalyzer.findHighestSSTableVersion(versions)) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("SSTable versions set cannot be empty"); - } - - // --- findHighestSSTableVersion standalone test --- - - @Test - void testFindHighestSSTableVersionUnknownVersionThrowsException() + @MethodSource("nullOrEmptyCases") + void testDetermineBridgeVersionForReadNullOrEmptyThrows(Set versions) { - Set versions = new HashSet<>(Arrays.asList("unknown-xx", "unknown-yy")); - assertThatThrownBy(() -> SSTableVersionAnalyzer.findHighestSSTableVersion(versions)) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("Unknown SSTable version:") - .hasMessageContaining("disable_sstable_version_based=true"); + assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForRead(versions)) + .isInstanceOf(IllegalStateException.class); } } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java index 70b416149..f9f36cd6f 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java +++ 
b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java @@ -19,8 +19,6 @@ package org.apache.cassandra.spark.bulkwriter; -import java.util.Collections; -import java.util.Set; import java.util.UUID; import org.junit.jupiter.api.Test; @@ -32,7 +30,6 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; class AbstractBulkWriterContextTest { @@ -48,49 +45,25 @@ void testKryoRegistrationWarningMessage() } @Test - void testSSTableVersionBasedBridgeDisabled() + void testContextUsesDeterminedBridgeVersion() { + // The context's bridge version is whatever getBridgeVersion() resolves to. The determination logic + // itself (override / SSTable-version-based / legacy) lives on CassandraClusterInfo and is covered by + // CassandraClusterInfoBridgeVersionTest. BulkSparkConf conf = mock(BulkSparkConf.class); - when(conf.isSSTableVersionBasedBridgeDisabled()).thenReturn(true); - StructType schema = mock(StructType.class); - TestBulkWriterContext context = TestBulkWriterContext.create(conf, schema, 1, "4.0.0", null); - - // Verify bridge version was still determined - assertThat(context.bridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); - - // Verify that SSTable versions retrieval was not called (disabled) - assertThat(TestBulkWriterContext.sstableVersionRetrievalCount).isEqualTo(0); - assertThat(TestBulkWriterContext.versionRetrievalCount).isEqualTo(1); - } - - @Test - void testSSTableVersionBasedBridgeEnabled() - { - BulkSparkConf conf = mock(BulkSparkConf.class); - when(conf.isSSTableVersionBasedBridgeDisabled()).thenReturn(false); + TestBulkWriterContext context = TestBulkWriterContext.create(conf, schema, 1, CassandraVersion.FIVEZERO); - Set sstableVersions = Collections.singleton("big-oa"); - StructType schema = mock(StructType.class); - TestBulkWriterContext context = TestBulkWriterContext.create(conf, schema, 1, "5.0.0", 
sstableVersions); - - // Verify bridge version was determined assertThat(context.bridgeVersion()).isEqualTo(CassandraVersion.FIVEZERO); - - // Verify that both version and SSTable versions retrieval were called - assertThat(TestBulkWriterContext.versionRetrievalCount).isEqualTo(1); - assertThat(TestBulkWriterContext.sstableVersionRetrievalCount).isEqualTo(1); } /** - * Concrete test implementation of AbstractBulkWriterContext for testing + * Concrete test implementation of AbstractBulkWriterContext for testing. The bridge version is supplied + * directly (the real determination is exercised at the ClusterInfo level). */ static class TestBulkWriterContext extends AbstractBulkWriterContext { - private static String staticLowestVersion; - private static Set staticSSTableVersions; - static int versionRetrievalCount = 0; - static int sstableVersionRetrievalCount = 0; + private static CassandraVersion staticBridgeVersion; private TestBulkWriterContext(@NotNull BulkSparkConf conf, @NotNull StructType structType, @@ -102,28 +75,16 @@ private TestBulkWriterContext(@NotNull BulkSparkConf conf, static TestBulkWriterContext create(@NotNull BulkSparkConf conf, @NotNull StructType structType, int sparkDefaultParallelism, - @NotNull String lowestVersion, - Set sstableVersions) + CassandraVersion bridgeVersion) { - staticLowestVersion = lowestVersion; - staticSSTableVersions = sstableVersions; - versionRetrievalCount = 0; - sstableVersionRetrievalCount = 0; + staticBridgeVersion = bridgeVersion; return new TestBulkWriterContext(conf, structType, sparkDefaultParallelism); } @Override - protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) - { - versionRetrievalCount++; - return staticLowestVersion; - } - - @Override - protected Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) + protected CassandraVersion getBridgeVersion() { - sstableVersionRetrievalCount++; - return staticSSTableVersions; + return staticBridgeVersion; } @Override diff --git 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java index 15cf046d4..c57a4a265 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java @@ -129,13 +129,7 @@ protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) } @Override - protected String getLowestCassandraVersion(@NotNull BulkSparkConf conf) - { - throw new UnsupportedOperationException("Driver-only"); - } - - @Override - protected java.util.Set getSSTableVersionsOnCluster(@NotNull BulkSparkConf conf) + protected CassandraVersion getBridgeVersion() { throw new UnsupportedOperationException("Driver-only"); } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java new file mode 100644 index 000000000..f7c873d39 --- /dev/null +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.spark.bulkwriter; + +import java.util.Collections; +import java.util.Set; + +import org.junit.jupiter.api.Test; + +import o.a.c.sidecar.client.shaded.client.SidecarClient; +import o.a.c.sidecar.client.shaded.client.SidecarInstance; +import org.apache.cassandra.bridge.CassandraVersion; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * Unit tests for {@link CassandraClusterInfo#getBridgeVersion()} — the bridge-version priority chain: + * version override (operator escape hatch) > SSTable-version-based selection > legacy cassandra.version. + * + *

No mocking framework is used for the cluster connectivity (an in-memory {@link CassandraContext} with no + * Sidecar client); only the {@link BulkSparkConf} feature flag is stubbed. The cluster-derived inputs + * (feature override, SSTable versions, lowest version) are supplied via a test subclass that also records + * whether each was consulted. + */ +public class CassandraClusterInfoBridgeVersionTest +{ + @Test + void testOverrideWinsWhenFeatureEnabled() + { + TestClusterInfo info = new TestClusterInfo(conf(false), "4.0.0", Collections.singleton("big-oa"), "5.0.0"); + + assertThat(info.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + // override short-circuits: neither SSTable versions nor the lowest version are consulted + assertThat(info.sstableVersionsCalls).isEqualTo(0); + assertThat(info.lowestVersionCalls).isEqualTo(0); + } + + @Test + void testOverrideWinsWhenFeatureDisabled() + { + TestClusterInfo info = new TestClusterInfo(conf(true), "4.0.0", Collections.singleton("big-oa"), "5.0.0"); + + assertThat(info.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + assertThat(info.sstableVersionsCalls).isEqualTo(0); + assertThat(info.lowestVersionCalls).isEqualTo(0); + } + + @Test + void testSSTableVersionBasedWhenEnabledAndNoOverride() + { + // SSTable-version-based selection picks the lowest version whose SSTables every node can import. + // The viable versions depend on the configured write format: bti is a 5.0-only format, so a bti job + // resolves to 5.0; a big job with mixed 4.0 + 5.0 SSTables writes at the lowest importable (4.0). + boolean bti = "bti".equals(CassandraVersion.configuredSSTableFormat()); + Set sstableVersions = bti + ? Collections.singleton("bti-da") + : new java.util.HashSet<>(java.util.Arrays.asList("big-na", "big-oa")); + CassandraVersion expected = bti ? 
CassandraVersion.FIVEZERO : CassandraVersion.FOURZERO; + TestClusterInfo info = new TestClusterInfo(conf(false), null, sstableVersions, "5.0.0"); + + assertThat(info.getBridgeVersion()).isEqualTo(expected); + assertThat(info.sstableVersionsCalls).isEqualTo(1); + // SSTable-based selection does not consult the legacy lowest version + assertThat(info.lowestVersionCalls).isEqualTo(0); + } + + @Test + void testLegacyVersionWhenDisabledAndNoOverride() + { + TestClusterInfo info = new TestClusterInfo(conf(true), null, Collections.singleton("big-oa"), "5.0.0"); + + assertThat(info.getBridgeVersion()).isEqualTo(CassandraVersion.FIVEZERO); + assertThat(info.lowestVersionCalls).isEqualTo(1); + // legacy path does not consult SSTable versions + assertThat(info.sstableVersionsCalls).isEqualTo(0); + } + + private static BulkSparkConf conf(boolean sstableVersionBasedBridgeDisabled) + { + BulkSparkConf conf = mock(BulkSparkConf.class); + when(conf.isSSTableVersionBasedBridgeDisabled()).thenReturn(sstableVersionBasedBridgeDisabled); + return conf; + } + + /** + * {@link CassandraClusterInfo} backed by an in-memory {@link CassandraContext} (no Sidecar), with the + * cluster-derived determination inputs supplied directly and call-counting so tests can assert which + * branch of the priority chain was taken. 
+ */ + private static final class TestClusterInfo extends CassandraClusterInfo + { + private final String versionFromFeature; + private final Set sstableVersions; + private final String lowestVersion; + private int sstableVersionsCalls = 0; + private int lowestVersionCalls = 0; + + TestClusterInfo(BulkSparkConf conf, String versionFromFeature, Set sstableVersions, String lowestVersion) + { + // bridgeVersion arg is unused by getBridgeVersion(); pass any value + super(conf, CassandraVersion.FIVEZERO); + this.versionFromFeature = versionFromFeature; + this.sstableVersions = sstableVersions; + this.lowestVersion = lowestVersion; + } + + @Override + protected CassandraContext buildCassandraContext() + { + return new InMemoryCassandraContext(); + } + + @Override + public String getVersionFromFeature() + { + return versionFromFeature; + } + + @Override + public Set getSSTableVersionsOnCluster() + { + sstableVersionsCalls++; + return sstableVersions; + } + + @Override + public String getLowestCassandraVersion() + { + lowestVersionCalls++; + return lowestVersion; + } + } + + /** + * A {@link CassandraContext} with no real Sidecar connectivity: an empty cluster and a null client. + * {@code Sidecar.allNodeSettings} is invoked with an empty instance set during construction, so the + * null client is never dereferenced. 
+ */ + private static final class InMemoryCassandraContext extends CassandraContext + { + InMemoryCassandraContext() + { + super(null, null); + } + + @Override + protected Set createClusterConfig() + { + return Collections.emptySet(); + } + + @Override + protected SidecarClient initializeSidecarClient(BulkSparkConf conf) + { + return null; + } + } +} diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java deleted file mode 100644 index 69a25f5b9..000000000 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/VersionFeatureLegacyBridgeTest.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.cassandra.spark.bulkwriter; - -import java.util.Collections; -import java.util.Set; - -import org.junit.jupiter.api.Test; - -import o.a.c.sidecar.client.shaded.client.SidecarClient; -import o.a.c.sidecar.client.shaded.client.SidecarInstance; -import org.apache.cassandra.bridge.CassandraVersion; -import org.apache.cassandra.bridge.SSTableVersionAnalyzer; - -import static org.assertj.core.api.Assertions.assertThat; - -/** - * Unit test that verifies the {@link CassandraClusterInfo#getVersionFromFeature()} override is honored in the - * legacy version-based bridge selection path (when SSTable version-based bridge selection is disabled). - * - *

{@link CassandraClusterInfo#getLowestCassandraVersion()} returns the override when it is non-null, - * short-circuiting any Sidecar/node-settings lookup. In legacy mode that value is what - * {@link SSTableVersionAnalyzer#determineBridgeVersionForWrite} uses to select the bridge - * (see {@code SSTableVersionAnalyzerTest} for the full version-to-bridge coverage). - * - *

The override is a subclass extension hook rather than a config flag, so it is exercised here via a - * test subclass. No mocking framework is used: the cluster info is backed by a lightweight in-memory - * {@link CassandraContext} (empty cluster, no Sidecar client). - */ -public class VersionFeatureLegacyBridgeTest -{ - @Test - void testFeatureOverrideDrivesLowestCassandraVersionWithoutQueryingCluster() - { - // The in-memory context has no nodes; if the override were ignored, getLowestCassandraVersion() would - // attempt to resolve node settings and fail, so returning the override proves it short-circuits. - CassandraClusterInfo clusterInfo = new FeatureOverrideClusterInfo("4.0.0"); - - assertThat(clusterInfo.getLowestCassandraVersion()) - .describedAs("getVersionFromFeature() override must take precedence over cluster-derived version") - .isEqualTo("4.0.0"); - } - - @Test - void testFeatureOverrideSelectsLegacyBridge() - { - // bridgeVersion field is intentionally FIVEZERO (see the subclass); the legacy bridge must instead - // come from the feature override (4.0.0 -> FOURZERO), proving the override is what drives selection. - CassandraClusterInfo clusterInfo = new FeatureOverrideClusterInfo("4.0.0"); - - // Mirror AbstractBulkWriterContext's legacy flow: sstableVersionsOnCluster is null when disabled, - // and the bridge is derived solely from getLowestCassandraVersion(). - CassandraVersion bridge = SSTableVersionAnalyzer.determineBridgeVersionForWrite( - null, "big", clusterInfo.getLowestCassandraVersion(), /* isSSTableVersionBasedBridgeDisabled */ true); - - assertThat(bridge) - .describedAs("In legacy mode the bridge must come from the feature-overridden version") - .isEqualTo(CassandraVersion.FOURZERO); - } - - /** - * {@link CassandraClusterInfo} that supplies a fixed {@link #getVersionFromFeature()} value and is backed - * by an in-memory {@link CassandraContext} so no real cluster connectivity is required. 
- */ - private static final class FeatureOverrideClusterInfo extends CassandraClusterInfo - { - private final String featureVersion; - - FeatureOverrideClusterInfo(String featureVersion) - { - // bridgeVersion deliberately differs from the override version under test - super((BulkSparkConf) null, CassandraVersion.FIVEZERO); - this.featureVersion = featureVersion; - } - - @Override - protected CassandraContext buildCassandraContext() - { - return new InMemoryCassandraContext(); - } - - @Override - public String getVersionFromFeature() - { - return featureVersion; - } - } - - /** - * A {@link CassandraContext} with no real Sidecar connectivity: an empty cluster and a null client. - * {@code Sidecar.allNodeSettings} is invoked with an empty instance set during construction, so the - * null client is never dereferenced. - */ - private static final class InMemoryCassandraContext extends CassandraContext - { - InMemoryCassandraContext() - { - super(null, null); - } - - @Override - protected Set createClusterConfig() - { - return Collections.emptySet(); - } - - @Override - protected SidecarClient initializeSidecarClient(BulkSparkConf conf) - { - return null; - } - } -} diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java index 796e6f098..f5d14575a 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java @@ -34,6 +34,7 @@ import org.junit.jupiter.api.Test; import o.a.c.sidecar.client.shaded.common.response.TokenRangeReplicasResponse; +import org.apache.cassandra.bridge.CassandraVersion; import 
org.apache.cassandra.spark.bulkwriter.BroadcastableClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.bulkwriter.CassandraClusterInfo; @@ -287,6 +288,53 @@ void testSerDeser() assertThat(clusterCount[0]).isEqualTo(2); } + @Test + void testGetBridgeVersionSingleClusterDelegates() + { + CassandraClusterInfoGroup group = mockClusterGroup(1, index -> { + CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); + when(clusterInfo.getBridgeVersion()).thenReturn(CassandraVersion.FIVEZERO); + return clusterInfo; + }); + assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FIVEZERO); + } + + @Test + void testGetBridgeVersionMultiClusterSameVersionReturnsIt() + { + // All clusters resolve to the same version -> compatible; the (equal) lowest is returned + CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { + CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); + when(clusterInfo.getBridgeVersion()).thenReturn(CassandraVersion.FOURZERO); + return clusterInfo; + }); + assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + } + + @Test + void testGetBridgeVersionMultiClusterAcrossMajorsReturnsLowest() + { + // 4.0 and 5.0 clusters -> write at the lowest (4.0) so the produced SSTables are importable by both + CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { + CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); + when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? 
CassandraVersion.FOURZERO : CassandraVersion.FIVEZERO); + return clusterInfo; + }); + assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + } + + @Test + void testGetBridgeVersionFourZeroAndFourOneReturnsLowest() + { + // 4.0 and 4.1 clusters -> write at the lowest (4.0) + CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { + CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); + when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? CassandraVersion.FOURZERO : CassandraVersion.FOURONE); + return clusterInfo; + }); + assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + } + private CassandraClusterInfoGroup mockClusterGroup(int size, Function clusterInfoCreator) { diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java index 41f5b73c8..23a6c83e3 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java @@ -44,106 +44,6 @@ */ public class CassandraDataLayerValidationTest { - @Test - void testValidateSStableVersionsWithAllSupportedVersions() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - Set sstableVersions = new HashSet<>(Arrays.asList("big-na", "big-nb")); - - assertThatNoException() - .describedAs("All versions are supported by FOURZERO") - .isThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FOURZERO, false)); - } - - @Test - void testValidateSStableVersionsWithUnsupportedVersion() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - // C* 4.0 cannot read C* 5.0 SSTable versions - Set sstableVersions = new HashSet<>(Arrays.asList("big-na", "big-oa")); - - assertThatThrownBy(() 
-> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FOURZERO, false)) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessageContaining("Detected unsupported SSTable version(s)") - .hasMessageContaining("big-oa") - .hasMessageContaining("FOURZERO") - .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); - } - - @Test - void testValidateSStableVersionsWithNullVersionsThrowsException() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - - assertThatThrownBy(() -> dataLayer.validateSStableVersions(null, CassandraVersion.FOURZERO, false)) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); - } - - @Test - void testValidateSStableVersionsWithEmptyVersionsThrowsException() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - - assertThatThrownBy(() -> dataLayer.validateSStableVersions(Collections.emptySet(), CassandraVersion.FOURZERO, false)) - .isInstanceOf(IllegalStateException.class) - .hasMessageContaining("Unable to retrieve SSTable versions from cluster"); - } - - @Test - void testValidateSStableVersionsSkipsValidationWhenLegacyEnabled() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - - // Test with invalid versions - should not throw when legacy version-based bridge selection is enabled - Set invalidVersions = new HashSet<>(List.of("invalid-version")); - assertThatNoException() - .describedAs("Validation should be skipped with invalid versions when legacy version-based bridge selection is enabled") - .isThrownBy(() -> dataLayer.validateSStableVersions(invalidVersions, CassandraVersion.FOURZERO, true)); - - // Test with null versions - should not throw when legacy version-based bridge selection is enabled - assertThatNoException() - .describedAs("Validation should be skipped with null versions when legacy version-based bridge selection is enabled") - .isThrownBy(() -> 
dataLayer.validateSStableVersions(null, CassandraVersion.FOURZERO, true)); - } - - @Test - void testValidateSStableVersionsForFiveZeroWithBackwardCompatibility() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - // C* 5.0 should be able to read C* 4.0 SSTable versions - Set sstableVersions = new HashSet<>(Arrays.asList("big-na", "big-nb", "big-oa")); - - assertThatNoException() - .describedAs("FIVEZERO should support reading FOURZERO versions") - .isThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FIVEZERO, false)); - } - - @Test - void testValidateSStableVersionsForFiveZeroWithBtiFormat() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - Set sstableVersions = new HashSet<>(Arrays.asList("big-oa", "bti-da")); - - assertThatNoException() - .describedAs("FIVEZERO should support both big and bti formats") - .isThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FIVEZERO, false)); - } - - @Test - void testValidateSStableVersionsErrorMessageIncludesAllDetails() - { - CassandraDataLayer dataLayer = createTestDataLayer(); - Set sstableVersions = new HashSet<>(List.of("big-oa")); - - assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstableVersions, CassandraVersion.FOURZERO, false)) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessageContaining("Detected unsupported SSTable version(s)") - .hasMessageContaining("Supported versions:") - .hasMessageContaining("Observed SSTable versions in the cluster:") - .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); - } - @Test void testValidateSStableVersionsListWithValidVersions() { @@ -324,10 +224,16 @@ void testInitializeSSTableVersionsAndBridgeVersionWithMixedVersions() .containsExactlyInAnyOrder("big-na", "big-oa"); } - private CassandraDataLayer createTestDataLayer() + @Test + void testInitializeSSTableVersionsAndBridgeVersionFailsFastWhenEnabledAndNoVersions() { - // Use 
TestCassandraDataLayer to avoid SparkContext initialization in unit tests - return new TestCassandraDataLayer(null); + // Feature enabled but the cluster returns no SSTable versions -> fail fast with an actionable hint + CassandraDataLayer dataLayer = new TestCassandraDataLayerForInitTests(Collections.emptySet(), false); + + assertThatThrownBy(() -> dataLayer.initializeSSTableVersionsAndBridgeVersion("5.0.0")) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to retrieve SSTable versions from cluster") + .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); } private CassandraDataLayer createTestDataLayerWithVersions(Set sstableVersions) From 073f42a62740fd818423f7b587ffab740a032502 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Fri, 26 Jun 2026 18:28:08 +0100 Subject: [PATCH 8/9] remove major version checks --- .../cassandra/bridge/CassandraVersion.java | 14 ++++ .../bridge/SSTableVersionAnalyzer.java | 65 ++++++++++++++++++ .../spark/bulkwriter/BulkSparkConf.java | 20 ------ .../bulkwriter/CassandraClusterInfo.java | 7 +- .../CassandraClusterInfoGroup.java | 16 +++-- .../spark/data/CassandraDataLayer.java | 46 +++++++------ .../bridge/SSTableVersionAnalyzerTest.java | 44 ++++++++++++- .../spark/bulkwriter/BulkSparkConfTest.java | 66 ------------------- .../CassandraClusterInfoGroupTest.java | 14 ++++ .../CassandraDataLayerValidationTest.java | 44 ++++++++----- 10 files changed, 205 insertions(+), 131 deletions(-) diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java index b668ee631..904573995 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/bridge/CassandraVersion.java @@ -151,6 +151,20 @@ public Set 
getSupportedSStableVersionsForRead() return Collections.unmodifiableSet(readableVersions); } + /** + * Whether this Cassandra version can read (and therefore import) SSTables natively written by {@code other}. + * A version can read SSTables from every version in the inclusive range + * [{@link #lowestCompatibleVersionNumber}, this version]. For example, Cassandra 5.0 can read 4.0/4.1/5.0 + * SSTables but not 3.x. + * + * @param other the Cassandra version whose SSTables would be read + * @return {@code true} if this version can read {@code other}'s SSTables + */ + public boolean canRead(CassandraVersion other) + { + return other.number >= this.lowestCompatibleVersionNumber && other.number <= this.number; + } + private static final String configuredSSTableFormat; private static final CassandraVersion[] implementedVersions; private static final String[] supportedVersions; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java index d79ef1446..0a0807a14 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/bridge/SSTableVersionAnalyzer.java @@ -20,11 +20,14 @@ package org.apache.cassandra.bridge; import java.util.Comparator; +import java.util.Collection; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; + /** * Determines which Cassandra bridge to load from the SSTable versions present on a cluster. 
*/ @@ -48,6 +51,8 @@ private SSTableVersionAnalyzer() */ public static CassandraVersion determineBridgeVersionForWrite(Set sstableVersionsOnCluster, String requestedFormat) { + requireSSTableVersionsPresent(sstableVersionsOnCluster); + // Every observed version must be mutually compatible (readable by the highest version present). ensureMutuallyCompatibleVersions(sstableVersionsOnCluster, highestCompatibleVersion(sstableVersionsOnCluster)); @@ -76,11 +81,71 @@ public static CassandraVersion determineBridgeVersionForWrite(Set sstabl */ public static CassandraVersion determineBridgeVersionForRead(Set sstableVersionsOnCluster) { + requireSSTableVersionsPresent(sstableVersionsOnCluster); + CassandraVersion highest = highestCompatibleVersion(sstableVersionsOnCluster); ensureMutuallyCompatibleVersions(sstableVersionsOnCluster, highest); return highest; } + /** + * Fails fast with an actionable hint when no SSTable versions could be retrieved while SSTable version-based + * bridge selection is enabled. Shared by the read and write paths. + * + * @param sstableVersionsOnCluster the observed SSTable versions + * @throws IllegalStateException if {@code sstableVersionsOnCluster} is null or empty + */ + private static void requireSSTableVersionsPresent(Set sstableVersionsOnCluster) + { + if (sstableVersionsOnCluster == null || sstableVersionsOnCluster.isEmpty()) + { + throw new IllegalStateException(String.format( + "Unable to retrieve SSTable versions from cluster. This is required for SSTable version-based " + + "bridge selection. To bypass this check and use cassandra.version for bridge selection, set %s=true", + BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); + } + } + + /** + * Determines the bridge version for a coordinated bulk write across multiple clusters, given the per-cluster + * bridge versions. 
Writes at the lowest version present so the produced SSTables are importable by + * every cluster, but first verifies that lowest version's SSTables can actually be read (imported) by the + * highest version present; otherwise the clusters span incompatible Cassandra majors and the + * coordinated write cannot proceed. + * + * @param bridgeVersions the per-cluster bridge versions + * @return the lowest version, which is the version to write at + * @throws IllegalStateException if {@code bridgeVersions} is null or empty + * @throws UnsupportedOperationException if the lowest version's SSTables cannot be imported by the highest + * version present + */ + public static CassandraVersion lowestCompatibleWriteVersionForCoordinatedWrites(Collection bridgeVersions) + { + if (bridgeVersions == null || bridgeVersions.isEmpty()) + { + throw new IllegalStateException("No cluster bridge versions available"); + } + + CassandraVersion highest = bridgeVersions.stream() + .max(Comparator.comparingInt(CassandraVersion::versionNumber)) + .orElseThrow(() -> new IllegalStateException("No cluster bridge versions available")); + CassandraVersion lowest = bridgeVersions.stream() + .min(Comparator.comparingInt(CassandraVersion::versionNumber)) + .orElseThrow(() -> new IllegalStateException("No cluster bridge versions available")); + + // Writing at the lowest version is only safe if the highest version present can import those SSTables. + if (!highest.canRead(lowest)) + { + throw new UnsupportedOperationException(String.format( + "Cluster bridge versions are not mutually compatible for a coordinated write: SSTables written at the " + + "lowest version present (%s) cannot be imported by the highest version present (%s, which reads %s). 
" + + "Per-cluster bridge versions: %s", + lowest.versionName(), highest.versionName(), highest.getSupportedSStableVersionsForRead(), bridgeVersions)); + } + + return lowest; + } + /** * Verifies that every observed SSTable version is readable by the given highest version present; otherwise * the cluster spans incompatible Cassandra majors and the job cannot proceed. diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java index b1e44e830..d6c41ab7d 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConf.java @@ -50,7 +50,6 @@ import org.apache.cassandra.spark.utils.BuildInfo; import org.apache.cassandra.spark.utils.MapUtils; import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -743,25 +742,6 @@ public boolean isSSTableVersionBasedBridgeDisabled() return disableSSTableVersionBasedBridge; } - /** - * Utility method to retrieve the disable SSTable version-based bridge flag - * from Spark configuration. This can be called from contexts where a BulkSparkConf - * instance is not available. - *

- * NOTE: must only be called on the driver. It relies on {@code SparkContext.getOrCreate()}, - * which has no active context on executors and would attempt to build a new (master-less) - * context there. The reader propagates the resolved decision to executors via the - * serialized {@code sstableVersionsOnCluster} set (empty == disabled). - * - * @return true if SSTable version-based bridge selection should be disabled - */ - public static boolean getDisableSSTableVersionBasedBridge() - { - return SparkContext.getOrCreate() - .getConf() - .getBoolean(DISABLE_SSTABLE_VERSION_BASED_BRIDGE, false); - } - public StorageClientConfig getStorageClientConfig() { return storageClientConfig; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java index 792627728..0f11070da 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java @@ -86,7 +86,7 @@ public class CassandraClusterInfo implements ClusterInfo, Closeable // Changes here must be reflected in BroadcastableClusterInfo. protected final BulkSparkConf conf; protected final String clusterId; - protected Partitioner partitioner; + protected volatile Partitioner partitioner; // -- Driver-only fields (not broadcast) -- // NOT included in BroadcastableClusterInfo. Either expensive to serialize @@ -118,6 +118,8 @@ public CassandraClusterInfo(BulkSparkConf conf, String clusterId, CassandraVersi this.clusterId = clusterId; this.bridgeVersion = bridgeVersion; this.cassandraContext = buildCassandraContext(); + // Fire the node-settings requests upfront so they are in-flight while the rest of driver-side + // initialization proceeds. 
LOGGER.info("Getting Cassandra versions from all nodes"); this.allNodeSettingFutures = Sidecar.allNodeSettings(cassandraContext.getSidecarClient(), cassandraContext.getCluster()); @@ -440,7 +442,8 @@ private synchronized List resolveAllNodeSettings() if (allNodeSettingFutures == null) { - throw new IllegalStateException("allNodeSettingFutures is null"); + throw new IllegalStateException("getAllNodeSettings should not be called on executor. " + + "Cassandra version is pre-computed on driver and broadcast to executors."); } final long totalTimeout = conf.getSidecarRequestMaxRetryDelayMillis() * diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java index 4245e7a8b..1368680f6 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -22,7 +22,6 @@ import java.math.BigInteger; import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; import java.util.EnumSet; import java.util.HashMap; import java.util.LinkedHashMap; @@ -41,6 +40,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.bridge.SSTableVersionAnalyzer; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.bulkwriter.CassandraClusterInfo; import org.apache.cassandra.spark.bulkwriter.CassandraContext; @@ -387,7 +387,8 @@ public void setBridgeVersion(CassandraVersion bridgeVersion) *

Each cluster determines its own bridge version (see {@link CassandraClusterInfo#getBridgeVersion()}), * which is the lowest mutually-compatible SSTable version present on that cluster. Across clusters the * lowest of those is chosen so the produced SSTables remain importable by every cluster (a node can import - * its own and older SSTable versions, but not newer ones). + * its own and older SSTable versions, but not newer ones). The selection fails if the lowest version's + * SSTables cannot be imported by the highest version present across clusters. * * @return the determined Cassandra bridge version */ @@ -399,10 +400,11 @@ public CassandraVersion getBridgeVersion() return ((CassandraClusterInfo) clusterInfos.get(0)).getBridgeVersion(); } - // Write at the lowest so every cluster can import the produced SSTables - return clusterInfos.stream() - .map(ci -> ((CassandraClusterInfo) ci).getBridgeVersion()) - .min(Comparator.comparingInt(CassandraVersion::versionNumber)) - .orElseThrow(() -> new IllegalStateException("No cluster bridge versions available")); + // Write at the lowest so every cluster can import the produced SSTables, but only after verifying the + // lowest version's SSTables are importable by the highest version present. 
+ List bridgeVersions = clusterInfos.stream() + .map(ci -> ((CassandraClusterInfo) ci).getBridgeVersion()) + .collect(Collectors.toList()); + return SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites(bridgeVersions); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java index 63c9b9a7d..e2931aa63 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java @@ -100,6 +100,7 @@ import org.apache.cassandra.spark.validation.SidecarValidation; import org.apache.cassandra.spark.validation.StartupValidatable; import org.apache.cassandra.spark.validation.StartupValidator; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.DataType; import org.apache.spark.util.ShutdownHookManager; @@ -334,15 +335,21 @@ private int initBulkReader(@NotNull ClientConfig options) throws ExecutionExcept } /** - * Checks if SSTable version-based bridge selection is disabled. - * Protected to allow test overrides without SparkContext. + * Checks if SSTable version-based bridge selection is disabled, reading the flag from the active + * (driver-side) {@link SparkSession}. Protected to allow test overrides without a SparkSession. * * @return true if disabled, false if enabled */ @VisibleForTesting protected boolean isSSTableVersionBasedBridgeDisabled() { - return BulkSparkConf.getDisableSSTableVersionBasedBridge(); + // Read from the active SparkSession on the driver. This intentionally does not create a SparkContext: + // SparkSession.active() fails fast if called without an active/default session (e.g. on an executor), + // rather than silently building a master-less context. 
+ return SparkSession.active() + .sparkContext() + .getConf() + .getBoolean(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, false); } /** @@ -371,21 +378,12 @@ protected CassandraVersion initializeSSTableVersionsAndBridgeVersion(String cass } else { - Set retrieved = retrieveSSTableVersionsFromCluster(); - // Fail fast with an actionable hint when no SSTable versions could be retrieved while the - // feature is enabled (in disabled mode this branch is not taken, so empty here is unexpected). - if (retrieved.isEmpty()) - { - throw new IllegalStateException(String.format( - "Unable to retrieve SSTable versions from cluster. This is required for SSTable " - + "version-based bridge selection. To bypass this check and use cassandra.version for " - + "bridge selection, set %s=true", BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE)); - } // Wrap in a HashSet: retrieveSSTableVersionsFromCluster() may return Collections.emptySet() // or an unspecified Set type from Collectors.toSet(), but Kryo reads this field back via // kryo.readObject(in, HashSet.class), so the concrete type must be HashSet. - this.sstableVersionsOnCluster = new HashSet<>(retrieved); - // Pick the highest (mutually-compatible) version present on the cluster. + this.sstableVersionsOnCluster = new HashSet<>(retrieveSSTableVersionsFromCluster()); + // Pick the highest (mutually-compatible) version present on the cluster. determineBridgeVersionForRead + // fails fast with an actionable hint when no SSTable versions were retrieved while the feature is enabled. bridgeVersion = SSTableVersionAnalyzer.determineBridgeVersionForRead(sstableVersionsOnCluster); } @@ -960,22 +958,30 @@ void validateSStableVersions(List sstables) return; } + // Accept any SSTable version the already-selected read bridge can read, not just the exact set observed + // in the driver's gossip snapshot. 
The bridge was determined on the driver (the highest version present) + // and reconstructed here; a flush/compaction producing a still-readable version between the snapshot and + // the read should not spuriously fail. Genuinely unreadable versions (e.g. a newer major than the bridge) + // are still rejected. + Set readableVersions = bridge().getVersion().getSupportedSStableVersionsForRead(); + for (SSTable ssTable : sstables) { String ssTableFileName = ssTable.getDataFileName(); // Extract full version string (e.g., "big-nb" from the filename) String ssTableVersion = ssTable.getFormat() + "-" + ssTable.getVersion(); - if (!expectedVersions.contains(ssTableVersion)) + if (!readableVersions.contains(ssTableVersion)) { String errorMessage = String.format( - "SSTable '%s' has version '%s' which was not observed in cluster gossip info. " + - "Expected versions from gossip: %s. " + + "SSTable '%s' has version '%s' which is not readable by the bridge selected from cluster gossip info. " + + "Versions observed in gossip: %s. Versions readable by the selected bridge: %s. 
" + "To retry using cassandra.version based bridge selection, " + "set %s=true", ssTableFileName, ssTableVersion, expectedVersions, + readableVersions, BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE); LOGGER.error(errorMessage); throw new UnsupportedOperationException(errorMessage); @@ -984,8 +990,8 @@ void validateSStableVersions(List sstables) if (!sstables.isEmpty()) { - LOGGER.debug("Validated {} SSTable(s) against expected versions from gossip: {}", - sstables.size(), expectedVersions); + LOGGER.debug("Validated {} SSTable(s) against versions readable by the selected bridge: {}", + sstables.size(), readableVersions); } } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java index 068a68378..b34f22fab 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/bridge/SSTableVersionAnalyzerTest.java @@ -88,7 +88,9 @@ static Stream nullOrEmptyCases() void testDetermineBridgeVersionForWriteNullOrEmptyThrows(Set versions) { assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForWrite(versions, "big")) - .isInstanceOf(IllegalStateException.class); + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to retrieve SSTable versions from cluster") + .hasMessageContaining("disable_sstable_version_based"); } @Test @@ -131,6 +133,46 @@ void testDetermineBridgeVersionForReadRejectsIncompatibleVersions() void testDetermineBridgeVersionForReadNullOrEmptyThrows(Set versions) { assertThatThrownBy(() -> SSTableVersionAnalyzer.determineBridgeVersionForRead(versions)) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Unable to retrieve SSTable versions from cluster") + .hasMessageContaining("disable_sstable_version_based"); + } + + // --- 
lowestCompatibleWriteVersionForCoordinatedWrites: coordinated multi-cluster write target --- + + @Test + void testLowestCompatibleWriteVersionPicksLowestWhenCompatible() + { + // 4.0 + 5.0: SSTables written at 4.0 are importable by 5.0, so the lowest (4.0) is chosen + assertThat(SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites( + Arrays.asList(CassandraVersion.FOURZERO, CassandraVersion.FIVEZERO))) + .isEqualTo(CassandraVersion.FOURZERO); + } + + @Test + void testLowestCompatibleWriteVersionSameVersion() + { + assertThat(SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites( + Arrays.asList(CassandraVersion.FIVEZERO, CassandraVersion.FIVEZERO))) + .isEqualTo(CassandraVersion.FIVEZERO); + } + + @Test + void testLowestCompatibleWriteVersionRejectsIncompatibleMajors() + { + // 3.0 + 5.0: SSTables written at 3.0 cannot be imported by 5.0 -> reject + assertThatThrownBy(() -> SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites( + Arrays.asList(CassandraVersion.THREEZERO, CassandraVersion.FIVEZERO))) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("not mutually compatible"); + } + + @Test + void testLowestCompatibleWriteVersionNullOrEmptyThrows() + { + assertThatThrownBy(() -> SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites(Collections.emptyList())) + .isInstanceOf(IllegalStateException.class); + assertThatThrownBy(() -> SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites(null)) .isInstanceOf(IllegalStateException.class); } } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java index 239f15410..5fbca3504 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java +++ 
b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkSparkConfTest.java @@ -33,7 +33,6 @@ import org.apache.cassandra.spark.bulkwriter.util.SbwKryoRegistrator; import org.apache.cassandra.spark.utils.BuildInfo; import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; import org.jetbrains.annotations.NotNull; import static org.apache.cassandra.spark.bulkwriter.BulkSparkConf.DEFAULT_SIDECAR_PORT; @@ -427,71 +426,6 @@ void testSSTableVersionBasedBridgeEnabledExplicitly() .isFalse(); } - @Test - void testGetDisableSSTableVersionBasedBridgeDefaultValue() - { - // Create a SparkContext without setting the DISABLE_SSTABLE_VERSION_BASED_BRIDGE flag - SparkConf conf = new SparkConf() - .setMaster("local[1]") - .setAppName("test-static-get-default"); - SparkContext sc = new SparkContext(conf); - try - { - boolean result = BulkSparkConf.getDisableSSTableVersionBasedBridge(); - assertThat(result) - .describedAs("getDisableSSTableVersionBasedBridge should return false (enabled) when flag is not set") - .isFalse(); - } - finally - { - sc.stop(); - } - } - - @Test - void testGetDisableSSTableVersionBasedBridgeWhenDisabled() - { - // Create a SparkContext with DISABLE_SSTABLE_VERSION_BASED_BRIDGE set to true - SparkConf conf = new SparkConf() - .setMaster("local[1]") - .setAppName("test-static-get-disabled") - .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "true"); - SparkContext sc = new SparkContext(conf); - try - { - boolean result = BulkSparkConf.getDisableSSTableVersionBasedBridge(); - assertThat(result) - .describedAs("getDisableSSTableVersionBasedBridge should return true (disabled) when flag is set to true") - .isTrue(); - } - finally - { - sc.stop(); - } - } - - @Test - void testGetDisableSSTableVersionBasedBridgeWhenEnabled() - { - // Create a SparkContext with DISABLE_SSTABLE_VERSION_BASED_BRIDGE explicitly set to false - SparkConf conf = new SparkConf() - .setMaster("local[1]") - 
.setAppName("test-static-get-enabled") - .set(BulkSparkConf.DISABLE_SSTABLE_VERSION_BASED_BRIDGE, "false"); - SparkContext sc = new SparkContext(conf); - try - { - boolean result = BulkSparkConf.getDisableSSTableVersionBasedBridge(); - assertThat(result) - .describedAs("getDisableSSTableVersionBasedBridge should return false (enabled) when flag is explicitly set to false") - .isFalse(); - } - finally - { - sc.stop(); - } - } - private Map copyDefaultOptions() { TreeMap map = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java index f5d14575a..04716c50e 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java @@ -335,6 +335,20 @@ void testGetBridgeVersionFourZeroAndFourOneReturnsLowest() assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); } + @Test + void testGetBridgeVersionIncompatibleMajorsThrows() + { + // 3.0 and 5.0 clusters -> SSTables written at 3.0 cannot be imported by 5.0, so the coordinated write fails + CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { + CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); + when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? 
CassandraVersion.THREEZERO : CassandraVersion.FIVEZERO); + return clusterInfo; + }); + assertThatThrownBy(group::getBridgeVersion) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("not mutually compatible"); + } + private CassandraClusterInfoGroup mockClusterGroup(int size, Function clusterInfoCreator) { diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java index 23a6c83e3..c70a96cac 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/CassandraDataLayerValidationTest.java @@ -28,6 +28,7 @@ import org.junit.jupiter.api.Test; import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.bridge.SSTableVersionAnalyzer; import org.apache.cassandra.clients.Sidecar; import org.apache.cassandra.spark.data.partitioner.ConsistencyLevel; import org.apache.cassandra.spark.data.partitioner.TokenPartitioner; @@ -60,8 +61,10 @@ void testValidateSStableVersionsListWithValidVersions() } @Test - void testValidateSStableVersionsListWithUnexpectedVersion() + void testValidateSStableVersionsAcceptsReadableVersionNotInGossip() { + // Only big-na was observed in gossip, but the selected read bridge (4.0) can also read big-nb. A version + // that appears after the gossip snapshot but is still readable by the bridge must not fail validation. 
Set expectedVersions = new HashSet<>(List.of("big-na")); CassandraDataLayer dataLayer = createTestDataLayerWithVersions(expectedVersions); @@ -69,11 +72,9 @@ void testValidateSStableVersionsListWithUnexpectedVersion() SSTable ssTable2 = createMockSSTable("big", "nb", "test2-big-nb-Data.db"); List sstables = Arrays.asList(ssTable1, ssTable2); - assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessageContaining("has version 'big-nb' which was not observed in cluster gossip info") - .hasMessageContaining("test2-big-nb-Data.db") - .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); + assertThatNoException() + .describedAs("A version readable by the selected bridge but not in the gossip snapshot should be accepted") + .isThrownBy(() -> dataLayer.validateSStableVersions(sstables)); } @Test @@ -99,7 +100,7 @@ void testValidateSStableVersionsListErrorMessageIncludesFileName() assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) .isInstanceOf(UnsupportedOperationException.class) .hasMessageContaining("keyspace-table-big-oa-Data.db") - .hasMessageContaining("Expected versions from gossip:") + .hasMessageContaining("Versions observed in gossip:") .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); } @@ -119,20 +120,22 @@ void testValidateSStableVersionsListWithMixedBigAndBtiFormats() } @Test - void testValidateSStableVersionsListWithUnexpectedBtiVersion() + void testValidateSStableVersionsRejectsUnreadableOlderVersion() { + // The selected read bridge (5.0, from big-oa) cannot read 3.x SSTables (big-mf), so it must be rejected + // even though both are "big" format. 
Set expectedVersions = new HashSet<>(List.of("big-oa")); CassandraDataLayer dataLayer = createTestDataLayerWithVersions(expectedVersions); SSTable ssTable1 = createMockSSTable("big", "oa", "test1-big-oa-Data.db"); - SSTable ssTable2 = createMockSSTable("bti", "da", "keyspace-table-bti-da-Data.db"); + SSTable ssTable2 = createMockSSTable("big", "mf", "keyspace-table-big-mf-Data.db"); List sstables = Arrays.asList(ssTable1, ssTable2); assertThatThrownBy(() -> dataLayer.validateSStableVersions(sstables)) .isInstanceOf(UnsupportedOperationException.class) - .hasMessageContaining("has version 'bti-da' which was not observed in cluster gossip info") - .hasMessageContaining("keyspace-table-bti-da-Data.db") - .hasMessageContaining("Expected versions from gossip:") + .hasMessageContaining("has version 'big-mf' which is not readable by the bridge") + .hasMessageContaining("keyspace-table-big-mf-Data.db") + .hasMessageContaining("Versions observed in gossip:") .hasMessageContaining("set spark.cassandra_analytics.bridge.disable_sstable_version_based=true"); } @@ -268,7 +271,7 @@ private static class TestCassandraDataLayer extends CassandraDataLayer null, // sslConfig mock(CqlTable.class), // cqlTable mock(TokenPartitioner.class), // tokenPartitioner - CassandraVersion.FOURZERO, // bridgeVersion + bridgeVersionFor(sstableVersions), // bridgeVersion (matches what the driver would select) ConsistencyLevel.LOCAL_QUORUM, // consistencyLevel "127.0.0.1", // sidecarInstances 9043, // sidecarPort @@ -286,6 +289,18 @@ private static class TestCassandraDataLayer extends CassandraDataLayer this.sstableVersionsOnCluster = sstableVersions; } + /** + * Derives the bridge version the driver would select for these SSTable versions, so that + * {@code bridge().getVersion()} (used by validateSStableVersions) matches the scenario. Falls back to + * 4.0 when no versions are supplied (disabled-feature / init-only cases). 
+ */ + private static CassandraVersion bridgeVersionFor(Set sstableVersions) + { + return (sstableVersions == null || sstableVersions.isEmpty()) + ? CassandraVersion.FOURZERO + : SSTableVersionAnalyzer.determineBridgeVersionForRead(sstableVersions); + } + @Override public void startupValidate() { @@ -307,8 +322,7 @@ protected void initInstanceMap() @Override protected boolean isSSTableVersionBasedBridgeDisabled() { - // Override to avoid calling BulkSparkConf.getDisableSSTableVersionBasedBridge() - // which would try to initialize SparkContext in unit tests + // Override to avoid reading from the active SparkSession, which is not available in unit tests. // Always return false to test the validation logic return false; } From 6394ca7f244e094d180ec7af70e7862107cd9fcc Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Fri, 26 Jun 2026 22:39:40 +0100 Subject: [PATCH 9/9] follow existing style --- .../bulkwriter/AbstractBulkWriterContext.java | 58 ++--- .../bulkwriter/BroadcastableClusterInfo.java | 19 +- .../BroadcastableClusterInfoGroup.java | 28 +- .../bulkwriter/BroadcastableTableSchema.java | 7 +- .../spark/bulkwriter/BulkWriterConfig.java | 7 +- .../spark/bulkwriter/BulkWriterContext.java | 3 - .../CassandraBulkWriterContext.java | 31 +-- .../bulkwriter/CassandraClusterInfo.java | 239 +++++++++--------- .../CassandraDirectDataTransportContext.java | 5 +- .../spark/bulkwriter/ClusterInfo.java | 5 +- .../bulkwriter/IBroadcastableClusterInfo.java | 12 +- .../bulkwriter/SSTableWriterFactory.java | 17 +- .../spark/bulkwriter/SortedSSTableWriter.java | 29 ++- .../spark/bulkwriter/TableSchema.java | 8 +- .../CloudStorageStreamSession.java | 3 +- .../CassandraClusterInfoGroup.java | 99 ++++---- ...CassandraCoordinatedBulkWriterContext.java | 34 +-- .../AbstractBulkWriterContextTest.java | 25 +- .../BulkWriterConfigExtensibilityTest.java | 18 +- ...CassandraClusterInfoBridgeVersionTest.java | 13 +- .../bulkwriter/CassandraClusterInfoTest.java | 3 +- 
.../bulkwriter/MockBulkWriterContext.java | 10 +- .../bulkwriter/TableSchemaTestCommon.java | 5 +- .../CassandraClusterInfoGroupTest.java | 24 +- 24 files changed, 321 insertions(+), 381 deletions(-) diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java index c7f72ef1a..7f051e77a 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java @@ -32,7 +32,6 @@ import com.esotericsoftware.kryo.io.Output; import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; @@ -77,7 +76,7 @@ public abstract class AbstractBulkWriterContext implements BulkWriterContext, Kr private final JobInfo jobInfo; private final ClusterInfo clusterInfo; private final SchemaInfo schemaInfo; - private final CassandraVersion bridgeVersion; + private final String bridgeVersion; // Note: do not declare transient fields as final; but they need to be volatile as there could be contention when recreating them after deserialization // For the transient field, they are assigned null once deserialized, remember to use getOrRebuildAfterDeserialization for their getters private transient volatile CassandraBridge bridge; @@ -99,13 +98,10 @@ protected AbstractBulkWriterContext(@NotNull BulkSparkConf conf, this.conf = conf; this.sparkDefaultParallelism = sparkDefaultParallelism; - // Determine the bridge 
version - this.bridgeVersion = getBridgeVersion(); - - // Build cluster info with determined bridge version - this.clusterInfo = buildClusterInfo(this.bridgeVersion); + // Build everything fresh on driver + this.clusterInfo = buildClusterInfo(); this.clusterInfo.startupValidate(); - + this.bridgeVersion = findBridgeVersion(); this.bridge = buildCassandraBridge(); this.jobInfo = buildJobInfo(); this.schemaInfo = buildSchemaInfo(structType); @@ -125,12 +121,9 @@ protected AbstractBulkWriterContext(@NotNull BulkWriterConfig config) this.conf = config.getConf(); this.sparkDefaultParallelism = config.getSparkDefaultParallelism(); - // Get bridge version from broadcast config + // Reconstruct from broadcast data on executor + this.clusterInfo = reconstructClusterInfoOnExecutor(config.getBroadcastableClusterInfo()); this.bridgeVersion = config.getBridgeVersion(); - - // Reconstruct from broadcast data on executor with bridge version - this.clusterInfo = reconstructClusterInfoOnExecutor(config.getBroadcastableClusterInfo(), this.bridgeVersion); - this.bridge = buildCassandraBridge(); this.jobInfo = reconstructJobInfoOnExecutor(config.getBroadcastableJobInfo()); this.schemaInfo = reconstructSchemaInfoOnExecutor(config.getBroadcastableSchemaInfo()); @@ -148,36 +141,14 @@ protected final int sparkDefaultParallelism() return sparkDefaultParallelism; } - @Override - public CassandraVersion bridgeVersion() + protected String bridgeVersion() { - if (bridgeVersion == null) - { - throw new IllegalStateException( - "Bridge version must be determined before accessing it. " + - "Ensure SSTable versions are retrieved from cluster and bridge version is set."); - } - return bridgeVersion; } - /** - * Determines the Cassandra bridge version to use for this write. Implementations apply any version - * override, SSTable-version-based selection, or the legacy cassandra.version fallback as appropriate. 
- * - * @return the determined Cassandra bridge version - */ - protected abstract CassandraVersion getBridgeVersion(); - /*--- Methods to build required fields ---*/ - /** - * Builds the ClusterInfo with the determined bridge version. - * - * @param bridgeVersion the determined Cassandra bridge version - * @return ClusterInfo instance with bridge version set - */ - protected abstract ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion); + protected abstract ClusterInfo buildClusterInfo(); /** * Reconstructs ClusterInfo on executors from broadcastable versions. @@ -186,13 +157,11 @@ public CassandraVersion bridgeVersion() * into the appropriate full ClusterInfo implementation. * * @param clusterInfo the BroadcastableClusterInfo from broadcast - * @param bridgeVersion the bridge version from broadcast * @return reconstructed ClusterInfo (CassandraClusterInfo or CassandraClusterInfoGroup) */ - protected ClusterInfo reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo clusterInfo, - CassandraVersion bridgeVersion) + protected ClusterInfo reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo clusterInfo) { - return clusterInfo.reconstruct(bridgeVersion); + return clusterInfo.reconstruct(); } /** @@ -260,6 +229,11 @@ protected JobStatsPublisher buildJobStatsPublisher() return new LogStatsPublisher(); } + protected String findBridgeVersion() + { + return cluster().getBridgeVersion(); + } + protected SchemaInfo buildSchemaInfo(StructType structType) { QualifiedTableName tableName = job().qualifiedTableName(); @@ -339,7 +313,7 @@ public void shutdown() protected TableSchema initializeTableSchema(@NotNull BulkSparkConf conf, @NotNull StructType dfSchema, TableInfoProvider tableInfoProvider, - CassandraVersion bridgeVersion) + String bridgeVersion) { return new TableSchema(dfSchema, tableInfoProvider, diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java index e2cd8e87b..ec36fcb99 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfo.java @@ -19,7 +19,6 @@ package org.apache.cassandra.spark.bulkwriter; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.data.partitioner.Partitioner; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -46,6 +45,7 @@ public final class BroadcastableClusterInfo implements IBroadcastableClusterInfo // Essential fields broadcast to executors private final Partitioner partitioner; + private final String bridgeVersion; private final String clusterId; private final BulkSparkConf conf; @@ -58,26 +58,31 @@ public final class BroadcastableClusterInfo implements IBroadcastableClusterInfo */ public static BroadcastableClusterInfo from(@NotNull ClusterInfo source, @NotNull BulkSparkConf conf) { - return new BroadcastableClusterInfo(source.getPartitioner(), - source.clusterId(), - conf); + return new BroadcastableClusterInfo(source.getPartitioner(), source.getBridgeVersion(), source.clusterId(), conf); } private BroadcastableClusterInfo(@NotNull Partitioner partitioner, + @NotNull String bridgeVersion, @Nullable String clusterId, @NotNull BulkSparkConf conf) { this.partitioner = partitioner; + this.bridgeVersion = bridgeVersion; this.clusterId = clusterId; this.conf = conf; } - @NotNull public BulkSparkConf getConf() { return conf; } + @Override + public String getBridgeVersion() + { + return bridgeVersion; + } + @Override public Partitioner getPartitioner() { @@ -92,8 +97,8 @@ public String clusterId() } @Override - public ClusterInfo reconstruct(CassandraVersion bridgeVersion) + public ClusterInfo reconstruct() { - return new 
CassandraClusterInfo(this, bridgeVersion); + return new CassandraClusterInfo(this); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java index 9cba76fca..5084b72b6 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableClusterInfoGroup.java @@ -24,7 +24,6 @@ import java.util.List; import java.util.function.BiConsumer; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterSupport; import org.apache.cassandra.spark.data.partitioner.Partitioner; @@ -35,7 +34,7 @@ * Broadcastable wrapper for coordinated writes with ZERO transient fields to optimize Spark broadcasting. *
<p>
* This class wraps multiple BroadcastableCluster instances for multi-cluster scenarios. - * Pre-computed values (partitioner) are extracted from CassandraClusterInfoGroup on the driver + * Pre-computed values (partitioner, bridgeVersion) are extracted from CassandraClusterInfoGroup on the driver * to avoid duplicating aggregation/validation logic on executors. *
<p>
* Why ZERO transient fields matters:
@@ -57,10 +56,12 @@ public final class BroadcastableClusterInfoGroup implements IBroadcastableCluste private final String clusterId; private final BulkSparkConf conf; private final Partitioner partitioner; + private final String bridgeVersion; /** * Creates a BroadcastableClusterInfoGroup from a source ClusterInfo group. - * Extracts pre-computed partitioner from the source to avoid duplicating validation logic on executors. + * Extracts pre-computed values (partitioner, bridgeVersion) from the source + * to avoid duplicating aggregation/validation logic on executors. * * @param source the source CassandraClusterInfoGroup * @param conf the BulkSparkConf needed to connect to Sidecar on executors @@ -71,22 +72,25 @@ public static BroadcastableClusterInfoGroup from(@NotNull CassandraClusterInfoGr List broadcastableInfos = new ArrayList<>(); source.forEach((clusterId, clusterInfo) -> broadcastableInfos.add(BroadcastableClusterInfo.from(clusterInfo, conf))); - // Extract pre-computed value from CassandraClusterInfoGroup + // Extract pre-computed values from CassandraClusterInfoGroup // These have already been validated/computed on the driver Partitioner partitioner = source.getPartitioner(); + String bridgeVersion = source.getBridgeVersion(); - return new BroadcastableClusterInfoGroup(broadcastableInfos, source.clusterId(), conf, partitioner); + return new BroadcastableClusterInfoGroup(broadcastableInfos, source.clusterId(), conf, partitioner, bridgeVersion); } private BroadcastableClusterInfoGroup(List clusterInfos, String clusterId, BulkSparkConf conf, - Partitioner partitioner) + Partitioner partitioner, + String bridgeVersion) { this.clusterInfos = Collections.unmodifiableList(clusterInfos); this.conf = conf; this.clusterId = clusterId; this.partitioner = partitioner; + this.bridgeVersion = bridgeVersion; } @Override @@ -96,6 +100,14 @@ public BulkSparkConf getConf() return conf; } + @Override + public String getBridgeVersion() + { + // Return pre-computed value 
from CassandraClusterInfoGroup + // No need to duplicate aggregation/validation logic + return bridgeVersion; + } + @Override public Partitioner getPartitioner() { @@ -131,8 +143,8 @@ public IBroadcastableClusterInfo getValueOrNull(@NotNull String clusterId) } @Override - public ClusterInfo reconstruct(CassandraVersion bridgeVersion) + public ClusterInfo reconstruct() { - return CassandraClusterInfoGroup.from(this, bridgeVersion); + return CassandraClusterInfoGroup.from(this); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java index bdcd91d43..69aebf9b2 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java @@ -24,7 +24,6 @@ import com.google.common.base.Preconditions; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.common.schema.ColumnType; import org.jetbrains.annotations.NotNull; @@ -59,7 +58,7 @@ public final class BroadcastableTableSchema implements Serializable private final WriteMode writeMode; private final TTLOption ttlOption; private final TimestampOption timestampOption; - private final CassandraVersion bridgeVersion; + private final String bridgeVersion; private final boolean quoteIdentifiers; /** @@ -95,7 +94,7 @@ private BroadcastableTableSchema(String createStatement, WriteMode writeMode, TTLOption ttlOption, TimestampOption timestampOption, - CassandraVersion bridgeVersion, + String bridgeVersion, boolean quoteIdentifiers) { this.createStatement = createStatement; @@ -156,7 +155,7 @@ public TimestampOption getTimestampOption() return timestampOption; } - public CassandraVersion getBridgeVersion() + public String getBridgeVersion() { return bridgeVersion; } 
diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java index 7e69ec1ef..621acaf48 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfig.java @@ -57,8 +57,7 @@ public class BulkWriterConfig implements Serializable // BroadcastableClusterInfo can be either BroadcastableCluster or BroadcastableClusterInfoGroup private final IBroadcastableClusterInfo clusterInfo; private final BroadcastableSchemaInfo schemaInfo; - // SSTable version-based bridge loading fields - private final CassandraVersion bridgeVersion; + private final String bridgeVersion; /** * Creates a new immutable BulkWriterConfig with pre-computed values @@ -75,7 +74,7 @@ public BulkWriterConfig(@NotNull BulkSparkConf conf, @NotNull BroadcastableJobInfo jobInfo, @NotNull IBroadcastableClusterInfo clusterInfo, @NotNull BroadcastableSchemaInfo schemaInfo, - @NotNull CassandraVersion bridgeVersion) + @NotNull String bridgeVersion) { this.conf = conf; this.sparkDefaultParallelism = sparkDefaultParallelism; @@ -110,7 +109,7 @@ public BroadcastableSchemaInfo getBroadcastableSchemaInfo() return schemaInfo; } - public CassandraVersion getBridgeVersion() + public String getBridgeVersion() { return bridgeVersion; } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java index a717f1fb9..8f240215c 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BulkWriterContext.java @@ -22,7 +22,6 @@ import 
org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.CassandraCoordinatedBulkWriterContext; import org.apache.cassandra.spark.common.stats.JobStatsPublisher; import org.apache.cassandra.bridge.CassandraBridge; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.spark.api.java.JavaSparkContext; /** @@ -48,8 +47,6 @@ public interface BulkWriterContext CassandraBridge bridge(); - CassandraVersion bridgeVersion(); - // NOTE: This interface intentionally does *not* implement AutoClosable as Spark can close Broadcast variables // that implement AutoClosable while they are still in use, causing the underlying object to become unusable void shutdown(); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java index 8f7b8f517..92721c5a4 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkWriterContext.java @@ -24,7 +24,6 @@ import com.google.common.base.Preconditions; import org.apache.commons.lang3.StringUtils; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.types.StructType; @@ -39,13 +38,6 @@ // CHECKSTYLE IGNORE: This class cannot be declared as final, because consumers should be able to extend it public class CassandraBulkWriterContext extends AbstractBulkWriterContext { - // A temporary CassandraClusterInfo created with bridgeVersion=null during driver-side initialization. - // This preliminaryClusterInfo provides the Sidecar connectivity needed to determine the bridge version. 
- // Once bridge version is determined, buildClusterInfo() promotes this by setting its bridge version. - // Marked transient because it is only used during the driver-side constructor and must not be serialized - // when broadcasting to executors. - private transient CassandraClusterInfo preliminaryClusterInfo; - protected CassandraBulkWriterContext(@NotNull BulkSparkConf conf, @NotNull StructType structType, int sparkDefaultParallelism) @@ -65,28 +57,9 @@ protected CassandraBulkWriterContext(@NotNull BulkWriterConfig config) } @Override - protected CassandraVersion getBridgeVersion() - { - return getOrCreatePreliminaryClusterInfo(bulkSparkConf()).getBridgeVersion(); - } - - @Override - protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) + protected ClusterInfo buildClusterInfo() { - CassandraClusterInfo clusterInfo = getOrCreatePreliminaryClusterInfo(bulkSparkConf()); - preliminaryClusterInfo = null; - clusterInfo.setBridgeVersion(bridgeVersion); - return clusterInfo; - } - - private CassandraClusterInfo getOrCreatePreliminaryClusterInfo(BulkSparkConf conf) - { - if (preliminaryClusterInfo == null) - { - preliminaryClusterInfo = new CassandraClusterInfo(conf, null); - } - - return preliminaryClusterInfo; + return new CassandraClusterInfo(bulkSparkConf()); } @Override diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java index 0f11070da..7ef25b099 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfo.java @@ -33,7 +33,7 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; - +import java.util.concurrent.atomic.AtomicReference; import 
java.util.stream.Collectors; import com.google.common.annotations.VisibleForTesting; @@ -72,7 +72,7 @@ * and includes the result in the {@link BulkWriterConfig} that gets broadcast. *
<p>
* On executors, a new instance is reconstructed from {@link BroadcastableClusterInfo} - * using {@link #CassandraClusterInfo(BroadcastableClusterInfo, CassandraVersion)}, reusing broadcast-safe + * using {@link #CassandraClusterInfo(BroadcastableClusterInfo)}, reusing broadcast-safe * fields and fetching other data fresh from Sidecar. * * @see BroadcastableClusterInfo for the broadcast-safe subset of fields @@ -86,7 +86,8 @@ public class CassandraClusterInfo implements ClusterInfo, Closeable // Changes here must be reflected in BroadcastableClusterInfo. protected final BulkSparkConf conf; protected final String clusterId; - protected volatile Partitioner partitioner; + protected String bridgeVersion; + protected Partitioner partitioner; // -- Driver-only fields (not broadcast) -- // NOT included in BroadcastableClusterInfo. Either expensive to serialize @@ -96,53 +97,43 @@ public class CassandraClusterInfo implements ClusterInfo, Closeable protected volatile String keyspaceSchema; protected volatile ReplicationFactor replicationFactor; protected volatile CassandraContext cassandraContext; - protected volatile CassandraVersion bridgeVersion; - private final List> allNodeSettingFutures; - private List resolvedNodeSettings; + protected final AtomicReference nodeSettings; + protected final List> allNodeSettingFutures; - public CassandraClusterInfo(BulkSparkConf conf, CassandraVersion bridgeVersion) + public CassandraClusterInfo(BulkSparkConf conf) { - this(conf, null, bridgeVersion); + this(conf, null); } - /** - * Constructor with bridge version for driver-side usage. 
- * - * @param conf Bulk Spark configuration - * @param clusterId Optional cluster identifier - * @param bridgeVersion Determined bridge version (nullable for preliminary construction) - */ - public CassandraClusterInfo(BulkSparkConf conf, String clusterId, CassandraVersion bridgeVersion) + // Used by CassandraClusterInfoGroup + public CassandraClusterInfo(BulkSparkConf conf, String clusterId) { this.conf = conf; this.clusterId = clusterId; - this.bridgeVersion = bridgeVersion; this.cassandraContext = buildCassandraContext(); - // Fire the node-settings requests upfront so they are in-flight while the rest of driver-side - // initialization proceeds. LOGGER.info("Getting Cassandra versions from all nodes"); + this.nodeSettings = new AtomicReference<>(null); this.allNodeSettingFutures = Sidecar.allNodeSettings(cassandraContext.getSidecarClient(), cassandraContext.getCluster()); } /** * Reconstruct from BroadcastableCluster on executor. - * Reuses partitioner and bridge version from broadcast, + * Reuses bridgeVersion and partitioner from broadcast, * fetches other data (tokenRangeMapping, replicationFactor, keyspaceSchema, writeAvailability) fresh from Sidecar. * * @param broadcastable the broadcastable cluster info from broadcast - * @param bridgeVersion the bridge version from broadcast */ - public CassandraClusterInfo(BroadcastableClusterInfo broadcastable, CassandraVersion bridgeVersion) + public CassandraClusterInfo(BroadcastableClusterInfo broadcastable) { this.conf = broadcastable.getConf(); this.clusterId = broadcastable.clusterId(); this.partitioner = broadcastable.getPartitioner(); - this.bridgeVersion = bridgeVersion; + this.bridgeVersion = broadcastable.getBridgeVersion(); this.cassandraContext = buildCassandraContext(); - LOGGER.info("Reconstructing CassandraClusterInfo on executor from BroadcastableCluster. clusterId={}, bridgeVersion={}", - clusterId, bridgeVersion != null ? 
bridgeVersion.versionName() : "null"); - // Executors do not need to query all node settings since cassandraVersion is already set from broadcast + LOGGER.info("Reconstructing CassandraClusterInfo on executor from BroadcastableCluster. clusterId={}", clusterId); + this.nodeSettings = new AtomicReference<>(null); + // Executors do not need to query all node settings since the bridge version is already set from broadcast this.allNodeSettingFutures = null; } @@ -219,9 +210,24 @@ public Partitioner getPartitioner() { if (partitioner == null) { - List settings = resolveAllNodeSettings(); - String partitionerString = settings.get(0).partitioner(); - partitioner = Partitioner.from(partitionerString); + try + { + String partitionerString; + NodeSettings currentNodeSettings = nodeSettings.get(); + if (currentNodeSettings != null) + { + partitionerString = currentNodeSettings.partitioner(); + } + else + { + partitionerString = getCassandraContext().getSidecarClient().nodeSettings().get().partitioner(); + } + partitioner = Partitioner.from(partitionerString); + } + catch (ExecutionException | InterruptedException exception) + { + throw new RuntimeException("Unable to retrieve partitioner information", exception); + } } return partitioner; } @@ -386,6 +392,68 @@ public TokenRangeMapping getTokenRangeMapping(boolean cached) } } + /** + * Returns the Cassandra bridge version for this cluster, computed lazily on first use and cached. + * Mirrors {@link #getVersionFromSidecar()}'s lazy pattern: it works on the driver (where the value is + * computed from the cluster) and on executors (where the cached value is seeded from the broadcast + * {@link BroadcastableClusterInfo}, so no re-query is needed). The determination itself is in + * {@link #determineBridgeVersion()}. + * + * @return the determined Cassandra bridge version string (e.g. 
"5.0.0") + */ + @Override + public String getBridgeVersion() + { + String currentBridgeVersion = bridgeVersion; + if (currentBridgeVersion != null) + { + return currentBridgeVersion; + } + + synchronized (this) + { + if (bridgeVersion == null) + { + bridgeVersion = determineBridgeVersion(); + } + return bridgeVersion; + } + } + + /** + * Determines the Cassandra bridge version for this cluster, in priority order: + *

    + *
  1. an explicit version override from {@link #getVersionFromFeature()} (an operator escape hatch) — + * takes precedence even when SSTable-version-based selection is enabled;
  2. + *
  3. otherwise, when SSTable version-based selection is enabled, the lowest version derived from the + * SSTable versions present, so the produced SSTables remain importable by every node;
  4. + *
  5. otherwise (feature disabled), the lowest Cassandra release version reported by the cluster.
  6. + *
+ * + * @return a version string for the determined bridge (e.g. "5.0.0") + */ + private String determineBridgeVersion() + { + String versionOverride = getVersionFromFeature(); + if (versionOverride != null) + { + // Forcing writer to use a particular version; validate it is a supported version up front + CassandraVersion.fromVersion(versionOverride) + .orElseThrow(() -> new UnsupportedOperationException( + "Unsupported Cassandra version override: " + versionOverride)); + return versionOverride; + } + + if (!conf.isSSTableVersionBasedBridgeDisabled()) + { + CassandraVersion determined = SSTableVersionAnalyzer.determineBridgeVersionForWrite(getSSTableVersionsOnCluster(), + CassandraVersion.configuredSSTableFormat()); + // Return a full major.minor.patch string so it parses via CassandraVersionFeatures downstream + return determined.versionName() + ".0"; + } + + return getVersionFromSidecar(); + } @Override public Map clusterWriteAvailability() @@ -416,36 +484,22 @@ private TokenRangeMapping getTokenRangeReplicasFromSidecar() metadata -> new RingInstance(metadata, clusterId())); } - /** - * Sets the bridge version after preliminary construction. - * This allows constructing CassandraClusterInfo with a null bridgeVersion for early - * context reuse, then setting the version once it has been determined. - * - * @param bridgeVersion the determined Cassandra bridge version - */ - public void setBridgeVersion(CassandraVersion bridgeVersion) + public String getVersionFromFeature() { - this.bridgeVersion = bridgeVersion; + return null; } - /** - * Resolves the node settings futures on first call and caches the result. - * - * @return list of resolved NodeSettings from all nodes - */ - private synchronized List resolveAllNodeSettings() + protected List getAllNodeSettings() { - if (resolvedNodeSettings != null) - { - return resolvedNodeSettings; - } - if (allNodeSettingFutures == null) { throw new IllegalStateException("getAllNodeSettings should not be called on executor. 
" + "Cassandra version is pre-computed on driver and broadcast to executors."); } + // Worst-case, the http client is configured for 1 worker pool. + // In that case, each future can take the full retry delay * number of retries, + // and each instance will be processed serially. final long totalTimeout = conf.getSidecarRequestMaxRetryDelayMillis() * conf.getSidecarRequestRetries() * allNodeSettingFutures.size(); @@ -464,72 +518,37 @@ else if (allNodeSettings.size() < allNodeSettingFutures.size()) allNodeSettings.size(), allNodeSettingFutures.size()); } - resolvedNodeSettings = allNodeSettings; - return resolvedNodeSettings; + return allNodeSettings; } - /** - * Allows a subclass to override the Cassandra version via a feature flag. - * Returns {@code null} by default, meaning the version is determined from the cluster. - * - * @return overridden Cassandra version, or {@code null} to determine it from the cluster - */ - public String getVersionFromFeature() - { - return null; - } - - /** - * Determines the Cassandra bridge version for this cluster, in priority order: - *
    - *
  1. an explicit version override from {@link #getVersionFromFeature()} (an operator escape hatch) — - * takes precedence even when SSTable-version-based selection is enabled;
  2. - *
  3. otherwise, when SSTable version-based selection is enabled, the lowest version derived from the - * SSTable versions present, so the produced SSTables remain importable by every node;
  4. - *
  5. otherwise (feature disabled), the lowest Cassandra release version reported by the cluster.
  6. - *
- * - * @return the determined Cassandra bridge version - */ - public CassandraVersion getBridgeVersion() + public String getVersionFromSidecar() { - String versionOverride = getVersionFromFeature(); - if (versionOverride != null) + NodeSettings nodeSettings = this.nodeSettings.get(); + if (nodeSettings != null) { - return CassandraVersion.fromVersion(versionOverride) - .orElseThrow(() -> new UnsupportedOperationException( - "Unsupported Cassandra version override: " + versionOverride)); + return nodeSettings.releaseVersion(); } - if (!conf.isSSTableVersionBasedBridgeDisabled()) - { - return SSTableVersionAnalyzer.determineBridgeVersionForWrite(getSSTableVersionsOnCluster(), - CassandraVersion.configuredSSTableFormat()); - } - - String lowestCassandraVersion = getLowestCassandraVersion(); - return CassandraVersion.fromVersion(lowestCassandraVersion) - .orElseThrow(() -> new UnsupportedOperationException( - "Unsupported Cassandra version: " + lowestCassandraVersion)); + return getLowestVersion(getAllNodeSettings()); } - /** - * Retrieves the lowest Cassandra version reported by the cluster, using the already-fired - * allNodeSettingFutures. Reuses the existing CassandraContext instead of creating a separate one. 
- * - * @return lowest Cassandra version string - */ - public String getLowestCassandraVersion() + @VisibleForTesting + public String getLowestVersion(List allNodeSettings) { - List allNodeSettings = resolveAllNodeSettings(); - - NodeSettings ns = allNodeSettings - .stream() - .filter(settings -> !settings.releaseVersion().equalsIgnoreCase("unknown")) - .min(Comparator.comparing(settings -> - CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(settings.releaseVersion()))) - .orElseThrow(() -> new RuntimeException("No valid Cassandra Versions were returned from Cassandra Sidecar")); + NodeSettings ns = this.nodeSettings.get(); + if (ns != null) + { + return ns.releaseVersion(); + } + // It is possible to run the below computation multiple times. Since the computation is local-only, it is OK. + ns = allNodeSettings + .stream() + .filter(settings -> !settings.releaseVersion().equalsIgnoreCase("unknown")) + .min(Comparator.comparing(settings -> + CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(settings.releaseVersion()))) + .orElseThrow(() -> new RuntimeException("No valid Cassandra Versions were returned from Cassandra Sidecar")); + nodeSettings.compareAndSet(null, ns); return ns.releaseVersion(); } @@ -553,15 +572,7 @@ public Set getSSTableVersionsOnCluster() protected CassandraBridge bridge() { - // Use the pre-determined bridgeVersion if available - if (bridgeVersion != null) - { - return CassandraBridgeFactory.get(bridgeVersion); - } - - // Bridge version must be set before accessing bridge - throw new IllegalStateException( - "Bridge version must be set during construction before using bridge()."); + return CassandraBridgeFactory.get(getBridgeVersion()); } // Startup Validation diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java 
index 7f2db54ca..68eb19b7e 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java @@ -25,6 +25,7 @@ import com.google.common.collect.Range; import org.apache.cassandra.bridge.CassandraBridge; +import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.spark.bulkwriter.token.ReplicaAwareFailureHandler; import org.jetbrains.annotations.NotNull; @@ -35,15 +36,12 @@ public class CassandraDirectDataTransportContext implements TransportContext.Dir @NotNull private final ClusterInfo clusterInfo; @NotNull - private final CassandraBridge bridge; - @NotNull private final DirectDataTransferApi dataTransferApi; public CassandraDirectDataTransportContext(@NotNull BulkWriterContext bulkWriterContext) { this.jobInfo = bulkWriterContext.job(); this.clusterInfo = bulkWriterContext.cluster(); - this.bridge = bulkWriterContext.bridge(); // Use bridge from context (SSTable version-based) this.dataTransferApi = createDirectDataTransferApi(); } @@ -73,6 +71,7 @@ public DirectDataTransferApi dataTransferApi() // only invoke in constructor protected DirectDataTransferApi createDirectDataTransferApi() { + CassandraBridge bridge = CassandraBridgeFactory.get(clusterInfo.getBridgeVersion()); return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java index a5d0f66d1..93e39bd2c 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/ClusterInfo.java @@ -24,7 +24,6 @@ import com.google.common.collect.Range; 
-import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; import org.apache.cassandra.spark.data.ReplicationFactor; import org.apache.cassandra.spark.data.partitioner.Partitioner; @@ -44,7 +43,7 @@ * for broadcasting to executors via {@link BulkWriterConfig}. *

* On executors, ClusterInfo instances are reconstructed from the broadcastable wrappers using - * {@link AbstractBulkWriterContext#reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo, CassandraVersion)}. + * {@link AbstractBulkWriterContext#reconstructClusterInfoOnExecutor(IBroadcastableClusterInfo)}. */ public interface ClusterInfo extends StartupValidatable { @@ -52,6 +51,8 @@ public interface ClusterInfo extends StartupValidatable TokenRangeMapping getTokenRangeMapping(boolean cached); + String getBridgeVersion(); + /** * @return WriteAvailability per RingInstance in the cluster */ diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java index a1c0af404..8a311a151 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/IBroadcastableClusterInfo.java @@ -21,7 +21,6 @@ import java.io.Serializable; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.data.partitioner.Partitioner; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -38,9 +37,10 @@ * Methods in this interface: *

    *
  • {@link #getPartitioner()} - static cluster partitioner configuration
  • + *
  • {@link #getBridgeVersion()} - pre-computed version string
  • *
  • {@link #clusterId()} - cluster identifier (optional)
  • *
  • {@link #getConf()} - BulkSparkConf needed for reconstruction on executors
  • - *
  • {@link #reconstruct(CassandraVersion)} - reconstructs full ClusterInfo instance on executors with bridge version
  • + *
  • {@link #reconstruct()} - reconstructs full ClusterInfo instance on executors
  • *
*/ public interface IBroadcastableClusterInfo extends Serializable @@ -50,6 +50,11 @@ public interface IBroadcastableClusterInfo extends Serializable */ Partitioner getPartitioner(); + /** + * @return the pre-computed Cassandra bridge version determined on the driver + */ + String getBridgeVersion(); + /** * ID string that can uniquely identify a cluster. * When writing to a single cluster, this may be null. @@ -72,8 +77,7 @@ public interface IBroadcastableClusterInfo extends Serializable * This allows adding new broadcastable types without modifying the reconstruction logic * in {@link AbstractBulkWriterContext}. * - * @param bridgeVersion the bridge version from broadcast * @return reconstructed ClusterInfo (CassandraClusterInfo or CassandraClusterInfoGroup) */ - ClusterInfo reconstruct(CassandraVersion bridgeVersion); + ClusterInfo reconstruct(); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java index f2350c5d4..7aee4f602 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java @@ -22,14 +22,10 @@ import java.util.Set; import org.apache.cassandra.bridge.CassandraBridge; +import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.bridge.SSTableWriter; -/** - * Factory for creating SSTableWriter instances. - * - * @deprecated This class is deprecated. Use {@code BulkWriterContext.bridge().getSSTableWriter()} directly instead. 
- */ -@Deprecated public final class SSTableWriterFactory { private SSTableWriterFactory() @@ -37,13 +33,7 @@ private SSTableWriterFactory() throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); } - /** - * Creates an SSTableWriter using the provided bridge. - * - * @deprecated Use {@code cassandraBridge.getSSTableWriter()} directly instead. - */ - @Deprecated - public static SSTableWriter getSSTableWriter(CassandraBridge cassandraBridge, + public static SSTableWriter getSSTableWriter(CassandraVersionFeatures bridgeVersion, String inDirectory, String partitioner, String createStatement, @@ -51,6 +41,7 @@ public static SSTableWriter getSSTableWriter(CassandraBridge cassandraBridge, Set userDefinedTypeStatements, int bufferSizeMB) { + CassandraBridge cassandraBridge = CassandraBridgeFactory.get(bridgeVersion); return cassandraBridge.getSSTableWriter(inDirectory, partitioner, createStatement, diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java index 5d942ace6..804051781 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java @@ -37,7 +37,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.bridge.SSTableDescriptor; import org.apache.cassandra.spark.common.Digest; import org.apache.cassandra.spark.common.SSTables; @@ -118,17 +120,26 @@ public SortedSSTableWriter(BulkWriterContext writerContext, Path outDir, DigestA this.digestAlgorithm = digestAlgorithm; this.partitionId = 
partitionId; + String bridgeVersion = writerContext.cluster().getBridgeVersion(); + String packageVersion = getPackageVersion(bridgeVersion); + LOGGER.info("Running with version {}", packageVersion); + SchemaInfo schema = writerContext.schema(); TableSchema tableSchema = schema.getTableSchema(); + this.cqlSSTableWriter = SSTableWriterFactory.getSSTableWriter( + CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(packageVersion), + this.outDir.toString(), + writerContext.cluster().getPartitioner().toString(), + tableSchema.createStatement, + tableSchema.modificationStatement, + schema.getUserDefinedTypeStatements(), + writerContext.job().sstableDataSizeInMiB()); + } - // Use the bridge from context which is already loaded based on SSTable version analysis - this.cqlSSTableWriter = writerContext.bridge().getSSTableWriter( - this.outDir.toString(), - writerContext.cluster().getPartitioner().toString(), - tableSchema.createStatement, - tableSchema.modificationStatement, - schema.getUserDefinedTypeStatements(), - writerContext.job().sstableDataSizeInMiB()); + @NotNull + public String getPackageVersion(String bridgeVersion) + { + return CASSANDRA_VERSION_PREFIX + bridgeVersion; } /** @@ -352,7 +363,7 @@ public void validateSSTables(@NotNull BulkWriterContext writerContext, @NotNull private LocalDataLayer buildLocalDataLayer(@NotNull BulkWriterContext writerContext, @NotNull Path outputDirectory, @Nullable Set dataFilePaths) { - CassandraVersion version = writerContext.bridgeVersion(); + CassandraVersion version = CassandraBridgeFactory.getCassandraVersion(writerContext.cluster().getBridgeVersion()); String keyspace = writerContext.job().qualifiedTableName().keyspace(); String schema = writerContext.schema().getTableSchema().createStatement; Partitioner partitioner = writerContext.cluster().getPartitioner(); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java index 69870807c..2efbd81cf 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java @@ -33,7 +33,6 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.common.schema.ColumnType; import org.apache.cassandra.spark.data.CqlField; import org.apache.cassandra.spark.exception.UnsupportedAnalyticsOperationException; @@ -61,7 +60,7 @@ public class TableSchema final WriteMode writeMode; final TTLOption ttlOption; final TimestampOption timestampOption; - final CassandraVersion bridgeVersion; + final String bridgeVersion; final boolean quoteIdentifiers; public TableSchema(StructType dfSchema, @@ -69,7 +68,7 @@ public TableSchema(StructType dfSchema, WriteMode writeMode, TTLOption ttlOption, TimestampOption timestampOption, - CassandraVersion bridgeVersion, + String bridgeVersion, boolean quoteIdentifiers, boolean skipSecondaryIndexCheck) { @@ -200,6 +199,7 @@ private String getInsertStatement(StructType dfSchema, TimestampOption timestampOption) { CassandraBridge bridge = CassandraBridgeFactory.get(bridgeVersion); + List columnNames = Arrays.stream(dfSchema.fieldNames()) .filter(fieldName -> !fieldName.equals(ttlOption.columnName())) .filter(fieldName -> !fieldName.equals(timestampOption.columnName())) @@ -327,7 +327,7 @@ private static List getKeyFieldPositions(StructType dfSchema, .collect(Collectors.toList()); } - private static void validateUserAddedColumns(CassandraVersion bridgeVersion, boolean quoteIdentifiers, + private static void validateUserAddedColumns(String bridgeVersion, boolean quoteIdentifiers, TTLOption ttlOption, TimestampOption timestampOption) { if (!quoteIdentifiers) 
diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java index f99a543ce..b749a3597 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/CloudStorageStreamSession.java @@ -36,6 +36,7 @@ import o.a.c.sidecar.client.shaded.common.request.data.CreateSliceRequestPayload; import o.a.c.sidecar.client.shaded.common.response.data.RestoreJobSummaryResponsePayload; import org.apache.cassandra.bridge.CassandraBridge; +import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.SSTableDescriptor; import org.apache.cassandra.clients.Sidecar; import o.a.c.sidecar.client.shaded.client.SidecarInstance; @@ -81,7 +82,7 @@ public CloudStorageStreamSession(BulkWriterContext bulkWriterContext, SortedSSTa ExecutorService executorService) { this(bulkWriterContext, sstableWriter, transportContext, sessionID, tokenRange, - bulkWriterContext.bridge(), // Use bridge from context (SSTable version-based) + CassandraBridgeFactory.get(bulkWriterContext.cluster().getBridgeVersion()), failureHandler, executorService); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java index 1368680f6..fc8f7e909 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java +++ 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroup.java @@ -33,6 +33,7 @@ import java.util.function.Function; import java.util.stream.Collectors; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Range; import org.apache.commons.lang3.StringUtils; @@ -84,6 +85,7 @@ public class CassandraClusterInfoGroup implements ClusterInfo, MultiClusterSuppo private volatile TokenRangeMapping consolidatedTokenRangeMapping; // Pre-computed values from BroadcastableClusterInfoGroup (only set when reconstructed on executors) private Partitioner cachedPartitioner; + private String cachedBridgeVersion; /** * Creates {@link CassandraClusterInfoGroup} with the list of {@link ClusterInfo} from {@link BulkSparkConf} and validation @@ -93,12 +95,9 @@ public class CassandraClusterInfoGroup implements ClusterInfo, MultiClusterSuppo */ public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf) { - // bridgeVersion is null at construction time; the real version is applied later via setBridgeVersion(), - // once the cluster's SSTable versions have been read from the preliminary group. - return fromBulkSparkConf(conf, clusterId -> new CassandraClusterInfo(conf, clusterId, null)); + return fromBulkSparkConf(conf, clusterId -> new CassandraClusterInfo(conf, clusterId)); } - /** * Reconstruct from BroadcastableClusterInfoGroup on executor. * Creates CassandraClusterInfo instances for each cluster that will fetch data from Sidecar. 
@@ -108,10 +107,9 @@ public static CassandraClusterInfoGroup fromBulkSparkConf(BulkSparkConf conf) * @param broadcastable the broadcastable cluster info group from broadcast * @return new {@link CassandraClusterInfoGroup} instance */ - public static CassandraClusterInfoGroup from(BroadcastableClusterInfoGroup broadcastable, - CassandraVersion bridgeVersion) + public static CassandraClusterInfoGroup from(BroadcastableClusterInfoGroup broadcastable) { - return new CassandraClusterInfoGroup(broadcastable, bridgeVersion); + return new CassandraClusterInfoGroup(broadcastable); } /** @@ -166,18 +164,19 @@ private CassandraClusterInfoGroup(List clusterInfos) * re-validation and re-computation on executors. * * @param broadcastable the broadcastable cluster info group from broadcast - * @param bridgeVersion the bridge version from broadcast */ - private CassandraClusterInfoGroup(BroadcastableClusterInfoGroup broadcastable, CassandraVersion bridgeVersion) + private CassandraClusterInfoGroup(BroadcastableClusterInfoGroup broadcastable) { // Build list of ClusterInfo from broadcastable data List clusterInfosList = new ArrayList<>(); - broadcastable.forEach((clusterId, broadcastableInfo) -> clusterInfosList.add( - new CassandraClusterInfo((BroadcastableClusterInfo) broadcastableInfo, bridgeVersion))); + broadcastable.forEach((clusterId, broadcastableInfo) -> { + clusterInfosList.add(new CassandraClusterInfo((BroadcastableClusterInfo) broadcastableInfo)); + }); this.clusterInfos = Collections.unmodifiableList(clusterInfosList); // Extract pre-computed values from driver to avoid re-validation on executors this.cachedPartitioner = broadcastable.getPartitioner(); + this.cachedBridgeVersion = broadcastable.getBridgeVersion(); this.clusterId = broadcastable.clusterId(); clusterInfoById(); } @@ -213,6 +212,44 @@ public TokenRangeMapping getTokenRangeMapping(boolean cached) return consolidatedTokenRangeMapping; } + /** + * Determines the Cassandra bridge version for a coordinated 
write across all clusters. + * + *

Each cluster determines its own bridge version (see {@link CassandraClusterInfo#getBridgeVersion()}), + * which is the lowest mutually-compatible SSTable version present on that cluster. Across clusters the + * lowest of those is chosen so the produced SSTables remain importable by every cluster (a node can import + * its own and older SSTable versions, but not newer ones). The selection fails if the lowest version's + * SSTables cannot be imported by the highest version present across clusters. + * + *

Computed lazily and cached; on executors the value is seeded from the broadcastable. + * + * @return the determined Cassandra bridge version + */ + @Override + public String getBridgeVersion() + { + // Return cached value if available (executor-side reconstruction) + if (cachedBridgeVersion != null) + { + return cachedBridgeVersion; + } + + if (clusterInfos.size() == 1) + { + return clusterInfos.get(0).getBridgeVersion(); + } + + // Write at the lowest so every cluster can import the produced SSTables, but only after verifying the + // lowest version's SSTables are importable by the highest version present. + List bridgeVersions = clusterInfos.stream() + .map(ClusterInfo::getBridgeVersion) + .map(version -> CassandraVersion.fromVersion(version) + .orElseThrow(() -> new UnsupportedOperationException("Unsupported Cassandra version: " + version))) + .collect(Collectors.toList()); + CassandraVersion lowest = SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites(bridgeVersions); + return lowest.versionName() + ".0"; + } + @Override public Map clusterWriteAvailability() { @@ -368,43 +405,9 @@ private RuntimeException toRuntimeException(ClusterInfo clusterInfo, Throwable c return new RuntimeException("Failed to perform action on cluster: " + clusterInfo.clusterId(), cause); } - /** - * Sets the bridge version on all contained CassandraClusterInfo instances. - * - * @param bridgeVersion the determined Cassandra bridge version - */ - public void setBridgeVersion(CassandraVersion bridgeVersion) + @VisibleForTesting + Map clusterInfoByIdUnsafe() { - for (ClusterInfo ci : clusterInfos) - { - ((CassandraClusterInfo) ci).setBridgeVersion(bridgeVersion); - } - } - - /** - * Determines the Cassandra bridge version for a coordinated write across all clusters. - * - *

Each cluster determines its own bridge version (see {@link CassandraClusterInfo#getBridgeVersion()}), - * which is the lowest mutually-compatible SSTable version present on that cluster. Across clusters the - * lowest of those is chosen so the produced SSTables remain importable by every cluster (a node can import - * its own and older SSTable versions, but not newer ones). The selection fails if the lowest version's - * SSTables cannot be imported by the highest version present across clusters. - * - * @return the determined Cassandra bridge version - */ - public CassandraVersion getBridgeVersion() - { - // Single cluster: use its bridge version directly without aggregation - if (clusterInfos.size() == 1) - { - return ((CassandraClusterInfo) clusterInfos.get(0)).getBridgeVersion(); - } - - // Write at the lowest so every cluster can import the produced SSTables, but only after verifying the - // lowest version's SSTables are importable by the highest version present. - List bridgeVersions = clusterInfos.stream() - .map(ci -> ((CassandraClusterInfo) ci).getBridgeVersion()) - .collect(Collectors.toList()); - return SSTableVersionAnalyzer.lowestCompatibleWriteVersionForCoordinatedWrites(bridgeVersions); + return clusterInfoById; } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java index eb101c699..5b9f2ff15 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraCoordinatedBulkWriterContext.java @@ -24,7 +24,6 @@ import com.google.common.base.Preconditions; import org.apache.commons.lang3.StringUtils; 
-import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.AbstractBulkWriterContext; import org.apache.cassandra.spark.bulkwriter.BroadcastableClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.BroadcastableJobInfo; @@ -47,13 +46,6 @@ */ public class CassandraCoordinatedBulkWriterContext extends AbstractBulkWriterContext { - // A temporary CassandraClusterInfoGroup created with bridgeVersion=null during driver-side initialization. - // This preliminaryGroup provides the Sidecar connectivity needed to determine the bridge version. - // Once bridge version is determined, buildClusterInfo() promotes this by setting its bridge version. - // Marked transient because it is only used during the driver-side constructor and must not be serialized - // when broadcasting to executors. - private transient CassandraClusterInfoGroup preliminaryGroup; - public CassandraCoordinatedBulkWriterContext(@NotNull BulkSparkConf conf, @NotNull StructType structType, int sparkDefaultParallelism) @@ -85,29 +77,11 @@ private void validateConfiguration(BulkSparkConf conf) } @Override - protected CassandraVersion getBridgeVersion() - { - return getOrCreatePreliminaryGroup(bulkSparkConf()).getBridgeVersion(); - } - - @Override - protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) + protected ClusterInfo buildClusterInfo() { - CassandraClusterInfoGroup group = getOrCreatePreliminaryGroup(bulkSparkConf()); - preliminaryGroup = null; - group.setBridgeVersion(bridgeVersion); - group.startupValidate(); - return group; - } - - private CassandraClusterInfoGroup getOrCreatePreliminaryGroup(BulkSparkConf conf) - { - if (preliminaryGroup == null) - { - preliminaryGroup = CassandraClusterInfoGroup.fromBulkSparkConf(conf); - } - - return preliminaryGroup; + CassandraClusterInfoGroup clusterInfoGroup = CassandraClusterInfoGroup.fromBulkSparkConf(bulkSparkConf()); + clusterInfoGroup.startupValidate(); + return clusterInfoGroup; } 
@Override diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java index f9f36cd6f..57810c7ac 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContextTest.java @@ -23,13 +23,13 @@ import org.junit.jupiter.api.Test; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.spark.sql.types.StructType; import org.jetbrains.annotations.NotNull; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; class AbstractBulkWriterContextTest { @@ -52,18 +52,19 @@ void testContextUsesDeterminedBridgeVersion() // CassandraClusterInfoBridgeVersionTest. BulkSparkConf conf = mock(BulkSparkConf.class); StructType schema = mock(StructType.class); - TestBulkWriterContext context = TestBulkWriterContext.create(conf, schema, 1, CassandraVersion.FIVEZERO); + TestBulkWriterContext context = TestBulkWriterContext.create(conf, schema, 1, "5.0.0"); - assertThat(context.bridgeVersion()).isEqualTo(CassandraVersion.FIVEZERO); + assertThat(context.bridgeVersion()).isEqualTo("5.0.0"); } /** * Concrete test implementation of AbstractBulkWriterContext for testing. The bridge version is supplied - * directly (the real determination is exercised at the ClusterInfo level). + * via the (mock) ClusterInfo returned by {@link #buildClusterInfo()} (the real determination is exercised + * at the ClusterInfo level). 
*/ static class TestBulkWriterContext extends AbstractBulkWriterContext { - private static CassandraVersion staticBridgeVersion; + private static String staticBridgeVersion; private TestBulkWriterContext(@NotNull BulkSparkConf conf, @NotNull StructType structType, @@ -75,22 +76,18 @@ private TestBulkWriterContext(@NotNull BulkSparkConf conf, static TestBulkWriterContext create(@NotNull BulkSparkConf conf, @NotNull StructType structType, int sparkDefaultParallelism, - CassandraVersion bridgeVersion) + String bridgeVersion) { staticBridgeVersion = bridgeVersion; return new TestBulkWriterContext(conf, structType, sparkDefaultParallelism); } @Override - protected CassandraVersion getBridgeVersion() + protected ClusterInfo buildClusterInfo() { - return staticBridgeVersion; - } - - @Override - protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) - { - return mock(ClusterInfo.class); + ClusterInfo clusterInfo = mock(ClusterInfo.class); + when(clusterInfo.getBridgeVersion()).thenReturn(staticBridgeVersion); + return clusterInfo; } @Override diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java index c57a4a265..f0da1fb39 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/BulkWriterConfigExtensibilityTest.java @@ -24,13 +24,11 @@ import org.junit.jupiter.api.Test; import org.apache.cassandra.bridge.CassandraBridge; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.cloudstorage.coordinated.MultiClusterContainer; import org.apache.cassandra.spark.common.stats.JobStatsPublisher; import org.jetbrains.annotations.NotNull; import static 
org.assertj.core.api.Assertions.assertThat; -import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -54,7 +52,7 @@ void testToBulkWriterContextCanBeOverridden() BroadcastableSchemaInfo mockSchemaInfo = mock(BroadcastableSchemaInfo.class); // A custom BulkWriterConfig subclass overriding toBulkWriterContext() - BulkWriterConfig customConfig = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockClusterInfo, mockSchemaInfo, CassandraVersion.FOURZERO) + BulkWriterConfig customConfig = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockClusterInfo, mockSchemaInfo, "4.0.0") { @Override public BulkWriterContext toBulkWriterContext() @@ -75,13 +73,13 @@ void testCustomIBroadcastableClusterInfoReconstructIsCalled() { ClusterInfo expectedCluster = mock(ClusterInfo.class); IBroadcastableClusterInfo mockBroadcastable = mock(IBroadcastableClusterInfo.class); - when(mockBroadcastable.reconstruct(any(CassandraVersion.class))).thenReturn(expectedCluster); + when(mockBroadcastable.reconstruct()).thenReturn(expectedCluster); BulkSparkConf mockConf = mock(BulkSparkConf.class); BroadcastableJobInfo mockJobInfo = mock(BroadcastableJobInfo.class); BroadcastableSchemaInfo mockSchemaInfo = mock(BroadcastableSchemaInfo.class); - BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockBroadcastable, mockSchemaInfo, CassandraVersion.FOURZERO); + BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mockBroadcastable, mockSchemaInfo, "4.0.0"); TestBulkWriterContext context = new TestBulkWriterContext(config); @@ -96,7 +94,7 @@ void testReconstructJobInfoOnExecutorCanBeOverridden() BroadcastableJobInfo mockJobInfo = mock(BroadcastableJobInfo.class); BroadcastableSchemaInfo mockSchemaInfo = mock(BroadcastableSchemaInfo.class); BulkWriterConfig config = new BulkWriterConfig(mockConf, 4, mockJobInfo, mock(IBroadcastableClusterInfo.class), - mockSchemaInfo, CassandraVersion.FOURZERO); 
+ mockSchemaInfo, "4.0.0"); // Subclass that overrides reconstructJobInfoOnExecutor to return custom JobInfo TestBulkWriterContext context = new TestBulkWriterContext(config) @@ -123,13 +121,7 @@ private static class TestBulkWriterContext extends AbstractBulkWriterContext } @Override - protected ClusterInfo buildClusterInfo(CassandraVersion bridgeVersion) - { - throw new UnsupportedOperationException("Driver-only"); - } - - @Override - protected CassandraVersion getBridgeVersion() + protected ClusterInfo buildClusterInfo() { throw new UnsupportedOperationException("Driver-only"); } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java index f7c873d39..608fc9f1f 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoBridgeVersionTest.java @@ -48,7 +48,7 @@ void testOverrideWinsWhenFeatureEnabled() { TestClusterInfo info = new TestClusterInfo(conf(false), "4.0.0", Collections.singleton("big-oa"), "5.0.0"); - assertThat(info.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + assertThat(info.getBridgeVersion()).isEqualTo("4.0.0"); // override short-circuits: neither SSTable versions nor the lowest version are consulted assertThat(info.sstableVersionsCalls).isEqualTo(0); assertThat(info.lowestVersionCalls).isEqualTo(0); @@ -59,7 +59,7 @@ void testOverrideWinsWhenFeatureDisabled() { TestClusterInfo info = new TestClusterInfo(conf(true), "4.0.0", Collections.singleton("big-oa"), "5.0.0"); - assertThat(info.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + assertThat(info.getBridgeVersion()).isEqualTo("4.0.0"); assertThat(info.sstableVersionsCalls).isEqualTo(0); 
assertThat(info.lowestVersionCalls).isEqualTo(0); } @@ -77,7 +77,7 @@ void testSSTableVersionBasedWhenEnabledAndNoOverride() CassandraVersion expected = bti ? CassandraVersion.FIVEZERO : CassandraVersion.FOURZERO; TestClusterInfo info = new TestClusterInfo(conf(false), null, sstableVersions, "5.0.0"); - assertThat(info.getBridgeVersion()).isEqualTo(expected); + assertThat(info.getBridgeVersion()).isEqualTo(expected.versionName() + ".0"); assertThat(info.sstableVersionsCalls).isEqualTo(1); // SSTable-based selection does not consult the legacy lowest version assertThat(info.lowestVersionCalls).isEqualTo(0); @@ -88,7 +88,7 @@ void testLegacyVersionWhenDisabledAndNoOverride() { TestClusterInfo info = new TestClusterInfo(conf(true), null, Collections.singleton("big-oa"), "5.0.0"); - assertThat(info.getBridgeVersion()).isEqualTo(CassandraVersion.FIVEZERO); + assertThat(info.getBridgeVersion()).isEqualTo("5.0.0"); assertThat(info.lowestVersionCalls).isEqualTo(1); // legacy path does not consult SSTable versions assertThat(info.sstableVersionsCalls).isEqualTo(0); @@ -116,8 +116,7 @@ private static final class TestClusterInfo extends CassandraClusterInfo TestClusterInfo(BulkSparkConf conf, String versionFromFeature, Set sstableVersions, String lowestVersion) { - // bridgeVersion arg is unused by getBridgeVersion(); pass any value - super(conf, CassandraVersion.FIVEZERO); + super(conf); this.versionFromFeature = versionFromFeature; this.sstableVersions = sstableVersions; this.lowestVersion = lowestVersion; @@ -143,7 +142,7 @@ public Set getSSTableVersionsOnCluster() } @Override - public String getLowestCassandraVersion() + public String getVersionFromSidecar() { lowestVersionCalls++; return lowestVersion; diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java index 7c848a4f7..5ed4c5221 100644 --- 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/CassandraClusterInfoTest.java @@ -28,7 +28,6 @@ import org.junit.jupiter.api.Test; import o.a.c.sidecar.client.shaded.common.response.TimeSkewResponse; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.token.TokenRangeMapping; import org.apache.cassandra.spark.exception.TimeSkewTooLargeException; @@ -82,7 +81,7 @@ private static class MockClusterInfoForTimeSkew extends CassandraClusterInfo MockClusterInfoForTimeSkew(int allowanceMinutes, Instant remoteNow) { - super((BulkSparkConf) null, CassandraVersion.FIVEZERO); + super((BulkSparkConf) null); mockCassandraContext(allowanceMinutes, remoteNow); } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java index 1b4a70bf6..932a3e92a 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/MockBulkWriterContext.java @@ -378,7 +378,8 @@ public void checkBulkWriterIsEnabledOrThrow() BulkFeatures.BULK_WRITER)); } - public String getLowestCassandraVersion() + @Override + public String getBridgeVersion() { return cassandraVersion; } @@ -391,13 +392,6 @@ public Set getSSTableVersionsOnCluster() return new HashSet<>(version.getNativeSStableVersions()); } - @Override - public CassandraVersion bridgeVersion() - { - return CassandraVersion.fromVersion(cassandraVersion) - .orElse(CassandraVersion.FIVEZERO); - } - private List buildCompleteBatchIds(List uuids) { return uuids.stream().map(uuid -> uuid + "-" + jobId).collect(Collectors.toList()); diff --git 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java index 3401ae85d..ea3ffb9ba 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java @@ -340,7 +340,8 @@ public TableSchema build() dataFrameSchema = dataFrameSchema.add(timestampOption.columnName(), DataTypes.LongType); updatedCqlColumns = addColumnToCqlColumns(updatedCqlColumns, timestampOption.columnName(), SqlToCqlTypeConverter.BIGINT); } - CassandraVersion bridgeVersion = CassandraVersion.fromVersion(cassandraVersion) + // Validate the configured version is supported; TableSchema itself takes the version string. + CassandraVersion.fromVersion(cassandraVersion) .orElseThrow(() -> new IllegalArgumentException("Unsupported Cassandra version: " + cassandraVersion)); MockTableInfoProvider tableInfoProvider = new MockTableInfoProvider(bridge, @@ -356,7 +357,7 @@ public TableSchema build() writeMode, ttlOption, timestampOption, - bridgeVersion, + cassandraVersion, quoteIdentifiers, skipSecondaryIndexCheck); } diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java index 04716c50e..af10673a8 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/coordinated/CassandraClusterInfoGroupTest.java @@ -34,7 +34,6 @@ import org.junit.jupiter.api.Test; import 
o.a.c.sidecar.client.shaded.common.response.TokenRangeReplicasResponse; -import org.apache.cassandra.bridge.CassandraVersion; import org.apache.cassandra.spark.bulkwriter.BroadcastableClusterInfoGroup; import org.apache.cassandra.spark.bulkwriter.BulkSparkConf; import org.apache.cassandra.spark.bulkwriter.CassandraClusterInfo; @@ -252,6 +251,7 @@ void testSerDeser() CassandraClusterInfoGroup originalGroup = mockClusterGroup(2, index -> { CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); when(clusterInfo.getPartitioner()).thenReturn(Partitioner.Murmur3Partitioner); + when(clusterInfo.getBridgeVersion()).thenReturn("5.0.0"); return clusterInfo; }); @@ -274,6 +274,10 @@ void testSerDeser() .describedAs("Partitioner should be preserved after serialization") .isEqualTo(Partitioner.Murmur3Partitioner); + assertThat(deserializedBroadcastable.getBridgeVersion()) + .describedAs("Bridge version should be preserved after serialization") + .isEqualTo("5.0.0"); + assertThat(deserializedBroadcastable.size()) .describedAs("Number of clusters should be preserved after serialization") .isEqualTo(2); @@ -293,10 +297,10 @@ void testGetBridgeVersionSingleClusterDelegates() { CassandraClusterInfoGroup group = mockClusterGroup(1, index -> { CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); - when(clusterInfo.getBridgeVersion()).thenReturn(CassandraVersion.FIVEZERO); + when(clusterInfo.getBridgeVersion()).thenReturn("5.0.0"); return clusterInfo; }); - assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FIVEZERO); + assertThat(group.getBridgeVersion()).isEqualTo("5.0.0"); } @Test @@ -305,10 +309,10 @@ void testGetBridgeVersionMultiClusterSameVersionReturnsIt() // All clusters resolve to the same version -> compatible; the (equal) lowest is returned CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); - 
when(clusterInfo.getBridgeVersion()).thenReturn(CassandraVersion.FOURZERO); + when(clusterInfo.getBridgeVersion()).thenReturn("4.0.0"); return clusterInfo; }); - assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + assertThat(group.getBridgeVersion()).isEqualTo("4.0.0"); } @Test @@ -317,10 +321,10 @@ void testGetBridgeVersionMultiClusterAcrossMajorsReturnsLowest() // 4.0 and 5.0 clusters -> write at the lowest (4.0) so the produced SSTables are importable by both CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); - when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? CassandraVersion.FOURZERO : CassandraVersion.FIVEZERO); + when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? "4.0.0" : "5.0.0"); return clusterInfo; }); - assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + assertThat(group.getBridgeVersion()).isEqualTo("4.0.0"); } @Test @@ -329,10 +333,10 @@ void testGetBridgeVersionFourZeroAndFourOneReturnsLowest() // 4.0 and 4.1 clusters -> write at the lowest (4.0) CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); - when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? CassandraVersion.FOURZERO : CassandraVersion.FOURONE); + when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? 
"4.0.0" : "4.1.0"); return clusterInfo; }); - assertThat(group.getBridgeVersion()).isEqualTo(CassandraVersion.FOURZERO); + assertThat(group.getBridgeVersion()).isEqualTo("4.0.0"); } @Test @@ -341,7 +345,7 @@ void testGetBridgeVersionIncompatibleMajorsThrows() // 3.0 and 5.0 clusters -> SSTables written at 3.0 cannot be imported by 5.0, so the coordinated write fails CassandraClusterInfoGroup group = mockClusterGroup(2, index -> { CassandraClusterInfo clusterInfo = mockClusterInfo("cluster" + index); - when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? CassandraVersion.THREEZERO : CassandraVersion.FIVEZERO); + when(clusterInfo.getBridgeVersion()).thenReturn(index == 0 ? "3.0.0" : "5.0.0"); return clusterInfo; }); assertThatThrownBy(group::getBridgeVersion)