From 32f2c4584e2185faac68048db435c35b68e3efae Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Mon, 30 Mar 2026 20:40:16 +0100 Subject: [PATCH 1/6] draft sai support --- .../common/data/SSTableImportOptions.java | 26 ++ .../common/request/ImportSSTableRequest.java | 36 +++ .../CreateRestoreJobRequestPayloadTest.java | 2 + cassandra-analytics-cdc/build.gradle | 6 + cassandra-analytics-cdc/build.gradle.rej | 20 ++ .../cassandra/spark/utils/CqlUtils.java | 32 ++- cassandra-analytics-core/build.gradle | 11 + cassandra-analytics-core/build.gradle.rej | 24 ++ .../bulkwriter/AbstractBulkWriterContext.java | 20 +- .../bulkwriter/BroadcastableSchemaInfo.java | 18 +- .../CassandraDirectDataTransportContext.java | 5 +- .../spark/bulkwriter/CassandraSchemaInfo.java | 21 +- .../bulkwriter/SSTableWriterFactory.java | 19 ++ .../spark/bulkwriter/SchemaInfo.java | 12 + .../bulkwriter/SidecarDataTransferApi.java | 16 ++ .../spark/bulkwriter/SortedSSTableWriter.java | 1 + .../spark/bulkwriter/TableSchema.java | 94 +++++-- .../cassandra/bridge/CassandraBridge.java | 27 ++ .../bridge/CassandraBridgeImplementation.java | 13 + .../bridge/SSTableWriterImplementation.java | 233 ++++++++++++++++++ 20 files changed, 608 insertions(+), 28 deletions(-) create mode 100644 cassandra-analytics-cdc/build.gradle.rej create mode 100644 cassandra-analytics-core/build.gradle.rej create mode 100644 cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java diff --git a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java index 60b6a492f..d5a584662 100644 --- a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java +++ 
b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java @@ -33,6 +33,8 @@ public class SSTableImportOptions extends HashMap private static final String INVALIDATE_CACHES = "invalidateCaches"; private static final String EXTENDED_VERIFY = "extendedVerify"; private static final String COPY_DATA = "copyData"; + private static final String VALIDATE_SAI_INDEXES = "validateSaiIndexes"; + private static final String SAI_INDEX_CHECKSUM = "saiIndexChecksum"; public static SSTableImportOptions defaults() { @@ -48,6 +50,8 @@ private SSTableImportOptions() put(INVALIDATE_CACHES, Boolean.toString(true)); put(EXTENDED_VERIFY, Boolean.toString(true)); put(COPY_DATA, Boolean.toString(false)); // note: the default is false + put(VALIDATE_SAI_INDEXES, Boolean.toString(false)); // note: the default is false + put(SAI_INDEX_CHECKSUM, Boolean.toString(false)); // note: the default is false } public SSTableImportOptions resetLevel(boolean enabled) @@ -126,4 +130,26 @@ public boolean copyData() { return Boolean.parseBoolean(get(COPY_DATA)); } + + public SSTableImportOptions validateSaiIndexes(boolean enabled) + { + put(VALIDATE_SAI_INDEXES, Boolean.toString(enabled)); + return this; + } + + public boolean validateSaiIndexes() + { + return Boolean.parseBoolean(get(VALIDATE_SAI_INDEXES)); + } + + public SSTableImportOptions saiIndexChecksum(boolean enabled) + { + put(SAI_INDEX_CHECKSUM, Boolean.toString(enabled)); + return this; + } + + public boolean saiIndexChecksum() + { + return Boolean.parseBoolean(get(SAI_INDEX_CHECKSUM)); + } } diff --git a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java index 7633b9440..9c14f6a2c 100644 --- a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java 
+++ b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java @@ -65,6 +65,8 @@ public static class ImportOptions private Boolean invalidateCaches; private Boolean extendedVerify; private Boolean copyData; + private Boolean validateSaiIndexes; + private Boolean saiIndexChecksum; public ImportOptions() { @@ -153,6 +155,32 @@ public ImportOptions copyData(boolean copyData) this.copyData = copyData; return this; } + + /** + * Sets the {@code validateSaiIndexes} and returns a reference to this ImportOptions enabling method chaining. + * When enabled, Cassandra validates SAI index components during import. + * + * @param validateSaiIndexes the {@code validateSaiIndexes} to set + * @return a reference to this ImportOptions + */ + public ImportOptions validateSaiIndexes(boolean validateSaiIndexes) + { + this.validateSaiIndexes = validateSaiIndexes; + return this; + } + + /** + * Sets the {@code saiIndexChecksum} and returns a reference to this ImportOptions enabling method chaining. + * When enabled, Cassandra verifies SAI index component checksums during import. 
+ * + * @param saiIndexChecksum the {@code saiIndexChecksum} to set + * @return a reference to this ImportOptions + */ + public ImportOptions saiIndexChecksum(boolean saiIndexChecksum) + { + this.saiIndexChecksum = saiIndexChecksum; + return this; + } } static String requestURI(String keyspace, String tableName, String uploadId, ImportOptions importOptions) @@ -205,6 +233,14 @@ private static List selectedOptions(ImportOptions importOptions) { options.add("copyData=" + importOptions.copyData); } + if (importOptions.validateSaiIndexes != null) + { + options.add("validateSaiIndexes=" + importOptions.validateSaiIndexes); + } + if (importOptions.saiIndexChecksum != null) + { + options.add("saiIndexChecksum=" + importOptions.saiIndexChecksum); + } return options; } diff --git a/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java b/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java index 607d10019..bc8e925b9 100644 --- a/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java +++ b/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java @@ -64,8 +64,10 @@ void testSerDeser() throws JsonProcessingException "\"jobAgent\":\"agent\"," + "\"secrets\":" + MAPPER.writeValueAsString(secrets) + "," + "\"importOptions\":{" + + "\"validateSaiIndexes\":\"false\"," + "\"verifyTokens\":\"true\"," + "\"resetLevel\":\"true\"," + + "\"saiIndexChecksum\":\"false\"," + "\"clearRepaired\":\"true\"," + "\"extendedVerify\":\"true\"," + "\"verifySSTables\":\"true\"," + diff --git a/cassandra-analytics-cdc/build.gradle b/cassandra-analytics-cdc/build.gradle index 38c284065..0b5d7f780 100644 --- a/cassandra-analytics-cdc/build.gradle +++ b/cassandra-analytics-cdc/build.gradle @@ -144,9 +144,15 @@ 
jar { test { systemProperty "cassandra.sidecar.versions_to_test", (System.getenv("CASSANDRA_VERSION") ?: "4.0.17,5.0.5") +<<<<<<< Updated upstream minHeapSize = '1024m' maxHeapSize = '3072m' maxParallelForks = Math.max(Runtime.runtime.availableProcessors() * 2, 8) +======= + minHeapSize = System.getenv('CDC_TEST_MIN_HEAP_SIZE') ?: '512m' + maxHeapSize = System.getenv('CDC_TEST_MAX_HEAP_SIZE') ?: '2048m' + maxParallelForks = System.getenv('CDC_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 +>>>>>>> Stashed changes forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations // Make it so unit tests run on a Jar with Cassandra bridge implementations built in diff --git a/cassandra-analytics-cdc/build.gradle.rej b/cassandra-analytics-cdc/build.gradle.rej new file mode 100644 index 000000000..ffab6d207 --- /dev/null +++ b/cassandra-analytics-cdc/build.gradle.rej @@ -0,0 +1,20 @@ +diff a/cassandra-analytics-cdc/build.gradle b/cassandra-analytics-cdc/build.gradle (rejected hunks) +@@ -141,15 +141,12 @@ jar { + } + } + +-def cdcMaxHeapSize = System.getenv("CDC_MAX_HEAP_SIZE") ?: "2048m" +-def cdcMaxParallelForks = (System.getenv("CDC_MAX_PARALLEL_FORKS") ?: "1") as int +- + test { + systemProperty "cassandra.sidecar.versions_to_test", (System.getenv("CASSANDRA_VERSION") ?: "4.0.17,5.0.5") + +- minHeapSize = '512m' +- maxHeapSize = cdcMaxHeapSize +- maxParallelForks = cdcMaxParallelForks ++ minHeapSize = System.getenv('CDC_TEST_MIN_HEAP_SIZE') ?: '512m' ++ maxHeapSize = System.getenv('CDC_TEST_MAX_HEAP_SIZE') ?: '2048m' ++ maxParallelForks = System.getenv('CDC_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 + forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations + + // Make it so unit tests run on a Jar with Cassandra bridge implementations built in diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java 
b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java index 9d989be51..f7dcffc86 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java @@ -259,16 +259,42 @@ public static Set extractUdts(@NotNull String schemaStr, @NotNull String } public static int extractIndexCount(@NotNull String schemaStr, @NotNull String keyspace, @NotNull String table) + { + return extractIndexStatements(schemaStr, keyspace, table).size(); + } + + /** + * Extracts CREATE INDEX statements for the given table from the schema string. + * + * @param schemaStr full cluster schema text + * @param keyspace the keyspace name + * @param table the table name + * @return set of CREATE INDEX statements for the table + */ + public static Set extractIndexStatements(@NotNull String schemaStr, + @NotNull String keyspace, + @NotNull String table) { String cleaned = cleanCql(schemaStr); Pattern pattern = Pattern.compile(String.format("CREATE (CUSTOM )?INDEX \"?[^ ]* ON ?\"?%s?\"?\\.{1}\"?%s\"?[^;]*;", keyspace, table)); Matcher matcher = pattern.matcher(cleaned); - int indexCount = 0; + Set statements = new HashSet<>(); while (matcher.find()) { - indexCount++; + statements.add(matcher.group()); } - return indexCount; + return statements; + } + + /** + * Returns true if the given CREATE INDEX statement defines a Storage Attached Index (SAI). 
+ * + * @param createIndexStatement a CREATE INDEX CQL statement + * @return true if the index uses SAI + */ + public static boolean isSaiIndex(@NotNull String createIndexStatement) + { + return createIndexStatement.toUpperCase(java.util.Locale.ROOT).contains("USING 'STORAGEATTACHEDINDEX'"); } /** diff --git a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle index ffce66600..4f77838ef 100644 --- a/cassandra-analytics-core/build.gradle +++ b/cassandra-analytics-core/build.gradle @@ -181,8 +181,13 @@ tasks.register('testSequential', Test) { } systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") +<<<<<<< Updated upstream minHeapSize = '1024m' maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '3072m' +======= + minHeapSize = System.getenv('CORE_SEQ_TEST_MIN_HEAP_SIZE') ?: '512m' + maxHeapSize = System.getenv('CORE_SEQ_TEST_MAX_HEAP_SIZE') ?: '4096m' +>>>>>>> Stashed changes maxParallelForks = 1 forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations @@ -216,10 +221,16 @@ tasks.register('testSequential', Test) { test { systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") +<<<<<<< Updated upstream minHeapSize = '1024m' maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '3072m' maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: Math.max(Runtime.runtime.availableProcessors() * 2, 8) +======= + minHeapSize = System.getenv('CORE_TEST_MIN_HEAP_SIZE') ?: '512m' + maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '2048m' + maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 +>>>>>>> Stashed changes forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations // Make it so unit tests run on a Jar with Cassandra bridge implementations built in 
diff --git a/cassandra-analytics-core/build.gradle.rej b/cassandra-analytics-core/build.gradle.rej new file mode 100644 index 000000000..8ef6088b6 --- /dev/null +++ b/cassandra-analytics-core/build.gradle.rej @@ -0,0 +1,24 @@ +diff a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle (rejected hunks) +@@ -181,8 +181,8 @@ tasks.register('testSequential', Test) { + } + + systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") +- minHeapSize = '512m' +- maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '2048m' ++ minHeapSize = System.getenv('CORE_SEQ_TEST_MIN_HEAP_SIZE') ?: '512m' ++ maxHeapSize = System.getenv('CORE_SEQ_TEST_MAX_HEAP_SIZE') ?: '4096m' + maxParallelForks = 1 + forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations + +@@ -216,8 +216,9 @@ tasks.register('testSequential', Test) { + + test { + systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") +- minHeapSize = '512m' +- maxHeapSize = '2048m' ++ minHeapSize = System.getenv('CORE_TEST_MIN_HEAP_SIZE') ?: '512m' ++ maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '2048m' ++ maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 + forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations + + // Make it so unit tests run on a Jar with Cassandra bridge implementations built in diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java index b6eff3c80..e69c255d3 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java +++ 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java @@ -244,12 +244,13 @@ protected SchemaInfo buildSchemaInfo(StructType structType) String createTableSchema = CqlUtils.extractTableSchema(keyspaceSchema, keyspace, table); Set udts = CqlUtils.extractUdts(keyspaceSchema, keyspace); ReplicationFactor replicationFactor = CqlUtils.extractReplicationFactor(keyspaceSchema, keyspace); - int indexCount = CqlUtils.extractIndexCount(keyspaceSchema, keyspace, table); + Set indexStatements = CqlUtils.extractIndexStatements(keyspaceSchema, keyspace, table); + int indexCount = indexStatements.size(); CqlTable cqlTable = bridge().buildSchema(createTableSchema, keyspace, replicationFactor, partitioner, udts, null, indexCount, false); TableInfoProvider tableInfoProvider = new CqlTableInfoProvider(createTableSchema, cqlTable); - TableSchema tableSchema = initializeTableSchema(bulkSparkConf(), structType, tableInfoProvider, lowestCassandraVersion()); - return new CassandraSchemaInfo(tableSchema, udts); + TableSchema tableSchema = initializeTableSchema(bulkSparkConf(), structType, tableInfoProvider, lowestCassandraVersion(), indexStatements); + return new CassandraSchemaInfo(tableSchema, udts, indexStatements); } /*-------------------------------------------*/ @@ -314,6 +315,16 @@ protected TableSchema initializeTableSchema(@NotNull BulkSparkConf conf, @NotNull StructType dfSchema, TableInfoProvider tableInfoProvider, String lowestCassandraVersion) + { + return initializeTableSchema(conf, dfSchema, tableInfoProvider, lowestCassandraVersion, java.util.Collections.emptySet()); + } + + @NotNull + protected TableSchema initializeTableSchema(@NotNull BulkSparkConf conf, + @NotNull StructType dfSchema, + TableInfoProvider tableInfoProvider, + String lowestCassandraVersion, + Set indexStatements) { return new TableSchema(dfSchema, tableInfoProvider, @@ -322,7 +333,8 @@ protected TableSchema initializeTableSchema(@NotNull BulkSparkConf 
conf, conf.getTimestampOptions(), lowestCassandraVersion, job().qualifiedTableName().quoteIdentifiers(), - conf.skipSecondaryIndexCheck); + conf.skipSecondaryIndexCheck, + indexStatements); } @NotNull diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java index 9c0a164b6..cd2bec78d 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java @@ -27,7 +27,7 @@ /** * Broadcastable wrapper for schema information with ZERO transient fields to optimize Spark broadcasting. *

- * Contains BroadcastableTableSchema (pre-computed schema data) and UDT statements. + * Contains BroadcastableTableSchema (pre-computed schema data), UDT statements, and index statements. * Executors reconstruct CassandraSchemaInfo and TableSchema from these fields. *

* Why ZERO transient fields matters:
@@ -43,11 +43,12 @@ */ public final class BroadcastableSchemaInfo implements Serializable { - private static final long serialVersionUID = -8727074052066841748L; + private static final long serialVersionUID = 4719283056718294301L; // Essential fields broadcast to executors private final BroadcastableTableSchema broadcastableTableSchema; private final Set userDefinedTypeStatements; + private final Set indexStatements; /** * Creates a BroadcastableSchemaInfo from a source SchemaInfo. @@ -59,15 +60,18 @@ public static BroadcastableSchemaInfo from(@NotNull SchemaInfo source) { return new BroadcastableSchemaInfo( BroadcastableTableSchema.from(source.getTableSchema()), - source.getUserDefinedTypeStatements() + source.getUserDefinedTypeStatements(), + source.getIndexStatements() ); } private BroadcastableSchemaInfo(BroadcastableTableSchema broadcastableTableSchema, - Set userDefinedTypeStatements) + Set userDefinedTypeStatements, + Set indexStatements) { this.broadcastableTableSchema = broadcastableTableSchema; this.userDefinedTypeStatements = userDefinedTypeStatements; + this.indexStatements = indexStatements; } public BroadcastableTableSchema getBroadcastableTableSchema() @@ -80,4 +84,10 @@ public Set getUserDefinedTypeStatements() { return userDefinedTypeStatements; } + + @NotNull + public Set getIndexStatements() + { + return indexStatements; + } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java index bed36ac4b..e1ea2b599 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java @@ -36,12 +36,15 @@ public class CassandraDirectDataTransportContext implements 
TransportContext.Dir @NotNull private final ClusterInfo clusterInfo; @NotNull + private final SchemaInfo schemaInfo; + @NotNull private final DirectDataTransferApi dataTransferApi; public CassandraDirectDataTransportContext(@NotNull BulkWriterContext bulkWriterContext) { this.jobInfo = bulkWriterContext.job(); this.clusterInfo = bulkWriterContext.cluster(); + this.schemaInfo = bulkWriterContext.schema(); this.dataTransferApi = createDirectDataTransferApi(); } @@ -72,6 +75,6 @@ public DirectDataTransferApi dataTransferApi() protected DirectDataTransferApi createDirectDataTransferApi() { CassandraBridge bridge = CassandraBridgeFactory.get(clusterInfo.getLowestCassandraVersion()); - return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo); + return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo, schemaInfo.getIndexStatements()); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java index db98cf710..9d287c8c3 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java @@ -19,17 +19,27 @@ package org.apache.cassandra.spark.bulkwriter; +import java.util.Collections; import java.util.Set; +import org.jetbrains.annotations.NotNull; + public class CassandraSchemaInfo implements SchemaInfo { private final TableSchema tableSchema; private final Set userDefinedTypeStatements; + private final Set indexStatements; public CassandraSchemaInfo(TableSchema tableSchema, Set userDefinedTypeStatements) + { + this(tableSchema, userDefinedTypeStatements, Collections.emptySet()); + } + + public CassandraSchemaInfo(TableSchema tableSchema, Set userDefinedTypeStatements, Set indexStatements) { 
this.tableSchema = tableSchema; this.userDefinedTypeStatements = userDefinedTypeStatements; + this.indexStatements = indexStatements; } /** @@ -41,7 +51,8 @@ public CassandraSchemaInfo(TableSchema tableSchema, Set userDefinedTypeS public CassandraSchemaInfo(BroadcastableSchemaInfo broadcastable) { this(new TableSchema(broadcastable.getBroadcastableTableSchema()), - broadcastable.getUserDefinedTypeStatements()); + broadcastable.getUserDefinedTypeStatements(), + broadcastable.getIndexStatements()); } @Override @@ -51,8 +62,16 @@ public TableSchema getTableSchema() } @Override + @NotNull public Set getUserDefinedTypeStatements() { return userDefinedTypeStatements; } + + @Override + @NotNull + public Set getIndexStatements() + { + return indexStatements; + } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java index 77b8f5f42..d84e08b27 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java @@ -49,4 +49,23 @@ public static SSTableWriter getSSTableWriter(CassandraVersionFeatures serverVers userDefinedTypeStatements, bufferSizeMB); } + + public static SSTableWriter getSSTableWriter(CassandraVersionFeatures serverVersion, + String inDirectory, + String partitioner, + String createStatement, + String insertStatement, + Set userDefinedTypeStatements, + Set indexCreateStatements, + int bufferSizeMB) + { + CassandraBridge cassandraBridge = CassandraBridgeFactory.get(serverVersion); + return cassandraBridge.getSSTableWriter(inDirectory, + partitioner, + createStatement, + insertStatement, + userDefinedTypeStatements, + indexCreateStatements, + bufferSizeMB); + } } diff --git 
a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java index b21d5fd44..c5e149dd3 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java @@ -19,6 +19,7 @@ package org.apache.cassandra.spark.bulkwriter; +import java.util.Collections; import java.util.Set; import org.jetbrains.annotations.NotNull; @@ -36,4 +37,15 @@ public interface SchemaInfo @NotNull Set getUserDefinedTypeStatements(); + + /** + * Returns the set of CREATE INDEX statements for the table, if any. + * + * @return set of CREATE INDEX CQL statements, empty if none + */ + @NotNull + default Set getIndexStatements() + { + return Collections.emptySet(); + } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java index 9199b01ca..0f4358c23 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java @@ -22,6 +22,7 @@ import java.nio.file.Path; import java.util.Collections; import java.util.List; +import java.util.Set; import java.util.concurrent.ExecutionException; import org.slf4j.Logger; @@ -36,6 +37,7 @@ import org.apache.cassandra.spark.common.model.CassandraInstance; import org.apache.cassandra.spark.data.QualifiedTableName; import org.apache.cassandra.spark.exception.SidecarApiCallException; +import org.apache.cassandra.spark.utils.CqlUtils; import static org.apache.cassandra.bridge.CassandraBridgeFactory.maybeQuotedIdentifier; @@ -52,13 +54,21 @@ public class SidecarDataTransferApi 
implements DirectDataTransferApi private final CassandraBridge bridge; private final int sidecarPort; private final JobInfo job; + private final boolean hasSaiIndexes; public SidecarDataTransferApi(CassandraContext cassandraContext, CassandraBridge bridge, JobInfo job) + { + this(cassandraContext, bridge, job, Collections.emptySet()); + } + + public SidecarDataTransferApi(CassandraContext cassandraContext, CassandraBridge bridge, JobInfo job, + Set indexStatements) { this.sidecarClient = cassandraContext.getSidecarClient(); this.sidecarPort = cassandraContext.sidecarPort(); this.bridge = bridge; this.job = job; + this.hasSaiIndexes = !indexStatements.isEmpty() && indexStatements.stream().allMatch(CqlUtils::isSaiIndex); } @Override @@ -107,6 +117,12 @@ public RemoteCommitResult commitSSTables(CassandraInstance instance, // Always verify SSTables on import importOptions.verifySSTables(true).extendedVerify(!job.skipExtendedVerify()); + // When SAI index components were generated alongside SSTables, enable SAI validation on import + if (hasSaiIndexes) + { + importOptions.validateSaiIndexes(true).saiIndexChecksum(true); + } + try { QualifiedTableName qt = job.qualifiedTableName(); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java index 3d09be476..c01718b26 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java @@ -131,6 +131,7 @@ public SortedSSTableWriter(BulkWriterContext writerContext, Path outDir, DigestA tableSchema.createStatement, tableSchema.modificationStatement, schema.getUserDefinedTypeStatements(), + schema.getIndexStatements(), writerContext.job().sstableDataSizeInMiB()); } diff --git 
a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java index b198af984..98c7c1996 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java @@ -27,6 +27,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,6 +37,7 @@ import org.apache.cassandra.spark.common.schema.ColumnType; import org.apache.cassandra.spark.data.CqlField; import org.apache.cassandra.spark.exception.UnsupportedAnalyticsOperationException; +import org.apache.cassandra.spark.utils.CqlUtils; import org.apache.spark.sql.types.StructType; import static org.apache.cassandra.bridge.CassandraBridgeFactory.maybeQuotedIdentifier; @@ -71,6 +73,20 @@ public TableSchema(StructType dfSchema, String lowestCassandraVersion, boolean quoteIdentifiers, boolean skipSecondaryIndexCheck) + { + this(dfSchema, tableInfo, writeMode, ttlOption, timestampOption, + lowestCassandraVersion, quoteIdentifiers, skipSecondaryIndexCheck, Collections.emptySet()); + } + + public TableSchema(StructType dfSchema, + TableInfoProvider tableInfo, + WriteMode writeMode, + TTLOption ttlOption, + TimestampOption timestampOption, + String lowestCassandraVersion, + boolean quoteIdentifiers, + boolean skipSecondaryIndexCheck, + Set indexStatements) { this.writeMode = writeMode; this.ttlOption = ttlOption; @@ -80,21 +96,7 @@ public TableSchema(StructType dfSchema, this.quoteIdentifiers = quoteIdentifiers; validateDataFrameCompatibility(dfSchema, tableInfo); - // If a table has indexes on it, some external process (application, DB, etc.) 
is responsible for rebuilding - // indexes on the table after the bulk write completes; cassandra does this as part of the SSTable import - // process today. 2i and SAI have different ergonomics here regarding if stale data is served during index build; - // ultimately we want the bulk writer to also write native SAI index files alongside sstables but until - // then, this is allowable and fine for users who Know What They're Doing. - if (!skipSecondaryIndexCheck) - { - validateNoSecondaryIndexes(tableInfo); - } - else if (tableInfo.hasSecondaryIndex()) - { - LOGGER.warn("Bulk writing to tables with SecondaryIndexes will have an asynchronous index rebuild " - + "take place automatically after writing. Reads against the index during this time " - + "window will produce inconsistent or stale results until index rebuild is complete."); - } + validateSecondaryIndexes(tableInfo, skipSecondaryIndexCheck, indexStatements, lowestCassandraVersion); validateUserAddedColumns(lowestCassandraVersion, quoteIdentifiers, ttlOption, timestampOption); this.createStatement = getCreateStatement(tableInfo); @@ -308,6 +310,50 @@ private static void validateDataframeFieldsInTable(TableInfoProvider tableInfo, Preconditions.checkArgument(unknownFields.isEmpty(), "Unknown fields in data frame => " + unknownFields); } + /** + * Validates secondary index constraints for bulk write operations. + *

+ * When the cluster is Cassandra 5.0+ and ALL indexes are SAI, the write is allowed because + * SAI index components are generated alongside SSTables and are immediately queryable after import. + *

+ * When any index is non-SAI (legacy 2i), the write is blocked unless SKIP_SECONDARY_INDEX_CHECK is set. + * + * @param tableInfo the table info provider + * @param skipSecondaryIndexCheck whether the user explicitly opted out of the check + * @param indexStatements the CREATE INDEX statements for the table + * @param lowestCassandraVersion the lowest Cassandra version in the cluster + */ + static void validateSecondaryIndexes(TableInfoProvider tableInfo, + boolean skipSecondaryIndexCheck, + Set indexStatements, + String lowestCassandraVersion) + { + if (!tableInfo.hasSecondaryIndex()) + { + return; // No indexes — nothing to validate + } + + boolean allSai = !indexStatements.isEmpty() && indexStatements.stream().allMatch(CqlUtils::isSaiIndex); + boolean isCassandra5OrLater = isCassandra5OrLater(lowestCassandraVersion); + + if (allSai && isCassandra5OrLater) + { + LOGGER.info("Table has SAI indexes only on Cassandra 5.0+. SAI index components will be generated " + + "alongside SSTables for immediate queryability after import. indexCount={}", indexStatements.size()); + return; + } + + if (skipSecondaryIndexCheck) + { + LOGGER.warn("Bulk writing to tables with SecondaryIndexes will have an asynchronous index rebuild " + + "take place automatically after writing. 
Reads against the index during this time " + + "window will produce inconsistent or stale results until index rebuild is complete."); + return; + } + + throw new UnsupportedAnalyticsOperationException("Bulkwriter doesn't support secondary indexes"); + } + static void validateNoSecondaryIndexes(TableInfoProvider tableInfo) { if (tableInfo.hasSecondaryIndex()) @@ -316,6 +362,24 @@ static void validateNoSecondaryIndexes(TableInfoProvider tableInfo) } } + @VisibleForTesting + static boolean isCassandra5OrLater(String version) + { + if (version == null || version.isEmpty()) + { + return false; + } + try + { + int majorVersion = Integer.parseInt(version.split("\\.")[0]); + return majorVersion >= 5; + } + catch (NumberFormatException exception) + { + return false; + } + } + private static List getKeyFieldPositions(StructType dfSchema, List columnNames, List keyFieldNames) diff --git a/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java b/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java index 6bbf7f142..43f0c8cb2 100644 --- a/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java +++ b/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java @@ -400,6 +400,33 @@ public abstract SSTableWriter getSSTableWriter(String inDirectory, Set userDefinedTypeStatements, int bufferSizeMB); + /** + * Creates an SSTableWriter with optional index generation support. + * The default implementation ignores index statements for backward compatibility with Cassandra 4.0. + * Cassandra 5.0+ bridges override this to generate SAI index components alongside SSTables. 
+ * + * @param inDirectory output directory for SSTables + * @param partitioner the partitioner name + * @param createStatement CREATE TABLE CQL statement + * @param insertStatement INSERT CQL statement + * @param userDefinedTypeStatements set of CREATE TYPE statements + * @param indexCreateStatements set of CREATE INDEX statements + * @param bufferSizeMB max SSTable size in MiB + * @return a new SSTableWriter instance + */ + public SSTableWriter getSSTableWriter(String inDirectory, + String partitioner, + String createStatement, + String insertStatement, + Set userDefinedTypeStatements, + Set indexCreateStatements, + int bufferSizeMB) + { + // Default: ignore index statements (C* 4.0 does not support inline SAI generation) + return getSSTableWriter(inDirectory, partitioner, createStatement, insertStatement, + userDefinedTypeStatements, bufferSizeMB); + } + public abstract SSTableSummary getSSTableSummary(@NotNull String keyspace, @NotNull String table, @NotNull SSTable ssTable); diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index 88d5c6df0..299c6efba 100644 --- a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -564,6 +564,19 @@ public SSTableWriter getSSTableWriter(String inDirectory, userDefinedTypeStatements, bufferSizeMB); } + @Override + public SSTableWriter getSSTableWriter(String inDirectory, + String partitioner, + String createStatement, + String insertStatement, + Set userDefinedTypeStatements, + Set indexCreateStatements, + int bufferSizeMB) + { + return new SSTableWriterImplementation(inDirectory, partitioner, createStatement, insertStatement, + userDefinedTypeStatements, indexCreateStatements, bufferSizeMB); + } 
+ @Override public SSTableSummary getSSTableSummary(@NotNull String keyspace, @NotNull String table, diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java new file mode 100644 index 000000000..90488fe78 --- /dev/null +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.bridge; + +import java.io.IOException; +import java.lang.reflect.Method; +import java.util.Collection; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +import javax.annotation.concurrent.NotThreadSafe; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.Config; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.RandomPartitioner; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.io.sstable.CQLSSTableWriter; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.jetbrains.annotations.NotNull; + +/** + * Cassandra 5.0 SSTableWriter implementation with SAI index generation support. + *

+ * When provided with CREATE INDEX statements, this writer attempts to configure the + * CQLSSTableWriter to generate SAI index component files alongside SSTables. + * This requires Cassandra 5.0+ with SAI support in CQLSSTableWriter. + */ +@NotThreadSafe +public class SSTableWriterImplementation implements SSTableWriter +{ + private static final Logger LOGGER = LoggerFactory.getLogger(SSTableWriterImplementation.class); + + static + { + Config.setClientMode(true); + } + + private final CQLSSTableWriter writer; + private Consumer> producedSSTablesListener; + + public SSTableWriterImplementation(String inDirectory, + String partitioner, + String createStatement, + String insertStatement, + @NotNull Set userDefinedTypeStatements, + int bufferSizeMB) + { + this(inDirectory, determineSupportedPartitioner(partitioner), createStatement, insertStatement, + userDefinedTypeStatements, java.util.Collections.emptySet(), bufferSizeMB); + } + + public SSTableWriterImplementation(String inDirectory, + String partitioner, + String createStatement, + String insertStatement, + @NotNull Set userDefinedTypeStatements, + @NotNull Set indexCreateStatements, + int bufferSizeMB) + { + this(inDirectory, determineSupportedPartitioner(partitioner), createStatement, insertStatement, + userDefinedTypeStatements, indexCreateStatements, bufferSizeMB); + } + + @VisibleForTesting + public SSTableWriterImplementation(String inDirectory, + IPartitioner partitioner, + String createStatement, + String insertStatement, + @NotNull Set userDefinedTypeStatements, + int bufferSizeMB) + { + this(inDirectory, partitioner, createStatement, insertStatement, + userDefinedTypeStatements, java.util.Collections.emptySet(), bufferSizeMB); + } + + @VisibleForTesting + public SSTableWriterImplementation(String inDirectory, + IPartitioner partitioner, + String createStatement, + String insertStatement, + @NotNull Set userDefinedTypeStatements, + @NotNull Set indexCreateStatements, + int bufferSizeMB) + { + 
CQLSSTableWriter.Builder builder = configureBuilder(inDirectory, + createStatement, + insertStatement, + bufferSizeMB, + userDefinedTypeStatements, + this::onSSTablesProduced, + partitioner); + if (!indexCreateStatements.isEmpty()) + { + configureIndexes(builder, indexCreateStatements); + } + this.writer = builder.build(); + } + + private static IPartitioner determineSupportedPartitioner(String partitioner) + { + return partitioner.toLowerCase().contains("random") + ? new RandomPartitioner() + : new Murmur3Partitioner(); + } + + private void onSSTablesProduced(Collection sstables) + { + Objects.requireNonNull(producedSSTablesListener, "producedSSTablesListener is not set"); + Set sstableDescriptors = sstables + .stream() + .map(sstable -> { + String baseFilename = CassandraBridgeImplementation.baseFilename(sstable.descriptor); + sstable.selfRef().close(); + return new SSTableDescriptor(baseFilename); + }) + .collect(Collectors.toSet()); + + producedSSTablesListener.accept(sstableDescriptors); + } + + @Override + public void addRow(Map values) throws IOException + { + try + { + writer.addRow(values); + } + catch (InvalidRequestException exception) + { + throw new RuntimeException(exception); + } + } + + @Override + public void setSSTablesProducedListener(Consumer> listener) + { + producedSSTablesListener = Objects.requireNonNull(listener); + } + + @Override + public void close() throws IOException + { + writer.close(); + } + + @VisibleForTesting + static CQLSSTableWriter.Builder configureBuilder(String inDirectory, + String createStatement, + String insertStatement, + int bufferSizeMB, + Set udts, + Consumer> producedSSTablesListener, + IPartitioner cassPartitioner) + { + CQLSSTableWriter.Builder builder = CQLSSTableWriter.builder(); + + for (String udt : udts) + { + builder.withType(udt); + } + + return builder.inDirectory(inDirectory) + .forTable(createStatement) + .withPartitioner(cassPartitioner) + .using(insertStatement) + .sorted() + 
.withSSTableProducedListener(producedSSTablesListener) + .openSSTableOnProduced() + .withMaxSSTableSizeInMiB(bufferSizeMB); + } + + /** + * Configures the CQLSSTableWriter builder to generate SAI index components. + * Uses reflection to call withIndexes/withBuildIndexes methods since they may not be + * present in all Cassandra 5.0.x patch versions. + * + * @param builder the CQLSSTableWriter builder + * @param indexCreateStatements the CREATE INDEX CQL statements + */ + @VisibleForTesting + static void configureIndexes(CQLSSTableWriter.Builder builder, Set indexCreateStatements) + { + try + { + // Use reflection to check for SAI index generation support in CQLSSTableWriter.Builder + Method withIndexMethod = CQLSSTableWriter.Builder.class.getMethod("withIndex", String.class); + Method withBuildIndexesMethod = CQLSSTableWriter.Builder.class.getMethod("withBuildIndexes", boolean.class); + + for (String indexStatement : indexCreateStatements) + { + withIndexMethod.invoke(builder, indexStatement); + } + withBuildIndexesMethod.invoke(builder, true); + + LOGGER.info("SAI index generation enabled for {} index(es)", indexCreateStatements.size()); + } + catch (NoSuchMethodException exception) + { + LOGGER.warn("CQLSSTableWriter does not support inline SAI index generation in this Cassandra version. " + + "Indexes will need to be rebuilt after import. 
indexCount={}", indexCreateStatements.size()); + } + catch (ReflectiveOperationException exception) + { + LOGGER.error("Failed to configure SAI index generation", exception); + throw new RuntimeException("Failed to configure SAI index generation", exception); + } + } +} From 41020b08b3177c275ca4cc835cdcd770e9127c51 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Wed, 1 Apr 2026 15:03:21 +0100 Subject: [PATCH 2/6] first version --- cassandra-analytics-cdc/build.gradle | 6 - cassandra-analytics-cdc/build.gradle.rej | 20 --- cassandra-analytics-core/build.gradle | 11 -- cassandra-analytics-core/build.gradle.rej | 24 --- .../CassandraDirectDataTransportContext.java | 2 +- .../bulkwriter/SSTableWriterFactory.java | 17 -- .../spark/bulkwriter/SchemaInfo.java | 13 ++ .../bulkwriter/SidecarDataTransferApi.java | 11 +- .../spark/bulkwriter/SortedSSTableWriter.java | 16 +- .../analytics/BulkWriteSAIIndexTest.java | 155 ++++++++++++++++++ .../cassandra/bridge/CassandraBridge.java | 28 +--- .../bridge/CassandraBridgeImplementation.java | 12 -- .../bridge/SSTableWriterImplementation.java | 100 +++-------- .../SSTableWriterImplementationTest.java | 2 + .../bridge/CassandraBridgeImplementation.java | 3 +- 15 files changed, 207 insertions(+), 213 deletions(-) delete mode 100644 cassandra-analytics-cdc/build.gradle.rej delete mode 100644 cassandra-analytics-core/build.gradle.rej create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java diff --git a/cassandra-analytics-cdc/build.gradle b/cassandra-analytics-cdc/build.gradle index 0b5d7f780..38c284065 100644 --- a/cassandra-analytics-cdc/build.gradle +++ b/cassandra-analytics-cdc/build.gradle @@ -144,15 +144,9 @@ jar { test { systemProperty "cassandra.sidecar.versions_to_test", (System.getenv("CASSANDRA_VERSION") ?: "4.0.17,5.0.5") -<<<<<<< Updated upstream minHeapSize = '1024m' maxHeapSize = '3072m' maxParallelForks = 
Math.max(Runtime.runtime.availableProcessors() * 2, 8) -======= - minHeapSize = System.getenv('CDC_TEST_MIN_HEAP_SIZE') ?: '512m' - maxHeapSize = System.getenv('CDC_TEST_MAX_HEAP_SIZE') ?: '2048m' - maxParallelForks = System.getenv('CDC_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 ->>>>>>> Stashed changes forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations // Make it so unit tests run on a Jar with Cassandra bridge implementations built in diff --git a/cassandra-analytics-cdc/build.gradle.rej b/cassandra-analytics-cdc/build.gradle.rej deleted file mode 100644 index ffab6d207..000000000 --- a/cassandra-analytics-cdc/build.gradle.rej +++ /dev/null @@ -1,20 +0,0 @@ -diff a/cassandra-analytics-cdc/build.gradle b/cassandra-analytics-cdc/build.gradle (rejected hunks) -@@ -141,15 +141,12 @@ jar { - } - } - --def cdcMaxHeapSize = System.getenv("CDC_MAX_HEAP_SIZE") ?: "2048m" --def cdcMaxParallelForks = (System.getenv("CDC_MAX_PARALLEL_FORKS") ?: "1") as int -- - test { - systemProperty "cassandra.sidecar.versions_to_test", (System.getenv("CASSANDRA_VERSION") ?: "4.0.17,5.0.5") - -- minHeapSize = '512m' -- maxHeapSize = cdcMaxHeapSize -- maxParallelForks = cdcMaxParallelForks -+ minHeapSize = System.getenv('CDC_TEST_MIN_HEAP_SIZE') ?: '512m' -+ maxHeapSize = System.getenv('CDC_TEST_MAX_HEAP_SIZE') ?: '2048m' -+ maxParallelForks = System.getenv('CDC_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 - forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations - - // Make it so unit tests run on a Jar with Cassandra bridge implementations built in diff --git a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle index 4f77838ef..ffce66600 100644 --- a/cassandra-analytics-core/build.gradle +++ b/cassandra-analytics-core/build.gradle @@ -181,13 +181,8 @@ tasks.register('testSequential', Test) { } systemProperty "cassandra.analytics.bridges.sstable_format", 
System.getProperty("cassandra.analytics.bridges.sstable_format", "big") -<<<<<<< Updated upstream minHeapSize = '1024m' maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '3072m' -======= - minHeapSize = System.getenv('CORE_SEQ_TEST_MIN_HEAP_SIZE') ?: '512m' - maxHeapSize = System.getenv('CORE_SEQ_TEST_MAX_HEAP_SIZE') ?: '4096m' ->>>>>>> Stashed changes maxParallelForks = 1 forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations @@ -221,16 +216,10 @@ tasks.register('testSequential', Test) { test { systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") -<<<<<<< Updated upstream minHeapSize = '1024m' maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '3072m' maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: Math.max(Runtime.runtime.availableProcessors() * 2, 8) -======= - minHeapSize = System.getenv('CORE_TEST_MIN_HEAP_SIZE') ?: '512m' - maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '2048m' - maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 ->>>>>>> Stashed changes forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations // Make it so unit tests run on a Jar with Cassandra bridge implementations built in diff --git a/cassandra-analytics-core/build.gradle.rej b/cassandra-analytics-core/build.gradle.rej deleted file mode 100644 index 8ef6088b6..000000000 --- a/cassandra-analytics-core/build.gradle.rej +++ /dev/null @@ -1,24 +0,0 @@ -diff a/cassandra-analytics-core/build.gradle b/cassandra-analytics-core/build.gradle (rejected hunks) -@@ -181,8 +181,8 @@ tasks.register('testSequential', Test) { - } - - systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") -- minHeapSize = '512m' -- maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') 
?: '2048m' -+ minHeapSize = System.getenv('CORE_SEQ_TEST_MIN_HEAP_SIZE') ?: '512m' -+ maxHeapSize = System.getenv('CORE_SEQ_TEST_MAX_HEAP_SIZE') ?: '4096m' - maxParallelForks = 1 - forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations - -@@ -216,8 +216,9 @@ tasks.register('testSequential', Test) { - - test { - systemProperty "cassandra.analytics.bridges.sstable_format", System.getProperty("cassandra.analytics.bridges.sstable_format", "big") -- minHeapSize = '512m' -- maxHeapSize = '2048m' -+ minHeapSize = System.getenv('CORE_TEST_MIN_HEAP_SIZE') ?: '512m' -+ maxHeapSize = System.getenv('CORE_TEST_MAX_HEAP_SIZE') ?: '2048m' -+ maxParallelForks = System.getenv('CORE_MAX_PARALLEL_FORKS')?.toInteger() ?: 3 - forkEvery = 1 // Enables different end-to-end test classes use Spark contexts with different configurations - - // Make it so unit tests run on a Jar with Cassandra bridge implementations built in diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java index e1ea2b599..77d5ef8d1 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java @@ -75,6 +75,6 @@ public DirectDataTransferApi dataTransferApi() protected DirectDataTransferApi createDirectDataTransferApi() { CassandraBridge bridge = CassandraBridgeFactory.get(clusterInfo.getLowestCassandraVersion()); - return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo, schemaInfo.getIndexStatements()); + return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo, schemaInfo.hasSaiIndexes()); } } diff --git 
a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java index d84e08b27..344f2cda9 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SSTableWriterFactory.java @@ -33,23 +33,6 @@ private SSTableWriterFactory() throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); } - public static SSTableWriter getSSTableWriter(CassandraVersionFeatures serverVersion, - String inDirectory, - String partitioner, - String createStatement, - String insertStatement, - Set userDefinedTypeStatements, - int bufferSizeMB) - { - CassandraBridge cassandraBridge = CassandraBridgeFactory.get(serverVersion); - return cassandraBridge.getSSTableWriter(inDirectory, - partitioner, - createStatement, - insertStatement, - userDefinedTypeStatements, - bufferSizeMB); - } - public static SSTableWriter getSSTableWriter(CassandraVersionFeatures serverVersion, String inDirectory, String partitioner, diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java index c5e149dd3..a9233057c 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java @@ -24,6 +24,8 @@ import org.jetbrains.annotations.NotNull; +import org.apache.cassandra.spark.utils.CqlUtils; + /** * Provides schema information for bulk write operations. *

@@ -48,4 +50,15 @@ default Set getIndexStatements() { return Collections.emptySet(); } + + /** + * Returns whether the table has SAI indexes. + * + * @return {@code true} if the table has index statements and all of them are SAI indexes + */ + default boolean hasSaiIndexes() + { + Set indexes = getIndexStatements(); + return !indexes.isEmpty() && indexes.stream().allMatch(CqlUtils::isSaiIndex); + } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java index 0f4358c23..3a778c2d9 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java @@ -22,7 +22,6 @@ import java.nio.file.Path; import java.util.Collections; import java.util.List; -import java.util.Set; import java.util.concurrent.ExecutionException; import org.slf4j.Logger; @@ -37,7 +36,6 @@ import org.apache.cassandra.spark.common.model.CassandraInstance; import org.apache.cassandra.spark.data.QualifiedTableName; import org.apache.cassandra.spark.exception.SidecarApiCallException; -import org.apache.cassandra.spark.utils.CqlUtils; import static org.apache.cassandra.bridge.CassandraBridgeFactory.maybeQuotedIdentifier; @@ -56,19 +54,14 @@ public class SidecarDataTransferApi implements DirectDataTransferApi private final JobInfo job; private final boolean hasSaiIndexes; - public SidecarDataTransferApi(CassandraContext cassandraContext, CassandraBridge bridge, JobInfo job) - { - this(cassandraContext, bridge, job, Collections.emptySet()); - } - public SidecarDataTransferApi(CassandraContext cassandraContext, CassandraBridge bridge, JobInfo job, - Set indexStatements) + boolean hasSaiIndexes) { this.sidecarClient = cassandraContext.getSidecarClient(); this.sidecarPort = 
cassandraContext.sidecarPort(); this.bridge = bridge; this.job = job; - this.hasSaiIndexes = !indexStatements.isEmpty() && indexStatements.stream().allMatch(CqlUtils::isSaiIndex); + this.hasSaiIndexes = hasSaiIndexes; } @Override diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java index c01718b26..809713fd9 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java @@ -125,14 +125,14 @@ public SortedSSTableWriter(BulkWriterContext writerContext, Path outDir, DigestA SchemaInfo schema = writerContext.schema(); TableSchema tableSchema = schema.getTableSchema(); this.cqlSSTableWriter = SSTableWriterFactory.getSSTableWriter( - CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(packageVersion), - this.outDir.toString(), - writerContext.cluster().getPartitioner().toString(), - tableSchema.createStatement, - tableSchema.modificationStatement, - schema.getUserDefinedTypeStatements(), - schema.getIndexStatements(), - writerContext.job().sstableDataSizeInMiB()); + CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(packageVersion), + this.outDir.toString(), + writerContext.cluster().getPartitioner().toString(), + tableSchema.createStatement, + tableSchema.modificationStatement, + schema.getUserDefinedTypeStatements(), + schema.getIndexStatements(), + writerContext.job().sstableDataSizeInMiB()); } @NotNull diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java new file mode 100644 index 000000000..c3a05653b --- /dev/null +++ 
b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.analytics; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.stream.Stream; + +import com.vdurmont.semver4j.Semver; +import org.junit.jupiter.api.Test; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.sidecar.testing.QualifiedName; +import org.apache.cassandra.testing.ClusterBuilderConfiguration; +import org.apache.cassandra.testing.TestUtils; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; +import static org.apache.cassandra.testing.TestUtils.CREATE_TEST_TABLE_STATEMENT; +import static org.apache.cassandra.testing.TestUtils.DC1_RF3; +import static org.apache.cassandra.testing.TestUtils.ROW_COUNT; +import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; + +/** + * Integration 
test for bulk write and read operations on a table with multiple Storage Attached Indexes (SAI). + * Verifies that SAI SSTable components are written to disk and that SAI index filtering works after bulk write. + */ +class BulkWriteSAIIndexTest extends SharedClusterSparkIntegrationTestBase +{ + private static final QualifiedName TABLE_SAI = + new QualifiedName(TEST_KEYSPACE, "test_sai"); + + @Test + void testBulkWriteAndReadWithMultipleSaiIndexes() + { + SparkSession spark = getOrCreateSparkSession(); + Dataset dfWrite = DataGenerationUtils.generateCourseData(spark, ROW_COUNT); + + // 1. Bulk write to table with SAI indexes on both course and marks + bulkWriterDataFrameWriter(dfWrite, TABLE_SAI).save(); + + // 2. Flush to ensure SSTable components (including SAI) are written to disk + cluster.getFirstRunningInstance().flush(TEST_KEYSPACE); + + // 3. Verify SAI index SSTable components exist on the filesystem + assertThat(hasSaiIndexFiles()) + .as("SAI index files should exist on disk after bulk write and flush") + .isTrue(); + + // 4. Bulk read the data back and verify equality + Dataset dfRead = bulkReaderDataFrame(TABLE_SAI).load(); + checkSmallDataFrameEquality(dfWrite, dfRead); + + // 5. Verify SAI index filtering works on the course column + Object[][] courseResults = cluster.getFirstRunningInstance() + .coordinator() + .execute(String.format("SELECT * FROM %s WHERE course = 'course0';", + TABLE_SAI), + ConsistencyLevel.ALL); + assertThat(courseResults).isNotNull(); + assertThat(courseResults.length).isGreaterThan(0); + for (Object[] row : courseResults) + { + // course is the second column (id, course, marks) + assertThat(row[1]).isEqualTo("course0"); + } + + // 6. 
Verify SAI index filtering works on the marks column + Object[][] marksResults = cluster.getFirstRunningInstance() + .coordinator() + .execute(String.format("SELECT * FROM %s WHERE marks = 50;", + TABLE_SAI), + ConsistencyLevel.ALL); + assertThat(marksResults).isNotNull(); + } + + /** + * Checks whether SAI index files exist on the filesystem for the test keyspace. + * SAI stores index data in directories or files with naming patterns that include + * "SAI" or reside under directories indicating SAI components. + */ + private boolean hasSaiIndexFiles() + { + String[] dataDirs = (String[]) cluster.get(1) + .config() + .getParams() + .get("data_file_directories"); + String dataDir = dataDirs[0]; + Path keyspacePath = Paths.get(dataDir, TEST_KEYSPACE); + + try (Stream walkStream = Files.walk(keyspacePath)) + { + return walkStream + .filter(Files::isRegularFile) + .anyMatch(path -> { + String pathStr = path.toString(); + // SAI index components are stored in directories or files + // with patterns like ".sai/" or "SAI" in the path + return pathStr.contains("SAI") || pathStr.contains(".sai"); + }); + } + catch (IOException e) + { + return false; + } + } + + @Override + protected void beforeClusterProvisioning() + { + assumeThat(TestUtils.getDTestClusterVersion().isGreaterThanOrEqualTo(new Semver("5.0", Semver.SemverType.LOOSE))) + .describedAs("Storage Attached Index (SAI) is only available in Cassandra 5.0 and above") + .isTrue(); + } + + @Override + protected void initializeSchemaForTest() + { + createTestKeyspace(TEST_KEYSPACE, DC1_RF3); + createTestTable(TABLE_SAI, CREATE_TEST_TABLE_STATEMENT); + cluster.schemaChangeIgnoringStoppedInstances( + String.format("CREATE INDEX ON %s(course) USING 'StorageAttachedIndex';", TABLE_SAI)); + cluster.schemaChangeIgnoringStoppedInstances( + String.format("CREATE INDEX ON %s(marks) USING 'StorageAttachedIndex';", TABLE_SAI)); + } + + @Override + protected ClusterBuilderConfiguration testClusterConfiguration() + { + return 
super.testClusterConfiguration() + .nodesPerDc(3); + } +} diff --git a/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java b/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java index 43f0c8cb2..13d856fe7 100644 --- a/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java +++ b/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java @@ -398,35 +398,9 @@ public abstract SSTableWriter getSSTableWriter(String inDirectory, String createStatement, String insertStatement, Set userDefinedTypeStatements, + Set indexCreateStatements, int bufferSizeMB); - /** - * Creates an SSTableWriter with optional index generation support. - * The default implementation ignores index statements for backward compatibility with Cassandra 4.0. - * Cassandra 5.0+ bridges override this to generate SAI index components alongside SSTables. - * - * @param inDirectory output directory for SSTables - * @param partitioner the partitioner name - * @param createStatement CREATE TABLE CQL statement - * @param insertStatement INSERT CQL statement - * @param userDefinedTypeStatements set of CREATE TYPE statements - * @param indexCreateStatements set of CREATE INDEX statements - * @param bufferSizeMB max SSTable size in MiB - * @return a new SSTableWriter instance - */ - public SSTableWriter getSSTableWriter(String inDirectory, - String partitioner, - String createStatement, - String insertStatement, - Set userDefinedTypeStatements, - Set indexCreateStatements, - int bufferSizeMB) - { - // Default: ignore index statements (C* 4.0 does not support inline SAI generation) - return getSSTableWriter(inDirectory, partitioner, createStatement, insertStatement, - userDefinedTypeStatements, bufferSizeMB); - } - public abstract SSTableSummary getSSTableSummary(@NotNull String keyspace, @NotNull String table, @NotNull SSTable ssTable); diff --git 
a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index 299c6efba..1135ea999 100644 --- a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -552,18 +552,6 @@ public static IPartitioner getPartitioner(Partitioner partitioner) return CassandraTypesImplementation.getPartitioner(partitioner); } - @Override - public SSTableWriter getSSTableWriter(String inDirectory, - String partitioner, - String createStatement, - String insertStatement, - @NotNull Set userDefinedTypeStatements, - int bufferSizeMB) - { - return new SSTableWriterImplementation(inDirectory, partitioner, createStatement, insertStatement, - userDefinedTypeStatements, bufferSizeMB); - } - @Override public SSTableWriter getSSTableWriter(String inDirectory, String partitioner, diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java index 90488fe78..600823c98 100644 --- a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java @@ -20,7 +20,6 @@ package org.apache.cassandra.bridge; import java.io.IOException; -import java.lang.reflect.Method; import java.util.Collection; import java.util.Map; import java.util.Objects; @@ -63,17 +62,6 @@ public class SSTableWriterImplementation implements SSTableWriter private final CQLSSTableWriter writer; private Consumer> producedSSTablesListener; - public SSTableWriterImplementation(String inDirectory, - String partitioner, - String 
createStatement, - String insertStatement, - @NotNull Set userDefinedTypeStatements, - int bufferSizeMB) - { - this(inDirectory, determineSupportedPartitioner(partitioner), createStatement, insertStatement, - userDefinedTypeStatements, java.util.Collections.emptySet(), bufferSizeMB); - } - public SSTableWriterImplementation(String inDirectory, String partitioner, String createStatement, @@ -86,18 +74,6 @@ public SSTableWriterImplementation(String inDirectory, userDefinedTypeStatements, indexCreateStatements, bufferSizeMB); } - @VisibleForTesting - public SSTableWriterImplementation(String inDirectory, - IPartitioner partitioner, - String createStatement, - String insertStatement, - @NotNull Set userDefinedTypeStatements, - int bufferSizeMB) - { - this(inDirectory, partitioner, createStatement, insertStatement, - userDefinedTypeStatements, java.util.Collections.emptySet(), bufferSizeMB); - } - @VisibleForTesting public SSTableWriterImplementation(String inDirectory, IPartitioner partitioner, @@ -107,18 +83,10 @@ public SSTableWriterImplementation(String inDirectory, @NotNull Set indexCreateStatements, int bufferSizeMB) { - CQLSSTableWriter.Builder builder = configureBuilder(inDirectory, - createStatement, - insertStatement, - bufferSizeMB, - userDefinedTypeStatements, - this::onSSTablesProduced, - partitioner); - if (!indexCreateStatements.isEmpty()) - { - configureIndexes(builder, indexCreateStatements); - } - this.writer = builder.build(); + this.writer = configureBuilder(inDirectory, createStatement, insertStatement, + bufferSizeMB, userDefinedTypeStatements, indexCreateStatements, + this::onSSTablesProduced, partitioner) + .build(); } private static IPartitioner determineSupportedPartitioner(String partitioner) @@ -135,6 +103,9 @@ private void onSSTablesProduced(Collection sstables) .stream() .map(sstable -> { String baseFilename = CassandraBridgeImplementation.baseFilename(sstable.descriptor); + // TODO: for now, the sstableReader is closed immediately, + // 
TODO (CONTI): we can potentially read from the reader to validate the underlying sstable, + // TODO (CONTI): replacing org.apache.cassandra.spark.bulkwriter.SortedSSTableWriter.validateSSTables sstable.selfRef().close(); return new SSTableDescriptor(baseFilename); }) @@ -174,6 +145,7 @@ static CQLSSTableWriter.Builder configureBuilder(String inDirectory, String insertStatement, int bufferSizeMB, Set udts, + Set indexCreateStatements, Consumer> producedSSTablesListener, IPartitioner cassPartitioner) { @@ -184,50 +156,24 @@ static CQLSSTableWriter.Builder configureBuilder(String inDirectory, builder.withType(udt); } - return builder.inDirectory(inDirectory) - .forTable(createStatement) - .withPartitioner(cassPartitioner) - .using(insertStatement) - .sorted() - .withSSTableProducedListener(producedSSTablesListener) - .openSSTableOnProduced() - .withMaxSSTableSizeInMiB(bufferSizeMB); - } + builder.inDirectory(inDirectory) + .forTable(createStatement) + .withPartitioner(cassPartitioner) + .using(insertStatement) + // The data frame to write is always sorted, + // see org.apache.cassandra.spark.bulkwriter.CassandraBulkSourceRelation.insert + .sorted() + .withSSTableProducedListener(producedSSTablesListener) + .openSSTableOnProduced() + .withMaxSSTableSizeInMiB(bufferSizeMB); - /** - * Configures the CQLSSTableWriter builder to generate SAI index components. - * Uses reflection to call withIndexes/withBuildIndexes methods since they may not be - * present in all Cassandra 5.0.x patch versions. 
- * - * @param builder the CQLSSTableWriter builder - * @param indexCreateStatements the CREATE INDEX CQL statements - */ - @VisibleForTesting - static void configureIndexes(CQLSSTableWriter.Builder builder, Set indexCreateStatements) - { - try + if (!indexCreateStatements.isEmpty()) { - // Use reflection to check for SAI index generation support in CQLSSTableWriter.Builder - Method withIndexMethod = CQLSSTableWriter.Builder.class.getMethod("withIndex", String.class); - Method withBuildIndexesMethod = CQLSSTableWriter.Builder.class.getMethod("withBuildIndexes", boolean.class); - - for (String indexStatement : indexCreateStatements) - { - withIndexMethod.invoke(builder, indexStatement); - } - withBuildIndexesMethod.invoke(builder, true); - + builder.withIndexes(indexCreateStatements.toArray(new String[0])); + builder.withBuildIndexes(true); LOGGER.info("SAI index generation enabled for {} index(es)", indexCreateStatements.size()); } - catch (NoSuchMethodException exception) - { - LOGGER.warn("CQLSSTableWriter does not support inline SAI index generation in this Cassandra version. " - + "Indexes will need to be rebuilt after import. 
indexCount={}", indexCreateStatements.size()); - } - catch (ReflectiveOperationException exception) - { - LOGGER.error("Failed to configure SAI index generation", exception); - throw new RuntimeException("Failed to configure SAI index generation", exception); - } + + return builder; } } diff --git a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/bridge/SSTableWriterImplementationTest.java b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/bridge/SSTableWriterImplementationTest.java index 042b7deb8..d269addae 100644 --- a/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/bridge/SSTableWriterImplementationTest.java +++ b/cassandra-five-zero-bridge/src/test/java/org/apache/cassandra/bridge/SSTableWriterImplementationTest.java @@ -69,6 +69,7 @@ void testSSTableWriterConfiguration() throws NoSuchFieldException, IllegalAccess INSERT_STATEMENT, 250, Collections.emptySet(), + Collections.emptySet(), sstables -> {}, new Murmur3Partitioner()); @@ -86,6 +87,7 @@ void testGetProducedSSTables() throws IOException CREATE_STATEMENT, INSERT_STATEMENT, Collections.emptySet(), + Collections.emptySet(), 1); writer.setSSTablesProducedListener(produced::addAll); assertThat(produced).isEmpty(); diff --git a/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index 943a3b76a..e4e059d21 100644 --- a/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -539,7 +539,8 @@ public SSTableWriter getSSTableWriter(String inDirectory, String partitioner, String createStatement, String insertStatement, - @NotNull Set userDefinedTypeStatements, + Set userDefinedTypeStatements, + Set indexCreateStatements, int bufferSizeMB) { return new 
SSTableWriterImplementation(inDirectory, partitioner, createStatement, insertStatement, From 80cc2530d98a7eb79466ac27cf5b828f0a79d657 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Tue, 14 Apr 2026 11:51:40 +0100 Subject: [PATCH 3/6] pass index stmts instead of count --- .../org/apache/cassandra/cdc/CdcTests.java | 6 ++-- .../cdc/MicroBatchIteratorTests.java | 2 +- .../apache/cassandra/cdc/test/CdcTester.java | 3 +- .../apache/cassandra/spark/data/CqlTable.java | 31 ++++++++++++++----- .../bulkwriter/AbstractBulkWriterContext.java | 20 +++--------- .../bulkwriter/BroadcastableSchemaInfo.java | 18 +++-------- .../bulkwriter/BroadcastableTableSchema.java | 14 +++++++-- .../CassandraDirectDataTransportContext.java | 6 +++- .../spark/bulkwriter/CassandraSchemaInfo.java | 18 +---------- .../bulkwriter/CqlTableInfoProvider.java | 9 +++++- .../spark/bulkwriter/SchemaInfo.java | 25 --------------- .../spark/bulkwriter/SortedSSTableWriter.java | 2 +- .../spark/bulkwriter/TableInfoProvider.java | 3 ++ .../spark/bulkwriter/TableSchema.java | 22 +++++-------- .../spark/data/CassandraDataLayer.java | 4 +-- .../spark/bulkwriter/TableSchemaTest.java | 7 +++-- .../bulkwriter/TableSchemaTestCommon.java | 13 ++++++++ .../cassandra/spark/utils/CqlUtilsTest.java | 4 +-- .../cdc/avro/CqlToAvroSchemaConverter.java | 2 +- .../cassandra/bridge/CassandraBridge.java | 4 +-- .../spark/utils/test/TestSchema.java | 2 +- .../bridge/CassandraBridgeImplementation.java | 5 +-- .../bridge/CassandraBridgeImplementation.java | 5 +-- .../cassandra/spark/reader/SchemaBuilder.java | 16 +++++----- 24 files changed, 116 insertions(+), 125 deletions(-) diff --git a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/CdcTests.java b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/CdcTests.java index b67fc2fda..30b43f0b1 100644 --- a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/CdcTests.java +++ 
b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/CdcTests.java @@ -175,7 +175,7 @@ public void testMockedCdc(CassandraVersion version) Partitioner.Murmur3Partitioner, table.udtCreateStmts(bridge.cassandraTypes()), null, - 0, + Collections.emptySet(), true); SchemaSupplier schemaSupplier = () -> CompletableFuture.completedFuture(ImmutableSet.of(table)); AtomicReference state = new AtomicReference<>(); @@ -514,13 +514,13 @@ public void testMultiTable(CassandraVersion version) ReplicationFactor.simpleStrategy(1), Partitioner.Murmur3Partitioner, Collections.emptySet(), - null, 0, schema2.withCdc); + null, Collections.emptySet(), schema2.withCdc); bridge.buildSchema(cqlTable3.createStatement(), cqlTable3.keyspace(), ReplicationFactor.simpleStrategy(1), Partitioner.Murmur3Partitioner, Collections.emptySet(), - null, 0, schema3.withCdc); + null, Collections.emptySet(), schema3.withCdc); int numRows = DEFAULT_NUM_ROWS; AtomicReference schema1Holder = new AtomicReference<>(); diff --git a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/MicroBatchIteratorTests.java b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/MicroBatchIteratorTests.java index 7b477d2ae..6521c275f 100644 --- a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/MicroBatchIteratorTests.java +++ b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/MicroBatchIteratorTests.java @@ -390,7 +390,7 @@ private void runTest(CassandraBridge bridge, ReplicationFactor.simpleStrategy(1), Partitioner.Murmur3Partitioner, Collections.emptySet(), - null, 0, schema.withCdc); + null, Collections.emptySet(), schema.withCdc); schema.setCassandraVersion(bridge.getVersion()); try diff --git a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/test/CdcTester.java b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/test/CdcTester.java index 2c6d7ce73..296395eea 100644 --- 
a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/test/CdcTester.java +++ b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/cdc/test/CdcTester.java @@ -21,6 +21,7 @@ import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -266,7 +267,7 @@ public void run() { LOGGER.info("Running CDC test testId={} schema='{}' thread={}", testId, cqlTable.fields(), Thread.currentThread().getName()); Set udtStmts = schema.udts.stream().map(e -> e.createStatement(bridge.cassandraTypes(), schema.keyspace)).collect(Collectors.toSet()); - bridge.buildSchema(schema.createStatement, schema.keyspace, schema.rf, partitioner, udtStmts, null, 0, true); + bridge.buildSchema(schema.createStatement, schema.keyspace, schema.rf, partitioner, udtStmts, null, Collections.emptySet(), true); schema.setCassandraVersion(bridge.getVersion()); // write some mutations to CDC CommitLog diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/CqlTable.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/CqlTable.java index ebb5c2abc..ba4dc0fc6 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/CqlTable.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/CqlTable.java @@ -57,7 +57,7 @@ public class CqlTable implements Serializable private final List staticColumns; private final List valueColumns; private final transient Map columns; - private final int indexCount; + private final Set indexStatements; public CqlTable(@NotNull String keyspace, @NotNull String table, @@ -65,7 +65,7 @@ public CqlTable(@NotNull String keyspace, @NotNull ReplicationFactor replicationFactor, @NotNull List fields) { - this(keyspace, table, createStatement, replicationFactor, fields, Collections.emptySet(), 0); + this(keyspace, table, createStatement, replicationFactor, fields, 
Collections.emptySet(), Collections.emptySet()); } public CqlTable(@NotNull String keyspace, @@ -74,7 +74,7 @@ public CqlTable(@NotNull String keyspace, @NotNull ReplicationFactor replicationFactor, @NotNull List fields, @NotNull Set udts, - int indexCount) + @NotNull Set indexStatements) { this.keyspace = keyspace; this.table = table; @@ -87,7 +87,7 @@ public CqlTable(@NotNull String keyspace, this.staticColumns = this.fields.stream().filter(CqlField::isStaticColumn).sorted().collect(Collectors.toList()); this.valueColumns = this.fields.stream().filter(CqlField::isValueColumn).sorted().collect(Collectors.toList()); this.udts = Collections.unmodifiableSet(udts); - this.indexCount = indexCount; + this.indexStatements = Collections.unmodifiableSet(indexStatements); // We use a linked hashmap to guarantee ordering of a 'SELECT * FROM ...' this.columns = new LinkedHashMap<>(); @@ -241,9 +241,14 @@ public String createStatement() return createStatement; } + public Set indexStatements() + { + return indexStatements; + } + public int indexCount() { - return indexCount; + return indexStatements.size(); } /** @@ -333,8 +338,13 @@ public CqlTable read(Kryo kryo, Input input, Class type) { udts.add((CqlField.CqlUdt) CqlField.CqlType.read(input, cassandraTypes)); } - int indexCount = input.readInt(); - return new CqlTable(keyspace, table, createStatement, replicationFactor, fields, udts, indexCount); + int numIndexStatements = input.readInt(); + Set indexStatements = new LinkedHashSet<>(numIndexStatements); + for (int idx = 0; idx < numIndexStatements; idx++) + { + indexStatements.add(input.readString()); + } + return new CqlTable(keyspace, table, createStatement, replicationFactor, fields, udts, indexStatements); } @Override @@ -356,7 +366,12 @@ public void write(Kryo kryo, Output output, CqlTable table) { udt.write(output); } - output.writeInt(table.indexCount()); + Set indexStatements = table.indexStatements(); + output.writeInt(indexStatements.size()); + for (String stmt : 
indexStatements) + { + output.writeString(stmt); + } } } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java index e69c255d3..b32980add 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/AbstractBulkWriterContext.java @@ -245,12 +245,11 @@ protected SchemaInfo buildSchemaInfo(StructType structType) Set udts = CqlUtils.extractUdts(keyspaceSchema, keyspace); ReplicationFactor replicationFactor = CqlUtils.extractReplicationFactor(keyspaceSchema, keyspace); Set indexStatements = CqlUtils.extractIndexStatements(keyspaceSchema, keyspace, table); - int indexCount = indexStatements.size(); - CqlTable cqlTable = bridge().buildSchema(createTableSchema, keyspace, replicationFactor, partitioner, udts, null, indexCount, false); + CqlTable cqlTable = bridge().buildSchema(createTableSchema, keyspace, replicationFactor, partitioner, udts, null, indexStatements, false); TableInfoProvider tableInfoProvider = new CqlTableInfoProvider(createTableSchema, cqlTable); - TableSchema tableSchema = initializeTableSchema(bulkSparkConf(), structType, tableInfoProvider, lowestCassandraVersion(), indexStatements); - return new CassandraSchemaInfo(tableSchema, udts, indexStatements); + TableSchema tableSchema = initializeTableSchema(bulkSparkConf(), structType, tableInfoProvider, lowestCassandraVersion()); + return new CassandraSchemaInfo(tableSchema, udts); } /*-------------------------------------------*/ @@ -315,16 +314,6 @@ protected TableSchema initializeTableSchema(@NotNull BulkSparkConf conf, @NotNull StructType dfSchema, TableInfoProvider tableInfoProvider, String lowestCassandraVersion) - { - return initializeTableSchema(conf, dfSchema, tableInfoProvider, 
lowestCassandraVersion, java.util.Collections.emptySet()); - } - - @NotNull - protected TableSchema initializeTableSchema(@NotNull BulkSparkConf conf, - @NotNull StructType dfSchema, - TableInfoProvider tableInfoProvider, - String lowestCassandraVersion, - Set indexStatements) { return new TableSchema(dfSchema, tableInfoProvider, @@ -333,8 +322,7 @@ protected TableSchema initializeTableSchema(@NotNull BulkSparkConf conf, conf.getTimestampOptions(), lowestCassandraVersion, job().qualifiedTableName().quoteIdentifiers(), - conf.skipSecondaryIndexCheck, - indexStatements); + conf.skipSecondaryIndexCheck); } @NotNull diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java index cd2bec78d..2ad28ffc5 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java @@ -27,7 +27,9 @@ /** * Broadcastable wrapper for schema information with ZERO transient fields to optimize Spark broadcasting. *

- * Contains BroadcastableTableSchema (pre-computed schema data), UDT statements, and index statements. + * Contains BroadcastableTableSchema (pre-computed schema data) and UDT statements. + * Index statements are now stored in CqlTable (serialized via Kryo in BroadcastableTableSchema) + * and accessed through TableSchema. * Executors reconstruct CassandraSchemaInfo and TableSchema from these fields. *

* Why ZERO transient fields matters:
@@ -48,7 +50,6 @@ public final class BroadcastableSchemaInfo implements Serializable // Essential fields broadcast to executors private final BroadcastableTableSchema broadcastableTableSchema; private final Set userDefinedTypeStatements; - private final Set indexStatements; /** * Creates a BroadcastableSchemaInfo from a source SchemaInfo. @@ -60,18 +61,15 @@ public static BroadcastableSchemaInfo from(@NotNull SchemaInfo source) { return new BroadcastableSchemaInfo( BroadcastableTableSchema.from(source.getTableSchema()), - source.getUserDefinedTypeStatements(), - source.getIndexStatements() + source.getUserDefinedTypeStatements() ); } private BroadcastableSchemaInfo(BroadcastableTableSchema broadcastableTableSchema, - Set userDefinedTypeStatements, - Set indexStatements) + Set userDefinedTypeStatements) { this.broadcastableTableSchema = broadcastableTableSchema; this.userDefinedTypeStatements = userDefinedTypeStatements; - this.indexStatements = indexStatements; } public BroadcastableTableSchema getBroadcastableTableSchema() @@ -84,10 +82,4 @@ public Set getUserDefinedTypeStatements() { return userDefinedTypeStatements; } - - @NotNull - public Set getIndexStatements() - { - return indexStatements; - } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java index 322e815dd..693b07740 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java @@ -21,6 +21,7 @@ import java.io.Serializable; import java.util.List; +import java.util.Set; import com.google.common.base.Preconditions; @@ -60,6 +61,7 @@ public final class BroadcastableTableSchema implements Serializable private final TimestampOption timestampOption; private final 
String lowestCassandraVersion; private final boolean quoteIdentifiers; + private final Set indexStatements; /** * Creates a BroadcastableTableSchema from a source TableSchema. @@ -81,7 +83,8 @@ public static BroadcastableTableSchema from(@NotNull TableSchema source) source.ttlOption, source.timestampOption, source.lowestCassandraVersion, - source.quoteIdentifiers + source.quoteIdentifiers, + source.indexStatements ); } @@ -95,7 +98,8 @@ private BroadcastableTableSchema(String createStatement, TTLOption ttlOption, TimestampOption timestampOption, String lowestCassandraVersion, - boolean quoteIdentifiers) + boolean quoteIdentifiers, + Set indexStatements) { this.createStatement = createStatement; this.modificationStatement = modificationStatement; @@ -108,6 +112,7 @@ private BroadcastableTableSchema(String createStatement, this.timestampOption = timestampOption; this.lowestCassandraVersion = lowestCassandraVersion; this.quoteIdentifiers = quoteIdentifiers; + this.indexStatements = indexStatements; } public String getCreateStatement() @@ -165,6 +170,11 @@ public boolean isQuoteIdentifiers() return quoteIdentifiers; } + public Set getIndexStatements() + { + return indexStatements; + } + /** * Normalizes a row by applying type converters to each field. * This mirrors the normalize method in TableSchema but uses the broadcast-safe converters list. 
diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java index 77d5ef8d1..c223694d5 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java @@ -20,6 +20,7 @@ package org.apache.cassandra.spark.bulkwriter; import java.math.BigInteger; +import java.util.Set; import java.util.concurrent.ExecutorService; import com.google.common.collect.Range; @@ -27,6 +28,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.spark.bulkwriter.token.ReplicaAwareFailureHandler; +import org.apache.cassandra.spark.utils.CqlUtils; import org.jetbrains.annotations.NotNull; public class CassandraDirectDataTransportContext implements TransportContext.DirectDataBulkWriterContext @@ -75,6 +77,8 @@ public DirectDataTransferApi dataTransferApi() protected DirectDataTransferApi createDirectDataTransferApi() { CassandraBridge bridge = CassandraBridgeFactory.get(clusterInfo.getLowestCassandraVersion()); - return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo, schemaInfo.hasSaiIndexes()); + Set indexStatements = schemaInfo.getTableSchema().getIndexStatements(); + boolean hasSaiIndexes = !indexStatements.isEmpty() && indexStatements.stream().allMatch(CqlUtils::isSaiIndex); + return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo, hasSaiIndexes); } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java 
index 9d287c8c3..a082da296 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraSchemaInfo.java @@ -19,7 +19,6 @@ package org.apache.cassandra.spark.bulkwriter; -import java.util.Collections; import java.util.Set; import org.jetbrains.annotations.NotNull; @@ -28,18 +27,11 @@ public class CassandraSchemaInfo implements SchemaInfo { private final TableSchema tableSchema; private final Set userDefinedTypeStatements; - private final Set indexStatements; public CassandraSchemaInfo(TableSchema tableSchema, Set userDefinedTypeStatements) - { - this(tableSchema, userDefinedTypeStatements, Collections.emptySet()); - } - - public CassandraSchemaInfo(TableSchema tableSchema, Set userDefinedTypeStatements, Set indexStatements) { this.tableSchema = tableSchema; this.userDefinedTypeStatements = userDefinedTypeStatements; - this.indexStatements = indexStatements; } /** @@ -51,8 +43,7 @@ public CassandraSchemaInfo(TableSchema tableSchema, Set userDefinedTypeS public CassandraSchemaInfo(BroadcastableSchemaInfo broadcastable) { this(new TableSchema(broadcastable.getBroadcastableTableSchema()), - broadcastable.getUserDefinedTypeStatements(), - broadcastable.getIndexStatements()); + broadcastable.getUserDefinedTypeStatements()); } @Override @@ -67,11 +58,4 @@ public Set getUserDefinedTypeStatements() { return userDefinedTypeStatements; } - - @Override - @NotNull - public Set getIndexStatements() - { - return indexStatements; - } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CqlTableInfoProvider.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CqlTableInfoProvider.java index a4c8c3594..2bc117a40 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CqlTableInfoProvider.java +++ 
b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CqlTableInfoProvider.java @@ -21,6 +21,7 @@ import java.util.List; import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -157,7 +158,13 @@ public String getKeyspaceName() @Override public boolean hasSecondaryIndex() { - return cqlTable.indexCount() > 0; + return !cqlTable.indexStatements().isEmpty(); + } + + @Override + public Set getIndexStatements() + { + return cqlTable.indexStatements(); } @Override diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java index a9233057c..b21d5fd44 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SchemaInfo.java @@ -19,13 +19,10 @@ package org.apache.cassandra.spark.bulkwriter; -import java.util.Collections; import java.util.Set; import org.jetbrains.annotations.NotNull; -import org.apache.cassandra.spark.utils.CqlUtils; - /** * Provides schema information for bulk write operations. *

@@ -39,26 +36,4 @@ public interface SchemaInfo @NotNull Set getUserDefinedTypeStatements(); - - /** - * Returns the set of CREATE INDEX statements for the table, if any. - * - * @return set of CREATE INDEX CQL statements, empty if none - */ - @NotNull - default Set getIndexStatements() - { - return Collections.emptySet(); - } - - /** - * Returns whether the table has SAI indexes. - * - * @return {@code true} if the table has index statements and all of them are SAI indexes - */ - default boolean hasSaiIndexes() - { - Set indexes = getIndexStatements(); - return !indexes.isEmpty() && indexes.stream().allMatch(CqlUtils::isSaiIndex); - } } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java index 809713fd9..78bbf7f3b 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java @@ -131,7 +131,7 @@ public SortedSSTableWriter(BulkWriterContext writerContext, Path outDir, DigestA tableSchema.createStatement, tableSchema.modificationStatement, schema.getUserDefinedTypeStatements(), - schema.getIndexStatements(), + tableSchema.getIndexStatements(), writerContext.job().sstableDataSizeInMiB()); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableInfoProvider.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableInfoProvider.java index aaf1ff56f..65213c39d 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableInfoProvider.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableInfoProvider.java @@ -20,6 +20,7 @@ package org.apache.cassandra.spark.bulkwriter; import java.util.List; +import 
java.util.Set; import org.apache.cassandra.spark.common.schema.ColumnType; import org.apache.cassandra.spark.data.CqlField; @@ -45,4 +46,6 @@ public interface TableInfoProvider boolean hasSecondaryIndex(); List getColumnNames(); + + Set getIndexStatements(); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java index 98c7c1996..b19d2e0f3 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java @@ -64,6 +64,7 @@ public class TableSchema final TimestampOption timestampOption; final String lowestCassandraVersion; final boolean quoteIdentifiers; + final Set indexStatements; public TableSchema(StructType dfSchema, TableInfoProvider tableInfo, @@ -73,20 +74,6 @@ public TableSchema(StructType dfSchema, String lowestCassandraVersion, boolean quoteIdentifiers, boolean skipSecondaryIndexCheck) - { - this(dfSchema, tableInfo, writeMode, ttlOption, timestampOption, - lowestCassandraVersion, quoteIdentifiers, skipSecondaryIndexCheck, Collections.emptySet()); - } - - public TableSchema(StructType dfSchema, - TableInfoProvider tableInfo, - WriteMode writeMode, - TTLOption ttlOption, - TimestampOption timestampOption, - String lowestCassandraVersion, - boolean quoteIdentifiers, - boolean skipSecondaryIndexCheck, - Set indexStatements) { this.writeMode = writeMode; this.ttlOption = ttlOption; @@ -94,6 +81,7 @@ public TableSchema(StructType dfSchema, this.timestampOption = timestampOption; this.lowestCassandraVersion = lowestCassandraVersion; this.quoteIdentifiers = quoteIdentifiers; + this.indexStatements = tableInfo.getIndexStatements(); validateDataFrameCompatibility(dfSchema, tableInfo); validateSecondaryIndexes(tableInfo, skipSecondaryIndexCheck, indexStatements, 
lowestCassandraVersion); @@ -127,6 +115,7 @@ public TableSchema(BroadcastableTableSchema broadcastable) this.timestampOption = broadcastable.getTimestampOption(); this.lowestCassandraVersion = broadcastable.getLowestCassandraVersion(); this.quoteIdentifiers = broadcastable.isQuoteIdentifiers(); + this.indexStatements = broadcastable.getIndexStatements(); } private List getRequiredKeyColumns(TableInfoProvider tableInfo) @@ -362,6 +351,11 @@ static void validateNoSecondaryIndexes(TableInfoProvider tableInfo) } } + public Set getIndexStatements() + { + return indexStatements; + } + @VisibleForTesting static boolean isCassandra5OrLater(String version) { diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java index 99cbc6827..a45b12b76 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/CassandraDataLayer.java @@ -295,7 +295,7 @@ private int initBulkReader(@NotNull ClientConfig options) throws ExecutionExcept String fullSchema = schemaFuture.get().schema(); String createStmt = CqlUtils.extractTableSchema(fullSchema, keyspace, table); - int indexCount = CqlUtils.extractIndexCount(fullSchema, keyspace, table); + Set indexStatements = CqlUtils.extractIndexStatements(fullSchema, keyspace, table); Set udts = CqlUtils.extractUdts(fullSchema, keyspace); ReplicationFactor replicationFactor = CqlUtils.extractReplicationFactor(fullSchema, keyspace); @@ -316,7 +316,7 @@ private int initBulkReader(@NotNull ClientConfig options) throws ExecutionExcept validateReplicationFactor(replicationFactor); udts.forEach(udt -> LOGGER.info("Adding schema UDT: '{}'", udt)); - cqlTable = bridge().buildSchema(createStmt, keyspace, replicationFactor, partitioner, udts, null, indexCount, false); + cqlTable = 
bridge().buildSchema(createStmt, keyspace, replicationFactor, partitioner, udts, null, indexStatements, false); CassandraRing ring = createCassandraRingFromRing(partitioner, replicationFactor, ringFuture.get()); int effectiveNumberOfCores = sizingFuture.get(); diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTest.java index 138bbffe9..7df89d16e 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTest.java @@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.Arrays; +import java.util.Set; import com.google.common.collect.ImmutableMap; import org.apache.commons.io.FileUtils; @@ -260,10 +261,10 @@ public void testSecondaryIndexIsUnsupported() throws Exception { Path fullSchemaSampleFile = ResourceUtils.writeResourceToPath(CqlUtilsTest.class.getClassLoader(), tempPath, "cql/fullSchema.cql"); String fullSchemaSample = FileUtils.readFileToString(fullSchemaSampleFile.toFile(), StandardCharsets.UTF_8); - int indexCount = CqlUtils.extractIndexCount(fullSchemaSample, "cycling", "rank_by_year_and_name"); - assertThat(indexCount).isEqualTo(3); + Set indexStatements = CqlUtils.extractIndexStatements(fullSchemaSample, "cycling", "rank_by_year_and_name"); + assertThat(indexStatements).hasSize(3); CqlTable table = mock(CqlTable.class); - when(table.indexCount()).thenReturn(indexCount); + when(table.indexStatements()).thenReturn(indexStatements); TableInfoProvider tableInfoProvider = new CqlTableInfoProvider("", table); assertThatThrownBy(() -> TableSchema.validateNoSecondaryIndexes(tableInfoProvider)) .isInstanceOf(UnsupportedAnalyticsOperationException.class) diff --git 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java index 4122d94dd..4f64a78b9 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java @@ -21,10 +21,13 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; @@ -478,6 +481,16 @@ public boolean hasSecondaryIndex() return hasSecondaryIndex; } + @Override + public Set getIndexStatements() + { + if (hasSecondaryIndex) + { + return new HashSet<>(Collections.singletonList("CREATE INDEX test_idx ON test." 
+ uniqueTableName + " (col);")); + } + return Collections.emptySet(); + } + @Override public List getColumnNames() { diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/CqlUtilsTest.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/CqlUtilsTest.java index 5bb363d97..423ae1d51 100644 --- a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/CqlUtilsTest.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/utils/CqlUtilsTest.java @@ -321,7 +321,7 @@ public void testEscapedTableName(CassandraBridge bridge) ImmutableMap.of("datacenter1", 3)), Partitioner.Murmur3Partitioner, Collections.emptySet(), - null, 0, false); + null, Collections.emptySet(), false); assertThat(table.keyspace()).isEqualTo("ks"); assertThat(table.table()).isEqualTo("tb"); assertThat(table.getField("key").name()).isEqualTo("key"); @@ -552,7 +552,7 @@ public void testParseClusteringKeySchema(CassandraBridge bridge) new ReplicationFactor(ReplicationFactor.ReplicationStrategy.NetworkTopologyStrategy, ImmutableMap.of("datacenter1", 3)), Partitioner.Murmur3Partitioner, - Collections.emptySet(), null, 0, false); + Collections.emptySet(), null, Collections.emptySet(), false); } @ParameterizedTest diff --git a/cassandra-avro-converter/src/main/java/org/apache/cassandra/cdc/avro/CqlToAvroSchemaConverter.java b/cassandra-avro-converter/src/main/java/org/apache/cassandra/cdc/avro/CqlToAvroSchemaConverter.java index aedbc2ddd..907921c0c 100644 --- a/cassandra-avro-converter/src/main/java/org/apache/cassandra/cdc/avro/CqlToAvroSchemaConverter.java +++ b/cassandra-avro-converter/src/main/java/org/apache/cassandra/cdc/avro/CqlToAvroSchemaConverter.java @@ -69,7 +69,7 @@ default Schema convert(String keyspace, String tableCreateStatement, Set new ReplicationFactor(ReplicationFactor.ReplicationStrategy.NetworkTopologyStrategy, Collections.singletonMap("DC1", 3)), Partitioner.Murmur3Partitioner, - udts, null, 0, 
true); + udts, null, Collections.emptySet(), true); return convert(cqlTable); } diff --git a/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java b/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java index 13d856fe7..7ca12d4a7 100644 --- a/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java +++ b/cassandra-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridge.java @@ -143,7 +143,7 @@ public CqlTable buildSchema(String createStatement, Partitioner partitioner, Set udts) { - return buildSchema(createStatement, keyspace, replicationFactor, partitioner, udts, null, 0, false); + return buildSchema(createStatement, keyspace, replicationFactor, partitioner, udts, null, Collections.emptySet(), false); } public abstract CqlTable buildSchema(String createStatement, @@ -152,7 +152,7 @@ public abstract CqlTable buildSchema(String createStatement, Partitioner partitioner, Set udts, @Nullable UUID tableId, - int indexCount, + Set indexStatements, boolean enableCdc); /** diff --git a/cassandra-bridge/src/testFixtures/java/org/apache/cassandra/spark/utils/test/TestSchema.java b/cassandra-bridge/src/testFixtures/java/org/apache/cassandra/spark/utils/test/TestSchema.java index e96847496..12f5b8e53 100644 --- a/cassandra-bridge/src/testFixtures/java/org/apache/cassandra/spark/utils/test/TestSchema.java +++ b/cassandra-bridge/src/testFixtures/java/org/apache/cassandra/spark/utils/test/TestSchema.java @@ -498,7 +498,7 @@ public CqlTable buildTable() rf, allFields, udts, - 0); + Collections.emptySet()); } public void writeSSTable(TemporaryDirectory directory, diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index 1135ea999..58b93979d 100644 --- 
a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -277,10 +277,11 @@ public CqlTable buildSchema(String createStatement, Partitioner partitioner, Set udts, @Nullable UUID tableId, - int indexCount, + Set indexStatements, boolean enableCdc) { - return new SchemaBuilder(createStatement, keyspace, replicationFactor, partitioner, cassandraTypes -> udts, tableId, indexCount, enableCdc).build(); + return new SchemaBuilder(createStatement, keyspace, replicationFactor, partitioner, + cassandraTypes -> udts, tableId, indexStatements, enableCdc).build(); } @Override diff --git a/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index e4e059d21..47115c96b 100644 --- a/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -269,10 +269,11 @@ public CqlTable buildSchema(String createStatement, Partitioner partitioner, Set udts, @Nullable UUID tableId, - int indexCount, + Set indexStatements, boolean enableCdc) { - return new SchemaBuilder(createStatement, keyspace, replicationFactor, partitioner, cassandraTypes -> udts, tableId, indexCount, enableCdc).build(); + return new SchemaBuilder(createStatement, keyspace, replicationFactor, partitioner, + cassandraTypes -> udts, tableId, indexStatements, enableCdc).build(); } @Override diff --git a/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java b/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java index 5d0493a1f..941071b62 100644 --- 
a/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java +++ b/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java @@ -84,7 +84,7 @@ public class SchemaBuilder private final String keyspace; private final ReplicationFactor replicationFactor; private final CassandraTypes cassandraTypes; - private final int indexCount; + private final Set indexStatements; public SchemaBuilder(CqlTable table, Partitioner partitioner, boolean enableCdc) { @@ -104,14 +104,15 @@ public SchemaBuilder(CqlTable table, Partitioner partitioner, UUID tableId, bool partitioner, table::udtCreateStmts, tableId, - 0, + Collections.emptySet(), enableCdc); } @VisibleForTesting public SchemaBuilder(String createStmt, String keyspace, ReplicationFactor replicationFactor) { - this(createStmt, keyspace, replicationFactor, Partitioner.Murmur3Partitioner, bridge -> Collections.emptySet(), null, 0, false); + this(createStmt, keyspace, replicationFactor, Partitioner.Murmur3Partitioner, bridge -> Collections.emptySet(), + null, Collections.emptySet(), false); } @VisibleForTesting @@ -120,7 +121,8 @@ public SchemaBuilder(String createStmt, ReplicationFactor replicationFactor, Partitioner partitioner) { - this(createStmt, keyspace, replicationFactor, partitioner, bridge -> Collections.emptySet(), null, 0, false); + this(createStmt, keyspace, replicationFactor, partitioner, bridge -> Collections.emptySet(), null, + Collections.emptySet(), false); } public SchemaBuilder(String createStmt, @@ -129,14 +131,14 @@ public SchemaBuilder(String createStmt, Partitioner partitioner, Function> udtStatementsProvider, @Nullable UUID tableId, - int indexCount, + Set indexStatements, boolean enableCdc) { this.createStmt = createStmt; this.keyspace = keyspace; this.replicationFactor = replicationFactor; this.cassandraTypes = new CassandraTypesImplementation(); - this.indexCount = indexCount; + this.indexStatements = indexStatements; Pair updated = 
CassandraSchema.apply(schema -> updateSchema(schema, @@ -477,7 +479,7 @@ public CqlTable build() replicationFactor, fields, new HashSet<>(udts.values()), - indexCount); + indexStatements); } private Map buildsUdts(KeyspaceMetadata keyspaceMetadata) From 556ebe5b986ffba082928bc7db099265c97c11c0 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Tue, 16 Jun 2026 14:08:58 +0100 Subject: [PATCH 4/6] integrate with sidecar changes --- CHANGES.txt | 1 + .../common/data/SSTableImportOptions.java | 33 +++++++++++-------- .../common/request/ImportSSTableRequest.java | 28 ++++++++-------- .../CreateRestoreJobRequestPayloadTest.java | 10 +++--- .../bulkwriter/SidecarDataTransferApi.java | 6 ++-- .../build.gradle | 3 ++ .../SharedClusterIntegrationTestBase.java | 2 ++ gradle.properties | 2 +- 8 files changed, 49 insertions(+), 36 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 58d4cf960..ea1f42124 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,6 @@ 0.5.0 ----- + * Upgrade sidecar version to 0.4.0 * Regenerate bloom filters for CQLSSTableWriter (CASSANALYTICS-167) * Avoid Spark 4 partitioning warnings during bulk reads (CASSANALYTICS-171) * Spark 4.0 Support (CASSANALYTICS-34) diff --git a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java index d5a584662..f9c62c448 100644 --- a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java +++ b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java @@ -18,14 +18,19 @@ package org.apache.cassandra.sidecar.common.data; -import java.util.HashMap; +import java.util.LinkedHashMap; /** * Options for Cassandra import nodetool command. It is like properties. * Supports Json serialization and deserialization. + *

+ * This class is used exclusively in the S3 restore job path ({@code CreateRestoreJobRequestPayload}). */ -public class SSTableImportOptions extends HashMap +public class SSTableImportOptions extends LinkedHashMap { + public static final boolean DEFAULT_FAIL_ON_MISSING_INDEX = false; + public static final boolean DEFAULT_VALIDATE_INDEX_CHECKSUM = false; + private static final String RESET_LEVEL = "resetLevel"; private static final String CLEAR_REPAIRED = "clearRepaired"; private static final String VERIFY_SSTABLES = "verifySSTables"; @@ -33,8 +38,8 @@ public class SSTableImportOptions extends HashMap private static final String INVALIDATE_CACHES = "invalidateCaches"; private static final String EXTENDED_VERIFY = "extendedVerify"; private static final String COPY_DATA = "copyData"; - private static final String VALIDATE_SAI_INDEXES = "validateSaiIndexes"; - private static final String SAI_INDEX_CHECKSUM = "saiIndexChecksum"; + private static final String FAIL_ON_MISSING_INDEX = "failOnMissingIndex"; + private static final String VALIDATE_INDEX_CHECKSUM = "validateIndexChecksum"; public static SSTableImportOptions defaults() { @@ -50,8 +55,8 @@ private SSTableImportOptions() put(INVALIDATE_CACHES, Boolean.toString(true)); put(EXTENDED_VERIFY, Boolean.toString(true)); put(COPY_DATA, Boolean.toString(false)); // note: the default is false - put(VALIDATE_SAI_INDEXES, Boolean.toString(false)); // note: the default is false - put(SAI_INDEX_CHECKSUM, Boolean.toString(false)); // note: the default is false + put(FAIL_ON_MISSING_INDEX, Boolean.toString(DEFAULT_FAIL_ON_MISSING_INDEX)); + put(VALIDATE_INDEX_CHECKSUM, Boolean.toString(DEFAULT_VALIDATE_INDEX_CHECKSUM)); } public SSTableImportOptions resetLevel(boolean enabled) @@ -131,25 +136,25 @@ public boolean copyData() return Boolean.parseBoolean(get(COPY_DATA)); } - public SSTableImportOptions validateSaiIndexes(boolean enabled) + public SSTableImportOptions failOnMissingIndex(boolean enabled) { - 
put(VALIDATE_SAI_INDEXES, Boolean.toString(enabled)); + put(FAIL_ON_MISSING_INDEX, Boolean.toString(enabled)); return this; } - public boolean validateSaiIndexes() + public boolean failOnMissingIndex() { - return Boolean.parseBoolean(get(VALIDATE_SAI_INDEXES)); + return Boolean.parseBoolean(get(FAIL_ON_MISSING_INDEX)); } - public SSTableImportOptions saiIndexChecksum(boolean enabled) + public SSTableImportOptions validateIndexChecksum(boolean enabled) { - put(SAI_INDEX_CHECKSUM, Boolean.toString(enabled)); + put(VALIDATE_INDEX_CHECKSUM, Boolean.toString(enabled)); return this; } - public boolean saiIndexChecksum() + public boolean validateIndexChecksum() { - return Boolean.parseBoolean(get(SAI_INDEX_CHECKSUM)); + return Boolean.parseBoolean(get(VALIDATE_INDEX_CHECKSUM)); } } diff --git a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java index 9c14f6a2c..6d5477b4c 100644 --- a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java +++ b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/request/ImportSSTableRequest.java @@ -65,8 +65,8 @@ public static class ImportOptions private Boolean invalidateCaches; private Boolean extendedVerify; private Boolean copyData; - private Boolean validateSaiIndexes; - private Boolean saiIndexChecksum; + private Boolean failOnMissingIndex; + private Boolean validateIndexChecksum; public ImportOptions() { @@ -157,28 +157,28 @@ public ImportOptions copyData(boolean copyData) } /** - * Sets the {@code validateSaiIndexes} and returns a reference to this ImportOptions enabling method chaining. + * Sets the {@code failOnMissingIndex} and returns a reference to this ImportOptions enabling method chaining. 
* When enabled, Cassandra validates SAI index components during import. * - * @param validateSaiIndexes the {@code validateSaiIndexes} to set + * @param failOnMissingIndex the {@code failOnMissingIndex} to set * @return a reference to this ImportOptions */ - public ImportOptions validateSaiIndexes(boolean validateSaiIndexes) + public ImportOptions failOnMissingIndex(boolean failOnMissingIndex) { - this.validateSaiIndexes = validateSaiIndexes; + this.failOnMissingIndex = failOnMissingIndex; return this; } /** - * Sets the {@code saiIndexChecksum} and returns a reference to this ImportOptions enabling method chaining. + * Sets the {@code validateIndexChecksum} and returns a reference to this ImportOptions enabling method chaining. * When enabled, Cassandra verifies SAI index component checksums during import. * - * @param saiIndexChecksum the {@code saiIndexChecksum} to set + * @param validateIndexChecksum the {@code validateIndexChecksum} to set * @return a reference to this ImportOptions */ - public ImportOptions saiIndexChecksum(boolean saiIndexChecksum) + public ImportOptions validateIndexChecksum(boolean validateIndexChecksum) { - this.saiIndexChecksum = saiIndexChecksum; + this.validateIndexChecksum = validateIndexChecksum; return this; } } @@ -233,13 +233,13 @@ private static List selectedOptions(ImportOptions importOptions) { options.add("copyData=" + importOptions.copyData); } - if (importOptions.validateSaiIndexes != null) + if (importOptions.failOnMissingIndex != null) { - options.add("validateSaiIndexes=" + importOptions.validateSaiIndexes); + options.add("failOnMissingIndex=" + importOptions.failOnMissingIndex); } - if (importOptions.saiIndexChecksum != null) + if (importOptions.validateIndexChecksum != null) { - options.add("saiIndexChecksum=" + importOptions.saiIndexChecksum); + options.add("validateIndexChecksum=" + importOptions.validateIndexChecksum); } return options; diff --git 
a/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java b/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java index 2dea3a9bf..89eced983 100644 --- a/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java +++ b/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java @@ -67,15 +67,15 @@ void testSerDeser() throws JsonProcessingException "\"jobAgent\":\"agent\"," + "\"secrets\":" + MAPPER.writeValueAsString(secrets) + "," + "\"importOptions\":{" + - "\"validateSaiIndexes\":\"false\"," + - "\"verifyTokens\":\"true\"," + "\"resetLevel\":\"true\"," + - "\"saiIndexChecksum\":\"false\"," + "\"clearRepaired\":\"true\"," + - "\"extendedVerify\":\"true\"," + "\"verifySSTables\":\"true\"," + + "\"verifyTokens\":\"true\"," + "\"invalidateCaches\":\"true\"," + - "\"copyData\":\"false\"}," + + "\"extendedVerify\":\"true\"," + + "\"copyData\":\"false\"," + + "\"failOnMissingIndex\":\"false\"," + + "\"validateIndexChecksum\":\"false\"}," + "\"expireAt\":" + expireAt + "," + "\"consistencyLevel\":\"QUORUM\"}"); CreateRestoreJobRequestPayload test = MAPPER.readValue(json, CreateRestoreJobRequestPayload.class); diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java index 3a778c2d9..b7debb5c3 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SidecarDataTransferApi.java @@ -110,10 +110,12 @@ public RemoteCommitResult commitSSTables(CassandraInstance instance, // Always 
verify SSTables on import importOptions.verifySSTables(true).extendedVerify(!job.skipExtendedVerify()); - // When SAI index components were generated alongside SSTables, enable SAI validation on import + // When SAI index components were generated alongside SSTables, enable SAI validation on import. + // failOnMissingIndex makes the import fail (instead of silently rebuilding) when the uploaded + // SSTables are missing their SAI components, and validateIndexChecksum verifies their checksums. if (hasSaiIndexes) { - importOptions.validateSaiIndexes(true).saiIndexChecksum(true); + importOptions.failOnMissingIndex(true).validateIndexChecksum(true); } try diff --git a/cassandra-analytics-integration-framework/build.gradle b/cassandra-analytics-integration-framework/build.gradle index a88c0da9a..6c1c65b0f 100644 --- a/cassandra-analytics-integration-framework/build.gradle +++ b/cassandra-analytics-integration-framework/build.gradle @@ -86,6 +86,9 @@ dependencies { exclude(group: 'org.apache.logging.log4j') exclude(group: 'org.slf4j') exclude(group: 'ch.qos.logback') + // kafka-avro-serializer is only needed when sidecar publishes CDC events to a + // Schema-Registry-backed Kafka topic, which our in-jvm integration tests do not exercise. 
+ exclude(group: 'io.confluent') } api(group: 'org.apache.cassandra', name: 'sidecar-server-common', version: "${sidecarVersion}") } diff --git a/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java b/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java index 460d8565b..a10046865 100644 --- a/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java +++ b/cassandra-analytics-integration-framework/src/main/java/org/apache/cassandra/sidecar/testing/SharedClusterIntegrationTestBase.java @@ -738,6 +738,7 @@ static InstanceMetadata buildInstanceMetadata(Vertx vertx, hostName = ipAddress; } int port = tryGetIntConfig(config, "native_transport_port", 9042); + int storagePort = tryGetIntConfig(config, "storage_port", 7000); String[] dataDirectories = (String[]) config.get("data_file_directories"); String stagingDir = stagingDir(dataDirectories); @@ -762,6 +763,7 @@ static InstanceMetadata buildInstanceMetadata(Vertx vertx, .id(config.num()) .host(hostName) .port(port) + .storagePort(storagePort) .dataDirs(Arrays.asList(dataDirectories)) .cdcDir(config.getString("cdc_raw_directory")) .commitlogDir(config.getString("commitlog_directory")) diff --git a/gradle.properties b/gradle.properties index 30e1aa3b9..aa3b798cb 100644 --- a/gradle.properties +++ b/gradle.properties @@ -23,7 +23,7 @@ description=Apache Cassandra Analytics analyticsJDKLevel=17 cassandra40Version=4.0.17 cassandra50Version=5.0.5 -sidecarVersion=0.2.0 +sidecarVersion=0.4.0 intellijVersion=9.0.4 junitVersion=5.10.2 assertjCoreVersion=3.24.2 From bb27eed9eb4875838ee0ec734f7687e34d513f55 Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Tue, 16 Jun 2026 19:24:04 +0100 Subject: [PATCH 5/6] fix schema --- CHANGES.txt | 1 + .../cassandra/spark/common/SSTables.java | 38 ++++++++++ 
.../spark/bulkwriter/DirectStreamSession.java | 14 ++-- .../spark/bulkwriter/SortedSSTableWriter.java | 12 ++-- .../cassandra/spark/data/LocalDataLayer.java | 3 +- .../cassandra/spark/reader/SchemaBuilder.java | 72 +++++++++++++++++++ 6 files changed, 126 insertions(+), 14 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 974b9ceaa..ce86121c6 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,6 @@ 0.5.0 ----- + * Add Storage Attached Index (SAI) support to the bulk writer (CASSANALYTICS-31) * Upgrade sidecar version to 0.4.0 * Exclude IP address from RingInstance equality so node replacement does not fail bulk write jobs (CASSANALYTICS-175) * Regenerate bloom filters for CQLSSTableWriter (CASSANALYTICS-167) diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/common/SSTables.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/common/SSTables.java index 68409be71..aa41acc39 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/common/SSTables.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/common/SSTables.java @@ -22,14 +22,52 @@ import java.nio.file.Path; import org.apache.cassandra.bridge.SSTableDescriptor; +import org.apache.cassandra.spark.data.FileType; public final class SSTables { + /** + * Suffix identifying the primary SSTable data component, e.g. "-Data.db". + * The leading '-' is significant: it excludes SAI per-index components such as + * "...+TermsData.db" that also end with "Data.db". + */ + private static final String DATA_COMPONENT_SUFFIX = "-" + FileType.DATA.getFileSuffix(); + + /** + * Glob matching primary SSTable data components, e.g. "*-Data.db". + * Suitable for {@link java.nio.file.Files#newDirectoryStream(Path, String)}. 
+ */ + public static final String DATA_COMPONENT_GLOB = "*" + DATA_COMPONENT_SUFFIX; + private SSTables() { throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); } + /** + * Determine whether the given file name is a primary SSTable data component ("<descriptor>-Data.db"). + * The leading '-' check excludes SAI per-index components such as "...+TermsData.db" which also end with "Data.db". + * + * @param fileName file name (not a full path) + * @return true if the name is a primary data component + */ + public static boolean isDataComponent(String fileName) + { + return fileName.endsWith(DATA_COMPONENT_SUFFIX); + } + + /** + * Determine whether the given path is a primary SSTable data component ("<descriptor>-Data.db"). + * + * @param path file path + * @return true if the path's file name is a primary data component + * @see #isDataComponent(String) + */ + public static boolean isDataComponent(Path path) + { + return isDataComponent(path.getFileName().toString()); + } + /** * Get the sstable base name from data file path. 
* For example, the base name of data file '/path/to/table/nb-1-big-Data.db' is 'nb-1-big' diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSession.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSession.java index 7af617090..c661bbc7e 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSession.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/DirectStreamSession.java @@ -43,7 +43,6 @@ import org.apache.cassandra.spark.bulkwriter.token.ReplicaAwareFailureHandler; import org.apache.cassandra.spark.common.Digest; import org.apache.cassandra.spark.common.SSTables; -import org.apache.cassandra.spark.data.FileType; import org.apache.cassandra.util.IntWrapper; public class DirectStreamSession extends StreamSession @@ -86,11 +85,12 @@ protected void onSSTablesProduced(Set sstables) // 3. send the sstables to all replicas // 4. 
remove the sstables once sent Map fileDigests = sstableWriter.prepareSStablesToSend(writerContext, sstables); - // retain only the SSTable data components + // retain only the primary SSTable data components ("-Data.db"); SAI per-index + // components such as "...+TermsData.db" (which also end with "Data.db") are excluded IntWrapper sstableCounter = new IntWrapper(); fileDigests.keySet() .stream() - .filter(p -> p.getFileName().toString().endsWith(FileType.DATA.getFileSuffix())) + .filter(SSTables::isDataComponent) .forEach(sstable -> { sstableCounter.value++; sendSStableToReplicas(sstable); @@ -153,7 +153,8 @@ protected StreamResult doFinalizeStream() @Override protected void sendRemainingSSTables() { - try (DirectoryStream dataFileStream = Files.newDirectoryStream(sstableWriter.getOutDir(), "*Data.db")) + try (DirectoryStream dataFileStream = Files.newDirectoryStream(sstableWriter.getOutDir(), + SSTables.DATA_COMPONENT_GLOB)) { for (Path dataFile : dataFileStream) { @@ -224,8 +225,9 @@ private void sendSSTableToOneReplica(Path dataFile, { for (Path componentFile : componentFileStream) { - // send data component the last - if (componentFile.getFileName().toString().endsWith("Data.db")) + // send the primary data component ("-Data.db") last; SAI per-index components + // such as "...+TermsData.db" are still streamed here rather than skipped + if (SSTables.isDataComponent(componentFile)) { continue; } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java index be6d7e8ef..02453c8e5 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/SortedSSTableWriter.java @@ -44,7 +44,6 @@ import org.apache.cassandra.spark.common.Digest; import 
org.apache.cassandra.spark.common.SSTables; import org.apache.cassandra.spark.data.FileSystemSSTable; -import org.apache.cassandra.spark.data.FileType; import org.apache.cassandra.spark.data.LocalDataLayer; import org.apache.cassandra.spark.data.partitioner.Partitioner; import org.apache.cassandra.spark.reader.RowData; @@ -237,7 +236,7 @@ public synchronized Map prepareSStablesToSend(@NotNull BulkWriterC { for (Path path : stream) { - if (path.getFileName().toString().endsWith("-" + FileType.DATA.getFileSuffix())) + if (SSTables.isDataComponent(path)) { dataFilePaths.add(path); sstableCount += 1; @@ -388,11 +387,10 @@ private LocalDataLayer buildLocalDataLayer(@NotNull BulkWriterContext writerCont private DirectoryStream getDataFileStream(DirectoryStream.Filter filter) throws IOException { - // Combine the data file filter with the provided filter - DirectoryStream.Filter combinedFilter = path -> { - String fileName = path.getFileName().toString(); - return fileName.endsWith("Data.db") && filter.accept(path); - }; + // Combine the data file filter with the provided filter. + // Match only the primary SSTable data component ("-Data.db"); the leading '-' excludes + // SAI per-index components such as "...+TermsData.db" which also end with "Data.db". 
+ DirectoryStream.Filter combinedFilter = path -> SSTables.isDataComponent(path) && filter.accept(path); return Files.newDirectoryStream(getOutDir(), combinedFilter); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java index 0a30cde03..077198587 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/data/LocalDataLayer.java @@ -51,6 +51,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.bridge.CassandraVersion; +import org.apache.cassandra.spark.common.SSTables; import org.apache.cassandra.spark.config.SchemaFeature; import org.apache.cassandra.spark.config.SchemaFeatureSet; import org.apache.cassandra.spark.data.partitioner.Partitioner; @@ -365,7 +366,7 @@ public SSTablesSupplier sstables(int partitionId, .stream(paths) .map(Paths::get) .flatMap(Throwing.function(Files::list)) - .filter(path -> path.getFileName().toString().endsWith("-" + FileType.DATA.getFileSuffix())); + .filter(SSTables::isDataComponent); } return new BasicSupplier(dataFilePathsStream diff --git a/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java b/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java index 941071b62..70c81cfcb 100644 --- a/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java +++ b/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java @@ -44,6 +44,7 @@ import org.apache.cassandra.cql3.CQL3Type; import org.apache.cassandra.cql3.CQLFragmentParser; import org.apache.cassandra.cql3.CqlParser; +import org.apache.cassandra.cql3.statements.schema.CreateIndexStatement; 
import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; import org.apache.cassandra.cql3.statements.schema.CreateTypeStatement; import org.apache.cassandra.db.Keyspace; @@ -58,11 +59,14 @@ import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; +import org.apache.cassandra.schema.Tables; import org.apache.cassandra.schema.Types; +import org.apache.cassandra.service.ClientState; import org.apache.cassandra.spark.data.CassandraTypes; import org.apache.cassandra.spark.data.CqlField; import org.apache.cassandra.spark.data.CqlTable; @@ -148,6 +152,7 @@ public SchemaBuilder(String createStmt, partitioner, this.replicationFactor, tableId, enableCdc, + this.indexStatements, this::validateColumnMetaData)); this.keyspaceMetadata = updated.left; this.metadata = updated.right; @@ -164,6 +169,7 @@ private static Pair updateSchema(Schema schema, ReplicationFactor replicationFactor, UUID tableId, boolean enableCdc, + Set indexStatements, Consumer columnValidator) { // Set up and open keyspace if needed @@ -224,6 +230,8 @@ private static Pair updateSchema(Schema schema, } tableMetadata.columns().forEach(columnValidator); + tableMetadata = maybeApplyStorageAttachedIndexes(schema, keyspace, tableMetadata, + maybeExistingTableMetadata, indexStatements); setupTableAndUdt(schema, keyspace, tableMetadata, types); return validateKeyspaceTable(schema, keyspace, tableMetadata.name); @@ -341,6 +349,70 @@ private static boolean tableInstanceExists(Schema schema, String keyspaceName, S return true; } + /** + * Ensures the table metadata registered in the schema carries its Storage Attached Index (SAI) definitions. + *
<p>
+ * {@code buildSchema} is invoked repeatedly for the same table within a JVM — once with the index statements + * (from {@code AbstractBulkWriterContext}) and once per partition with an empty index set (from + * {@code RecordWriter}). To keep the registered metadata stable across these calls: + * <ul>
+ * <li>If the table is already registered with indexes, reuse that metadata verbatim. Rebuilding would either + * drop the indexes (index-less callers) or mint fresh non-deterministic index ids (indexed callers), and + * either difference forces a table-metadata update that fails with {@code AlreadyExistsException}.</li>
+ * <li>Otherwise apply only the SAI statements; legacy (2i) and empty index sets are left untouched, so non-SAI + * and Cassandra 4.0 paths are unchanged.</li>
+ * </ul>
+ */ + private static TableMetadata maybeApplyStorageAttachedIndexes(Schema schema, + String keyspace, + TableMetadata tableMetadata, + TableMetadata existingTableMetadata, + Set indexStatements) + { + if (existingTableMetadata != null && !existingTableMetadata.indexes.isEmpty()) + { + return existingTableMetadata; + } + + if (indexStatements == null || indexStatements.isEmpty()) + { + return tableMetadata; + } + + List saiStatements = indexStatements.stream() + .filter(SchemaBuilder::isStorageAttachedIndex) + .collect(Collectors.toList()); + if (saiStatements.isEmpty()) + { + return tableMetadata; + } + + KeyspaceMetadata keyspaceMetadata = schema.getKeyspaceMetadata(keyspace); + Keyspaces keyspaces = Keyspaces.of(keyspaceMetadata.withSwapped(Tables.of(tableMetadata))); + ClientState state = ClientState.forInternalCalls(); + for (String saiStatement : saiStatements) + { + CreateIndexStatement.Raw raw = CQLFragmentParser.parseAny(CqlParser::createIndexStatement, + saiStatement, "CREATE INDEX"); + keyspaces = raw.prepare(state).apply(keyspaces); + } + + TableMetadata withIndexes = keyspaces.get(keyspace) + .flatMap(ks -> ks.tables.get(tableMetadata.name)) + .orElseThrow(() -> new IllegalStateException( + "SAI index application produced no table metadata for " + + keyspace + '.' + tableMetadata.name)); + + LOGGER.info("Applied {} SAI index(es) to table metadata keyspace={} table={}", + saiStatements.size(), keyspace, tableMetadata.name); + return withIndexes; + } + + private static boolean isStorageAttachedIndex(String createIndexStatement) + { + return createIndexStatement.toUpperCase().contains("USING 'STORAGEATTACHEDINDEX'"); + } + // Check whether keyspace metadata exists. Create keyspace metadata, if not. // Check whether keyspace instance is opened. Open the keyspace, if not. 
// NOTE: It is possible that external code that just creates metadata, but does not open the keyspace From d3d583dd898fd5292eb6c907d9b839e4488b02ee Mon Sep 17 00:00:00 2001 From: Shailaja Koppu Date: Tue, 16 Jun 2026 20:54:25 +0100 Subject: [PATCH 6/6] SAI support in restore path --- .../common/data/SSTableImportOptions.java | 19 +- .../CreateRestoreJobRequestPayloadTest.java | 8 +- .../BufferingCommitLogReaderTests.java | 2 +- .../cassandra/spark/utils/CqlUtils.java | 30 ++- .../bulkwriter/BroadcastableSchemaInfo.java | 4 +- .../bulkwriter/BroadcastableTableSchema.java | 2 +- .../CassandraBulkSourceRelation.java | 11 + .../CassandraDirectDataTransportContext.java | 3 +- .../spark/bulkwriter/TableSchema.java | 32 ++- .../cloudstorage/SSTableLister.java | 3 +- .../bulkwriter/TableSchemaTestCommon.java | 1 + .../analytics/BulkWriteSAIIndexTest.java | 9 + .../BulkWriteS3CompatModeSAIIndexTest.java | 245 ++++++++++++++++++ .../bridge/CassandraBridgeImplementation.java | 11 +- .../bridge/SSTableWriterImplementation.java | 14 +- .../reader/StorageAttachedIndexApplier.java | 114 ++++++++ .../bridge/CassandraBridgeImplementation.java | 2 + .../cassandra/spark/reader/SchemaBuilder.java | 86 +----- 18 files changed, 487 insertions(+), 109 deletions(-) create mode 100644 cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/testcontainer/BulkWriteS3CompatModeSAIIndexTest.java create mode 100644 cassandra-five-zero-types/src/main/java/org/apache/cassandra/spark/reader/StorageAttachedIndexApplier.java diff --git a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java index f9c62c448..168b08a32 100644 --- a/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java +++ 
b/analytics-sidecar-client-common/src/main/java/org/apache/cassandra/sidecar/common/data/SSTableImportOptions.java @@ -18,19 +18,14 @@ package org.apache.cassandra.sidecar.common.data; -import java.util.LinkedHashMap; +import java.util.HashMap; /** * Options for Cassandra import nodetool command. It is like properties. * Supports Json serialization and deserialization. - *
<p>
- * This class is used exclusively in the S3 restore job path ({@code CreateRestoreJobRequestPayload}). */ -public class SSTableImportOptions extends LinkedHashMap +public class SSTableImportOptions extends HashMap { - public static final boolean DEFAULT_FAIL_ON_MISSING_INDEX = false; - public static final boolean DEFAULT_VALIDATE_INDEX_CHECKSUM = false; - private static final String RESET_LEVEL = "resetLevel"; private static final String CLEAR_REPAIRED = "clearRepaired"; private static final String VERIFY_SSTABLES = "verifySSTables"; @@ -55,8 +50,6 @@ private SSTableImportOptions() put(INVALIDATE_CACHES, Boolean.toString(true)); put(EXTENDED_VERIFY, Boolean.toString(true)); put(COPY_DATA, Boolean.toString(false)); // note: the default is false - put(FAIL_ON_MISSING_INDEX, Boolean.toString(DEFAULT_FAIL_ON_MISSING_INDEX)); - put(VALIDATE_INDEX_CHECKSUM, Boolean.toString(DEFAULT_VALIDATE_INDEX_CHECKSUM)); } public SSTableImportOptions resetLevel(boolean enabled) @@ -136,6 +129,10 @@ public boolean copyData() return Boolean.parseBoolean(get(COPY_DATA)); } + /** + * When enabled, SSTable import fails if a table with Storage Attached Indexes (SAI) is missing its index + * components, instead of silently rebuilding them. Only meaningful for SAI tables (Cassandra 5.0+). + */ public SSTableImportOptions failOnMissingIndex(boolean enabled) { put(FAIL_ON_MISSING_INDEX, Boolean.toString(enabled)); @@ -147,6 +144,10 @@ public boolean failOnMissingIndex() return Boolean.parseBoolean(get(FAIL_ON_MISSING_INDEX)); } + /** + * When enabled, SSTable import validates the checksums of Storage Attached Index (SAI) components. + * Only meaningful for SAI tables (Cassandra 5.0+). 
+ */ public SSTableImportOptions validateIndexChecksum(boolean enabled) { put(VALIDATE_INDEX_CHECKSUM, Boolean.toString(enabled)); diff --git a/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java b/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java index 89eced983..5f064718a 100644 --- a/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java +++ b/analytics-sidecar-client-common/src/test/java/org/apache/cassandra/sidecar/common/request/CreateRestoreJobRequestPayloadTest.java @@ -67,15 +67,13 @@ void testSerDeser() throws JsonProcessingException "\"jobAgent\":\"agent\"," + "\"secrets\":" + MAPPER.writeValueAsString(secrets) + "," + "\"importOptions\":{" + + "\"verifyTokens\":\"true\"," + "\"resetLevel\":\"true\"," + "\"clearRepaired\":\"true\"," + + "\"extendedVerify\":\"true\"," + "\"verifySSTables\":\"true\"," + - "\"verifyTokens\":\"true\"," + "\"invalidateCaches\":\"true\"," + - "\"extendedVerify\":\"true\"," + - "\"copyData\":\"false\"," + - "\"failOnMissingIndex\":\"false\"," + - "\"validateIndexChecksum\":\"false\"}," + + "\"copyData\":\"false\"}," + "\"expireAt\":" + expireAt + "," + "\"consistencyLevel\":\"QUORUM\"}"); CreateRestoreJobRequestPayload test = MAPPER.readValue(json, CreateRestoreJobRequestPayload.class); diff --git a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/db/commitlog/BufferingCommitLogReaderTests.java b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/db/commitlog/BufferingCommitLogReaderTests.java index b273f6ce3..8fdb72ffb 100644 --- a/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/db/commitlog/BufferingCommitLogReaderTests.java +++ b/cassandra-analytics-cdc/src/test/java/org/apache/cassandra/db/commitlog/BufferingCommitLogReaderTests.java @@ -89,7 +89,7 @@ public void 
testReaderSeek(CassandraVersion version) ReplicationFactor.simpleStrategy(1), Partitioner.Murmur3Partitioner, Collections.emptySet(), - null, 0, true); + null, Collections.emptySet(), true); int numRows = 1000; // write some rows to a CommitLog diff --git a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java index f7dcffc86..b613f72d9 100644 --- a/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java +++ b/cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/utils/CqlUtils.java @@ -25,6 +25,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; @@ -59,6 +60,9 @@ public final class CqlUtils private static final Pattern ESCAPED_DOUBLE_BACKSLASH = Pattern.compile("\\\\"); private static final Pattern COMPACTION_STRATEGY_PATTERN = Pattern.compile("compaction\\s*=\\s*\\{\\s*'class'\\s*:\\s*'([^']+)'"); + private static final Pattern MULTI_WHITESPACE_PATTERN = Pattern.compile("\\s+"); + private static final Pattern SAI_USING_PATTERN = Pattern.compile("USING '[^']*STORAGEATTACHEDINDEX'"); + private CqlUtils() { throw new IllegalStateException(getClass() + " is static utility class and shall not be instantiated"); @@ -288,13 +292,37 @@ public static Set extractIndexStatements(@NotNull String schemaStr, /** * Returns true if the given CREATE INDEX statement defines a Storage Attached Index (SAI). + *
<p>
+ * SAI class may appear either as the short name ("StorageAttachedIndex") or fully qualified + * ("org.apache.cassandra.index.sai.StorageAttachedIndex"). * * @param createIndexStatement a CREATE INDEX CQL statement * @return true if the index uses SAI */ public static boolean isSaiIndex(@NotNull String createIndexStatement) { - return createIndexStatement.toUpperCase().contains("USING 'STORAGEATTACHEDINDEX'"); + // Matches any run of whitespace (spaces, tabs, newlines). Used to collapse a CREATE INDEX statement to + // single-spaced text so the SAI marker can be matched regardless of how the schema text was formatted + // (e.g. "USING 'StorageAttachedIndex'", a newline before USING, or a tab all normalize to one space). + String normalized = MULTI_WHITESPACE_PATTERN.matcher(createIndexStatement).replaceAll(" ").toUpperCase(Locale.ROOT); + + // Matches the SAI marker inside a USING '...' clause (statement already upper-cased and whitespace-collapsed). + // The leading [^']* tolerates the fully-qualified class form + // (e.g. 'org.apache.cassandra.index.sai.StorageAttachedIndex') as well as the short name. + return SAI_USING_PATTERN.matcher(normalized).find(); + } + + /** + * Returns true when {@code indexStatements} is non-empty and every statement defines a Storage Attached Index. + * This is the single "all-SAI table" predicate shared by schema validation and the write/commit paths, so the + * decision to generate SAI components and the decision to enable SAI import options can never disagree. 
+ * + * @param indexStatements the CREATE INDEX statements for a table + * @return true if all indexes are SAI (and at least one exists) + */ + public static boolean hasOnlySaiIndexes(@NotNull Set indexStatements) + { + return !indexStatements.isEmpty() && indexStatements.stream().allMatch(CqlUtils::isSaiIndex); } /** diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java index 2ad28ffc5..9c0a164b6 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableSchemaInfo.java @@ -28,8 +28,6 @@ * Broadcastable wrapper for schema information with ZERO transient fields to optimize Spark broadcasting. *
<p>
* Contains BroadcastableTableSchema (pre-computed schema data) and UDT statements. - * Index statements are now stored in CqlTable (serialized via Kryo in BroadcastableTableSchema) - * and accessed through TableSchema. * Executors reconstruct CassandraSchemaInfo and TableSchema from these fields. *
<p>
* Why ZERO transient fields matters:
@@ -45,7 +43,7 @@ */ public final class BroadcastableSchemaInfo implements Serializable { - private static final long serialVersionUID = 4719283056718294301L; + private static final long serialVersionUID = -8727074052066841748L; // Essential fields broadcast to executors private final BroadcastableTableSchema broadcastableTableSchema; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java index 693b07740..41afd5ebb 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/BroadcastableTableSchema.java @@ -47,7 +47,7 @@ */ public final class BroadcastableTableSchema implements Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 2L; // All fields from TableSchema needed for reconstruction on executors private final String createStatement; diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkSourceRelation.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkSourceRelation.java index 7ba220de3..1e695f27a 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkSourceRelation.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraBulkSourceRelation.java @@ -26,6 +26,7 @@ import java.util.Iterator; import java.util.List; import java.util.Objects; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -502,11 +503,21 @@ private CreateRestoreJobRequestPayload.Builder createJobPayloadBuilder(JobInfo j @org.jetbrains.annotations.Nullable 
String credentialTypeName) { CreateRestoreJobRequestPayload.Builder builder = CreateRestoreJobRequestPayload.builder(secrets, updatedLeaseTime()); + Set indexStatements = writerContext.schema().getTableSchema().getIndexStatements(); + boolean hasSaiIndexes = TableSchema.isSaiWrite(indexStatements, writerContext.cluster().getLowestCassandraVersion()); builder.jobAgent(BuildInfo.APPLICATION_NAME) .jobId(job.getRestoreJobId(clusterId)) .updateImportOptions(importOptions -> { importOptions.verifySSTables(true) // we disallow the end-user to bypass the non-extended verify anymore .extendedVerify(false); // always turn off + + // When the table has SAI indexes, the bulk writer generates SAI components alongside the + // SSTables (Cassandra 5.0+). Enable SAI validation on import so that a slice missing its + // index components fails instead of silently rebuilding, mirroring the direct write path. + if (hasSaiIndexes) + { + importOptions.failOnMissingIndex(true).validateIndexChecksum(true); + } }); if (credentialTypeName != null) { diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java index 1ced66cdf..157ccdccc 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/CassandraDirectDataTransportContext.java @@ -28,7 +28,6 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; import org.apache.cassandra.spark.bulkwriter.token.ReplicaAwareFailureHandler; -import org.apache.cassandra.spark.utils.CqlUtils; import org.jetbrains.annotations.NotNull; public class CassandraDirectDataTransportContext implements TransportContext.DirectDataBulkWriterContext @@ -78,7 +77,7 @@ 
protected DirectDataTransferApi createDirectDataTransferApi() { CassandraBridge bridge = CassandraBridgeFactory.get(clusterInfo.getLowestCassandraVersion()); Set indexStatements = schemaInfo.getTableSchema().getIndexStatements(); - boolean hasSaiIndexes = !indexStatements.isEmpty() && indexStatements.stream().allMatch(CqlUtils::isSaiIndex); + boolean hasSaiIndexes = TableSchema.isSaiWrite(indexStatements, clusterInfo.getLowestCassandraVersion()); return new SidecarDataTransferApi(clusterInfo.getCassandraContext(), bridge, jobInfo, hasSaiIndexes); } diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java index b19d2e0f3..7543220f9 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/TableSchema.java @@ -34,6 +34,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.CassandraBridgeFactory; +import org.apache.cassandra.bridge.CassandraVersionFeatures; import org.apache.cassandra.spark.common.schema.ColumnType; import org.apache.cassandra.spark.data.CqlField; import org.apache.cassandra.spark.exception.UnsupportedAnalyticsOperationException; @@ -322,13 +323,10 @@ static void validateSecondaryIndexes(TableInfoProvider tableInfo, return; // No indexes — nothing to validate } - boolean allSai = !indexStatements.isEmpty() && indexStatements.stream().allMatch(CqlUtils::isSaiIndex); - boolean isCassandra5OrLater = isCassandra5OrLater(lowestCassandraVersion); - - if (allSai && isCassandra5OrLater) + if (isSaiWrite(indexStatements, lowestCassandraVersion)) { - LOGGER.info("Table has SAI indexes only on Cassandra 5.0+. SAI index components will be generated " - + "alongside SSTables for immediate queryability after import. 
indexCount={}", indexStatements.size()); + LOGGER.info("Table has SAI indexes on Cassandra 5.0+. SAI index components will be generated " + + "alongside SSTables. indexCount={}", indexStatements.size()); return; } @@ -363,17 +361,33 @@ static boolean isCassandra5OrLater(String version) { return false; } + try { - int majorVersion = Integer.parseInt(version.split("\\.")[0]); - return majorVersion >= 5; + return CassandraVersionFeatures.cassandraVersionFeaturesFromCassandraVersion(version).getMajorVersion() >= 50; } - catch (NumberFormatException exception) + catch (RuntimeException exception) { + // Unparseable version string — treat as "not 5.0+". return false; } } + /** + * Returns true when the bulk writer generates SAI components alongside SSTables for this table, i.e. the + * table's indexes are all SAI and the cluster is Cassandra 5.0+. This single predicate is shared by schema + * validation (to allow the write) and by the commit/restore paths (to enable SAI import options), so the + * "we generate SAI components" and "we validate SAI components on import" decisions can never diverge. 
+ * + * @param indexStatements the CREATE INDEX statements for the table + * @param lowestCassandraVersion the lowest Cassandra version in the cluster + * @return true if this is an all-SAI write on Cassandra 5.0+ + */ + static boolean isSaiWrite(Set indexStatements, String lowestCassandraVersion) + { + return CqlUtils.hasOnlySaiIndexes(indexStatements) && isCassandra5OrLater(lowestCassandraVersion); + } + private static List getKeyFieldPositions(StructType dfSchema, List columnNames, List keyFieldNames) diff --git a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/SSTableLister.java b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/SSTableLister.java index 2897956dc..5d3ff9759 100644 --- a/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/SSTableLister.java +++ b/cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/bulkwriter/cloudstorage/SSTableLister.java @@ -43,6 +43,7 @@ import org.apache.cassandra.bridge.CassandraBridge; import org.apache.cassandra.bridge.SSTableSummary; import org.apache.cassandra.spark.common.Digest; +import org.apache.cassandra.spark.common.SSTables; import org.apache.cassandra.spark.data.FileSystemSSTable; import org.apache.cassandra.spark.data.QualifiedTableName; import org.apache.cassandra.spark.data.SSTable; @@ -218,7 +219,7 @@ private String getSSTablePrefix(String componentName) private SSTable buildSSTable(List components) { List dataComponents = components.stream() - .filter(path -> path.getFileName().toString().contains("Data.db")) + .filter(SSTables::isDataComponent) .collect(Collectors.toList()); if (dataComponents.size() != 1) { diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java index 3a6441725..630b87f30 100644 --- 
a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java +++ b/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/bulkwriter/TableSchemaTestCommon.java @@ -505,6 +505,7 @@ public Set getIndexStatements() { return new HashSet<>(Collections.singletonList("CREATE INDEX test_idx ON test." + uniqueTableName + " (col);")); } + return Collections.emptySet(); } diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java index c3a05653b..972f893bd 100644 --- a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/BulkWriteSAIIndexTest.java @@ -94,6 +94,15 @@ void testBulkWriteAndReadWithMultipleSaiIndexes() TABLE_SAI), ConsistencyLevel.ALL); assertThat(marksResults).isNotNull(); + assertThat(marksResults.length) + .as("SAI filter on marks=50 should return the matching row") + .isGreaterThan(0); + + for (Object[] row : marksResults) + { + // marks is the third column (id, course, marks) + assertThat(row[2]).isEqualTo(50); + } } /** diff --git a/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/testcontainer/BulkWriteS3CompatModeSAIIndexTest.java b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/testcontainer/BulkWriteS3CompatModeSAIIndexTest.java new file mode 100644 index 000000000..97ab488c7 --- /dev/null +++ b/cassandra-analytics-integration-tests/src/test/java/org/apache/cassandra/analytics/testcontainer/BulkWriteS3CompatModeSAIIndexTest.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.cassandra.analytics.testcontainer; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; +import java.util.stream.Stream; + +import com.google.common.collect.ImmutableMap; +import com.vdurmont.semver4j.Semver; +import org.junit.jupiter.api.Test; + +import com.adobe.testing.s3mock.testcontainers.S3MockContainer; +import org.apache.cassandra.analytics.DataGenerationUtils; +import org.apache.cassandra.analytics.SharedClusterSparkIntegrationTestBase; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.sidecar.config.S3ClientConfiguration; +import org.apache.cassandra.sidecar.config.S3ProxyConfiguration; +import org.apache.cassandra.sidecar.config.yaml.S3ClientConfigurationImpl; +import org.apache.cassandra.sidecar.config.yaml.S3ProxyConfigurationImpl; +import org.apache.cassandra.sidecar.config.yaml.SidecarConfigurationImpl; +import org.apache.cassandra.sidecar.testing.QualifiedName; +import org.apache.cassandra.testing.ClusterBuilderConfiguration; +import org.apache.cassandra.testing.TestUtils; +import org.apache.spark.sql.Dataset; +import 
org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +import static org.apache.cassandra.sidecar.config.yaml.S3ClientConfigurationImpl.DEFAULT_API_CALL_TIMEOUT; +import static org.apache.cassandra.sidecar.config.yaml.S3ClientConfigurationImpl.DEFAULT_THREAD_KEEP_ALIVE; +import static org.apache.cassandra.testing.TestUtils.CREATE_TEST_TABLE_STATEMENT; +import static org.apache.cassandra.testing.TestUtils.DC1_RF3; +import static org.apache.cassandra.testing.TestUtils.ROW_COUNT; +import static org.apache.cassandra.testing.TestUtils.TEST_KEYSPACE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assumptions.assumeThat; + +/** + * End-to-end integration test for bulk writing a table with Storage Attached Indexes (SAI) via the + * S3-compatible (cloud storage / restore) transport. + * + *

<p>This exercises both stages of the cloud-storage write path: + * <ul>
+ * <li>Stage 1 (write &amp; upload): SSTables and their SAI index components are produced on the Spark + * side by {@code CQLSSTableWriter}, bundled, and uploaded to S3. Because the restore payload now sets + * {@code failOnMissingIndex=true} for SAI tables, the subsequent import would fail if the SAI components + * were absent from the bundle — so a successful write proves the components were generated and uploaded.</li>
+ * <li>Stage 2 (restore &amp; import): the sidecar restore job downloads the slices and runs Cassandra's + * SSTable import with {@code failOnMissingIndex=true}/{@code validateIndexChecksum=true}, attaching the SAI + * components. Successful completion, the presence of SAI files on disk, and working SAI-filtered queries + * confirm the indexes are usable after restore.</li>
+ * </ul>
+ * + * SAI is a Cassandra 5.0+ feature, so the test is skipped on older clusters. + */ +class BulkWriteS3CompatModeSAIIndexTest extends SharedClusterSparkIntegrationTestBase +{ + // Must match the bucket reported by LocalStorageTransportExtension.getStorageConfiguration() + // (it imports BUCKET_NAME from BulkWriteS3CompatModeSimpleTest = "sbw-bucket"); the writer uploads + // bundles to that bucket, so the S3Mock initial bucket must use the same name or uploads 404. + public static final String BUCKET_NAME = "sbw-bucket"; + private static final QualifiedName TABLE_NAME = + new QualifiedName(TEST_KEYSPACE, BulkWriteS3CompatModeSAIIndexTest.class.getSimpleName().toLowerCase()); + private S3MockContainer s3Mock; + + /** + * Stage 1 + Stage 2 end-to-end: write a SAI table via S3_COMPAT, then verify the data and the SAI indexes + * survive the restore/import round-trip. + */ + @Test + void testS3CompatBulkWriteWithSaiIndexes() + { + SparkSession spark = getOrCreateSparkSession(); + Dataset df = DataGenerationUtils.generateCourseData(spark, ROW_COUNT); + Map s3CompatOptions = ImmutableMap.of( + "data_transport", "S3_COMPAT", + "data_transport_extension_class", LocalStorageTransportExtension.class.getCanonicalName(), + "storage_client_endpoint_override", s3Mock.getHttpEndpoint() // point to s3Mock server + ); + + // Stage 1 + Stage 2: write to S3 and restore/import. With failOnMissingIndex=true for SAI tables, this + // save() would throw if the SAI components were not generated, uploaded, and validated on import. + bulkWriterDataFrameWriter(df, TABLE_NAME, s3CompatOptions).save(); + + // Verify all rows are present after the restore/import round-trip. + sparkTestUtils.validateWrites(df.collectAsList(), queryAllData(TABLE_NAME)); + + // Stage 2 evidence: SAI index components landed on disk after import. 
+ assertThat(hasSaiIndexFiles()) + .as("SAI index files should exist on disk after S3 restore/import") + .isTrue(); + + // Stage 2 evidence: SAI indexes are attached and usable for filtering on non-key columns. + Object[][] courseResults = cluster.getFirstRunningInstance() + .coordinator() + .execute(String.format("SELECT * FROM %s WHERE course = 'course0';", TABLE_NAME), + ConsistencyLevel.ALL); + assertThat(courseResults.length) + .as("SAI filter on course='course0' should return the matching row") + .isGreaterThan(0); + for (Object[] row : courseResults) + { + // course is the second column (id, course, marks) + assertThat(row[1]).isEqualTo("course0"); + } + + Object[][] marksResults = cluster.getFirstRunningInstance() + .coordinator() + .execute(String.format("SELECT * FROM %s WHERE marks = 50;", TABLE_NAME), + ConsistencyLevel.ALL); + assertThat(marksResults.length) + .as("SAI filter on marks=50 should return the matching row") + .isGreaterThan(0); + for (Object[] row : marksResults) + { + // marks is the third column (id, course, marks) + assertThat(row[2]).isEqualTo(50); + } + } + + /** + * Checks whether SAI index component files exist on disk for the test keyspace on the first node. 
+ */ + private boolean hasSaiIndexFiles() + { + String[] dataDirs = (String[]) cluster.get(1) + .config() + .getParams() + .get("data_file_directories"); + Path keyspacePath = Paths.get(dataDirs[0], TEST_KEYSPACE); + try (Stream walkStream = Files.walk(keyspacePath)) + { + return walkStream + .filter(Files::isRegularFile) + .anyMatch(path -> { + String pathStr = path.toString(); + return pathStr.contains("SAI") || pathStr.contains(".sai"); + }); + } + catch (IOException e) + { + return false; + } + } + + @Override + protected void beforeClusterProvisioning() + { + assumeThat(TestUtils.getDTestClusterVersion().isGreaterThanOrEqualTo(new Semver("5.0", Semver.SemverType.LOOSE))) + .describedAs("Storage Attached Index (SAI) is only available in Cassandra 5.0 and above") + .isTrue(); + } + + @Override + protected void afterClusterProvisioned() + { + // must start s3Mock before starting sidecar, in order to provide the actual s3 server port to start sidecar + super.afterClusterProvisioned(); + s3Mock = new S3MockContainer("2.17.0") + .withInitialBuckets(BUCKET_NAME); + s3Mock.start(); + assertThat(s3Mock.isRunning()).isTrue(); + } + + @Override + protected void afterClusterShutdown() + { + if (s3Mock != null) + { + s3Mock.stop(); + } + } + + @Override + protected ClusterBuilderConfiguration testClusterConfiguration() + { + return super.testClusterConfiguration() + .nodesPerDc(3); + } + + @Override + protected void initializeSchemaForTest() + { + createTestKeyspace(TEST_KEYSPACE, DC1_RF3); + createTestTable(TABLE_NAME, CREATE_TEST_TABLE_STATEMENT); + cluster.schemaChangeIgnoringStoppedInstances( + String.format("CREATE INDEX ON %s(course) USING 'StorageAttachedIndex';", TABLE_NAME)); + cluster.schemaChangeIgnoringStoppedInstances( + String.format("CREATE INDEX ON %s(marks) USING 'StorageAttachedIndex';", TABLE_NAME)); + } + + @Override + protected Function configurationOverrides() + { + return builder -> { + S3ClientConfiguration s3ClientConfig = new 
S3ClientConfigurationImpl("s3-client", 4, DEFAULT_THREAD_KEEP_ALIVE, + 5242880, DEFAULT_API_CALL_TIMEOUT, + buildTestS3ProxyConfig()); + builder.s3ClientConfiguration(s3ClientConfig); + return builder; + }; + } + + @Override + protected void beforeTestStart() + { + super.beforeTestStart(); + waitForSchemaReady(30, TimeUnit.SECONDS); + } + + private S3ProxyConfiguration buildTestS3ProxyConfig() + { + return new S3MockProxyConfigurationImpl(s3Mock.getHttpEndpoint()); + } + + public static class S3MockProxyConfigurationImpl extends S3ProxyConfigurationImpl + { + S3MockProxyConfigurationImpl(String endpointOverride) + { + super(null, null, null, endpointOverride); + } + } +} diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index e4e9567bb..ad33708b4 100644 --- a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -100,6 +100,7 @@ import org.apache.cassandra.spark.reader.ReaderUtils; import org.apache.cassandra.spark.reader.RowData; import org.apache.cassandra.spark.reader.SchemaBuilder; +import org.apache.cassandra.spark.reader.StorageAttachedIndexApplier; import org.apache.cassandra.spark.reader.StreamScanner; import org.apache.cassandra.spark.reader.SummaryDbUtils; import org.apache.cassandra.spark.sparksql.CellIterator; @@ -284,8 +285,14 @@ public CqlTable buildSchema(String createStatement, Set indexStatements, boolean enableCdc) { - return new SchemaBuilder(createStatement, keyspace, replicationFactor, partitioner, - cassandraTypes -> udts, tableId, indexStatements, enableCdc).build(); + CqlTable cqlTable = new SchemaBuilder(createStatement, keyspace, replicationFactor, partitioner, + cassandraTypes -> udts, tableId, 
indexStatements, enableCdc).build(); + // SAI is a Cassandra 5.0+ feature: register any Storage Attached Index definitions on the table metadata + // after the shared SchemaBuilder has built/registered the (index-less) table. + // buildSchema is invoked repeatedly within a JVM (the index-carrying per-partition RecordWriter included), + // so this re-applies the SAI definitions whenever a rebuild left the table without them. + StorageAttachedIndexApplier.maybeApply(keyspace, cqlTable.table(), indexStatements); + return cqlTable; } @Override diff --git a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java index 600823c98..67195f210 100644 --- a/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java +++ b/cassandra-five-zero-bridge/src/main/java/org/apache/cassandra/bridge/SSTableWriterImplementation.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.Collection; +import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; @@ -40,6 +41,7 @@ import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.io.sstable.CQLSSTableWriter; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.spark.utils.CqlUtils; import org.jetbrains.annotations.NotNull; /** @@ -167,11 +169,17 @@ static CQLSSTableWriter.Builder configureBuilder(String inDirectory, .openSSTableOnProduced() .withMaxSSTableSizeInMiB(bufferSizeMB); - if (!indexCreateStatements.isEmpty()) + // Only SAI indexes are generated alongside SSTables. Filter out any non-SAI (legacy 2i) statements so they + // are never handed to CQLSSTableWriter.withIndexes, which expects SAI definitions. 
Mixed-index tables can + // only reach this point via SKIP_SECONDARY_INDEX_CHECK; their 2i indexes are rebuilt by Cassandra on import. + List saiIndexStatements = indexCreateStatements.stream() + .filter(CqlUtils::isSaiIndex) + .collect(Collectors.toList()); + if (!saiIndexStatements.isEmpty()) { - builder.withIndexes(indexCreateStatements.toArray(new String[0])); + builder.withIndexes(saiIndexStatements.toArray(new String[0])); builder.withBuildIndexes(true); - LOGGER.info("SAI index generation enabled for {} index(es)", indexCreateStatements.size()); + LOGGER.info("SAI index generation enabled for {} index(es)", saiIndexStatements.size()); } return builder; diff --git a/cassandra-five-zero-types/src/main/java/org/apache/cassandra/spark/reader/StorageAttachedIndexApplier.java b/cassandra-five-zero-types/src/main/java/org/apache/cassandra/spark/reader/StorageAttachedIndexApplier.java new file mode 100644 index 000000000..62f280a58 --- /dev/null +++ b/cassandra-five-zero-types/src/main/java/org/apache/cassandra/spark/reader/StorageAttachedIndexApplier.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.cassandra.spark.reader; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.bridge.CassandraSchema; +import org.apache.cassandra.bridge.SchemaUpdater; +import org.apache.cassandra.cql3.CQLFragmentParser; +import org.apache.cassandra.cql3.CqlParser; +import org.apache.cassandra.cql3.statements.schema.CreateIndexStatement; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.spark.utils.CqlUtils; + +/** + * Registers Storage Attached Index (SAI) definitions onto a table's metadata in the in-JVM Cassandra schema. + *

+ * SAI is a Cassandra 5.0+ feature, so this is invoked only by the 5.0 bridge, after the shared {@link SchemaBuilder} + * has registered the (index-less) table. Idempotent: a no-op if the registered table already carries indexes (the + * shared builder preserves previously-registered indexes across repeated index-less rebuilds within a JVM). + */ +public final class StorageAttachedIndexApplier +{ + private static final Logger LOGGER = LoggerFactory.getLogger(StorageAttachedIndexApplier.class); + + private StorageAttachedIndexApplier() + { + } + + /** + * Applies any Storage Attached Index definitions in {@code indexStatements} to the already-registered table. + * Non-SAI (legacy 2i) and empty index sets are ignored. Idempotent: if the registered table already carries + * indexes (e.g. a prior call within the same JVM applied them), this is a no-op. + * + * @param keyspace the keyspace of the table + * @param table the table name + * @param indexStatements the CREATE INDEX statements associated with the table (may be empty or null) + */ + public static void maybeApply(String keyspace, String table, Set indexStatements) + { + if (indexStatements == null || indexStatements.isEmpty()) + { + return; + } + + List saiStatements = indexStatements.stream() + .filter(CqlUtils::isSaiIndex) + .collect(Collectors.toList()); + if (saiStatements.isEmpty()) + { + return; + } + + CassandraSchema.update(schema -> { + TableMetadata current = schema.getTableMetadata(keyspace, table); + if (current == null) + { + throw new IllegalStateException("SAI index application found no registered table metadata for " + + keyspace + '.' + table); + } + + if (!current.indexes.isEmpty()) + { + // Indexes already registered for this table (buildSchema is invoked repeatedly within a JVM). 
+ return; + } + + KeyspaceMetadata keyspaceMetadata = schema.getKeyspaceMetadata(keyspace); + Keyspaces keyspaces = Keyspaces.of(keyspaceMetadata.withSwapped(Tables.of(current))); + ClientState state = ClientState.forInternalCalls(); + for (String saiStatement : saiStatements) + { + CreateIndexStatement.Raw raw = CQLFragmentParser.parseAny(CqlParser::createIndexStatement, + saiStatement, "CREATE INDEX"); + keyspaces = raw.prepare(state).apply(keyspaces); + } + + TableMetadata withIndexes = keyspaces.get(keyspace) + .flatMap(ks -> ks.tables.get(table)) + .orElseThrow(() -> new IllegalStateException( + "SAI index application produced no table metadata for " + + keyspace + '.' + table)); + SchemaUpdater.updateTable(schema, keyspaceMetadata, withIndexes); + LOGGER.info("Applied {} SAI index(es) to table metadata keyspace={} table={}", + saiStatements.size(), keyspace, table); + }); + } +} diff --git a/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java b/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java index 0c0926b2a..59cef5bb3 100644 --- a/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java +++ b/cassandra-four-zero-bridge/src/main/java/org/apache/cassandra/bridge/CassandraBridgeImplementation.java @@ -605,6 +605,8 @@ public SSTableWriter getSSTableWriter(String inDirectory, Set indexCreateStatements, int bufferSizeMB) { + // indexCreateStatements is intentionally ignored: SAI component generation is a Cassandra 5.0+ feature + // handled by the five-zero bridge. On 4.0, indexes are rebuilt by Cassandra after import as before. 
return new SSTableWriterImplementation(inDirectory, partitioner, createStatement, insertStatement, userDefinedTypeStatements, bufferSizeMB); } diff --git a/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java b/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java index 70c81cfcb..c3e1c09a3 100644 --- a/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java +++ b/cassandra-four-zero-types/src/main/java/org/apache/cassandra/spark/reader/SchemaBuilder.java @@ -44,7 +44,6 @@ import org.apache.cassandra.cql3.CQL3Type; import org.apache.cassandra.cql3.CQLFragmentParser; import org.apache.cassandra.cql3.CqlParser; -import org.apache.cassandra.cql3.statements.schema.CreateIndexStatement; import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; import org.apache.cassandra.cql3.statements.schema.CreateTypeStatement; import org.apache.cassandra.db.Keyspace; @@ -59,14 +58,11 @@ import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; -import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; -import org.apache.cassandra.schema.Tables; import org.apache.cassandra.schema.Types; -import org.apache.cassandra.service.ClientState; import org.apache.cassandra.spark.data.CassandraTypes; import org.apache.cassandra.spark.data.CqlField; import org.apache.cassandra.spark.data.CqlTable; @@ -152,7 +148,6 @@ public SchemaBuilder(String createStmt, partitioner, this.replicationFactor, tableId, enableCdc, - this.indexStatements, this::validateColumnMetaData)); this.keyspaceMetadata = updated.left; this.metadata = updated.right; @@ -169,7 +164,6 @@ private static Pair updateSchema(Schema 
schema, ReplicationFactor replicationFactor, UUID tableId, boolean enableCdc, - Set indexStatements, Consumer columnValidator) { // Set up and open keyspace if needed @@ -230,8 +224,20 @@ private static Pair updateSchema(Schema schema, } tableMetadata.columns().forEach(columnValidator); - tableMetadata = maybeApplyStorageAttachedIndexes(schema, keyspace, tableMetadata, - maybeExistingTableMetadata, indexStatements); + + // This builder always produces an index-less table (indexes are applied later by the 5.0 bridge), so a + // rebuild never carries indexes even if the caller passed index statements. buildSchema runs repeatedly per + // table in a JVM; if an earlier call already registered indexes, copy them onto this rebuild so it matches + // the registered table. + if (maybeExistingTableMetadata != null + && !maybeExistingTableMetadata.indexes.isEmpty() + && tableMetadata.indexes.isEmpty()) + { + tableMetadata = tableMetadata.unbuild() + .indexes(maybeExistingTableMetadata.indexes) + .build(); + } + setupTableAndUdt(schema, keyspace, tableMetadata, types); return validateKeyspaceTable(schema, keyspace, tableMetadata.name); @@ -349,70 +355,6 @@ private static boolean tableInstanceExists(Schema schema, String keyspaceName, S return true; } - /** - * Ensures the table metadata registered in the schema carries its Storage Attached Index (SAI) definitions. - *

<p> - * {@code buildSchema} is invoked repeatedly for the same table within a JVM — once with the index statements - * (from {@code AbstractBulkWriterContext}) and once per partition with an empty index set (from - * {@code RecordWriter}). To keep the registered metadata stable across these calls: - * <ul>
- * <li>If the table is already registered with indexes, reuse that metadata verbatim. Rebuilding would either - * drop the indexes (index-less callers) or mint fresh non-deterministic index ids (indexed callers), and - * either difference forces a table-metadata update that fails with {@code AlreadyExistsException}.</li>
- * <li>Otherwise apply only the SAI statements; legacy (2i) and empty index sets are left untouched, so non-SAI - * and Cassandra 4.0 paths are unchanged.</li>
- * </ul>
- */ - private static TableMetadata maybeApplyStorageAttachedIndexes(Schema schema, - String keyspace, - TableMetadata tableMetadata, - TableMetadata existingTableMetadata, - Set indexStatements) - { - if (existingTableMetadata != null && !existingTableMetadata.indexes.isEmpty()) - { - return existingTableMetadata; - } - - if (indexStatements == null || indexStatements.isEmpty()) - { - return tableMetadata; - } - - List saiStatements = indexStatements.stream() - .filter(SchemaBuilder::isStorageAttachedIndex) - .collect(Collectors.toList()); - if (saiStatements.isEmpty()) - { - return tableMetadata; - } - - KeyspaceMetadata keyspaceMetadata = schema.getKeyspaceMetadata(keyspace); - Keyspaces keyspaces = Keyspaces.of(keyspaceMetadata.withSwapped(Tables.of(tableMetadata))); - ClientState state = ClientState.forInternalCalls(); - for (String saiStatement : saiStatements) - { - CreateIndexStatement.Raw raw = CQLFragmentParser.parseAny(CqlParser::createIndexStatement, - saiStatement, "CREATE INDEX"); - keyspaces = raw.prepare(state).apply(keyspaces); - } - - TableMetadata withIndexes = keyspaces.get(keyspace) - .flatMap(ks -> ks.tables.get(tableMetadata.name)) - .orElseThrow(() -> new IllegalStateException( - "SAI index application produced no table metadata for " - + keyspace + '.' + tableMetadata.name)); - - LOGGER.info("Applied {} SAI index(es) to table metadata keyspace={} table={}", - saiStatements.size(), keyspace, tableMetadata.name); - return withIndexes; - } - - private static boolean isStorageAttachedIndex(String createIndexStatement) - { - return createIndexStatement.toUpperCase().contains("USING 'STORAGEATTACHEDINDEX'"); - } - // Check whether keyspace metadata exists. Create keyspace metadata, if not. // Check whether keyspace instance is opened. Open the keyspace, if not. // NOTE: It is possible that external code that just creates metadata, but does not open the keyspace