apache
diff --git a/‎CHANGES.txt‎
Lines changed: 1 addition & 0 deletions b/‎CHANGES.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/DynamicSizing.java‎
Lines changed: 122 additions & 0 deletions b/‎cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/DynamicSizing.java‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/TableSizeProvider.java‎
Lines changed: 36 additions & 0 deletions b/‎cassandra-analytics-common/src/main/java/org/apache/cassandra/spark/data/TableSizeProvider.java‎
Lines changed: 36 additions & 0 deletions
diff --git a/cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/DefaultSizingTest.java b/cassandra-analytics-common/src/test/java/org/apache/cassandra/spark/data/DefaultSizingTest.java
rename from cassandra-analytics-core/src/test/java/org/apache/cassandra/spark/data/DefaultSizingTest.java
rename to cassandra-analytics-common/src/test/java/org/apache/cassandra/spark/data/DefaultSizingTest.java
diff --git a/‎cassandra-analytics-common/src/test/java/org/apache/cassandra/spark/data/DynamicSizingTest.java‎
Lines changed: 95 additions & 0 deletions b/‎cassandra-analytics-common/src/test/java/org/apache/cassandra/spark/data/DynamicSizingTest.java‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/common/SizingFactory.java‎
Lines changed: 99 additions & 0 deletions b/‎cassandra-analytics-core/src/main/java/org/apache/cassandra/spark/common/SizingFactory.java‎
Lines changed: 99 additions & 0 deletions
@@ -1,4 +1,5 @@
 1.0.0
+ * Bulk Reader should dynamically size the Spark job based on estimated table size (CASSANALYTICS-36)
  * Allow getting cassandra role in Spark options for use in Sidecar requests for RBAC (CASSANALYTICS-61)
  * Fix NPE in the deserialized CassandraClusterInfoGroup (CASSANALYTICS-59)
  * Replace NotImplementedException with UnsupportedOperationException in SparkType (CASSANALYTICS-55)
 
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.data;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.spark.data.partitioner.ConsistencyLevel;
+
+/**
+ * Dynamic {@link Sizing} implementation that uses table size, minimum number of replicas, maximum partition size,
+ * and available Spark cores to determine the effective number of executor cores to use during the spark job execution.
+ *
+ * <p>This class is typically used when the table size is relatively small (few GBs). When reading small datasets,
+ * this class will allocate a limited number of resources to read the table. This in turn helps reduce the cost of
+ * coordinating a large number of executor cores when the dataset does not justify using the entire spark cluster
+ * for reading.
+ */
+public class DynamicSizing implements Sizing
+{
+    private static final Logger LOGGER = LoggerFactory.getLogger(DynamicSizing.class);
+
+    // Number of bytes in one GiB (1024 * 1024 * 1024), kept as a double so the size conversion stays fractional
+    private static final double BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
+
+    private final ReplicationFactor replicationFactor;
+    // Desired upper bound for the amount of data handled per core; compared against the table size in GiB
+    private final int maxPartitionSize;
+    // Upper bound for the value returned by getEffectiveNumberOfCores()
+    private final int availableCores;
+    private final String keyspace;
+    private final String table;
+    private final String dc;
+    private final TableSizeProvider tableSizeProvider;
+    private final ConsistencyLevel consistencyLevel;
+
+    /**
+     * Constructs a new {@link DynamicSizing} object. The arguments are only stored; the table size
+     * is not looked up until {@link #getEffectiveNumberOfCores()} is called.
+     *
+     * @param tableSizeProvider the table size provider
+     * @param consistencyLevel  the consistency level for the read operation
+     * @param replicationFactor the replication factor for the keyspace
+     * @param keyspace          the Cassandra keyspace
+     * @param table             the Cassandra table
+     * @param datacenter        the Cassandra datacenter
+     * @param maxPartitionSize  the maximum partition size desired, in GiB
+     * @param availableCores    the maximum number of cores available
+     */
+    public DynamicSizing(TableSizeProvider tableSizeProvider,
+                         ConsistencyLevel consistencyLevel,
+                         ReplicationFactor replicationFactor,
+                         String keyspace,
+                         String table,
+                         String datacenter,
+                         int maxPartitionSize,
+                         int availableCores)
+    {
+        this.tableSizeProvider = tableSizeProvider;
+        this.consistencyLevel = consistencyLevel;
+        this.replicationFactor = replicationFactor;
+        this.keyspace = keyspace;
+        this.table = table;
+        this.dc = datacenter;
+        this.maxPartitionSize = maxPartitionSize;
+        this.availableCores = availableCores;
+    }
+
+    /**
+     * Returns the effective number of cores to be used during the spark execution.
+     * The value is calculated by getting the table size * the number of replicas
+     * we will use to read the data and then dividing it by the maximum partition
+     * size in GiB. For example, assume we have a table with 7.25 GiB of data, and
+     * assume a maximum partition size of 2.5 GiB. Also, assume that a consistency
+     * level of {@code LOCAL_QUORUM} and replication factor of 3. The number of
+     * cores is calculated by the following formula:
+     *
+     * <pre>
+     *                                           totalTableSize * minReplicas
+     *     effectiveNumberOfCores = Math.ceil( --------------------------------- )
+     *                                              maxPartitionSize
+     * </pre>
+     *
+     * <p>In the example above, we have:
+     *
+     * <pre>
+     *                                7.25 GiB * 2
+     *     effectiveNumberOfCores = ---------------- = 5.8 ~&gt; 6 cores
+     *                                  2.5 GiB
+     * </pre>
+     *
+     * <p>The computed value is first capped at {@code availableCores} and then raised to 1 if needed,
+     * so this method always returns at least 1 core, and at most {@code availableCores} whenever
+     * {@code availableCores} is 1 or greater.
+     *
+     * <p>The table size is requested from the {@link TableSizeProvider} on every invocation.
+     *
+     * @return the effective number of cores to be used during the spark execution
+     */
+    @Override
+    public int getEffectiveNumberOfCores()
+    {
+        double tableSizeInGiB = tableSizeProvider.tableSizeInBytes(keyspace, table, dc) / BYTES_PER_GIB;
+        double minReplicas = consistencyLevel.blockFor(replicationFactor, dc);
+
+        int requiredCores = (int) Math.ceil(tableSizeInGiB * minReplicas / maxPartitionSize);
+
+        // Cap at availableCores first and apply the lower bound last, so that at least one core
+        // is returned even when availableCores is configured to a value below 1
+        int effectiveNumberOfCores = Math.max(1, Math.min(requiredCores, availableCores));
+
+        LOGGER.info("Using Dynamic Sizing. tableSize {}GiB, minReplicas {}, maxPartitionSize {}GiB, availableCores {}, effectiveNumberOfCores {}",
+                    tableSizeInGiB, minReplicas, maxPartitionSize, availableCores, effectiveNumberOfCores);
+
+        return effectiveNumberOfCores;
+    }
+}
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.data;
+
+/**
+ * Defines an interface to provide the size of a table in a given keyspace
+ */
+@FunctionalInterface
+public interface TableSizeProvider
+{
+    /**
+     * Returns the total used space, in bytes, for {@code table} across the datacenter.
+     *
+     * @param keyspace   the keyspace where the table lives
+     * @param table      the table to get the size from
+     * @param datacenter the datacenter
+     * @return the total used space in bytes for {@code table} across the datacenter
+     */
+    long tableSizeInBytes(String keyspace, String table, String datacenter);
+}
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.data;
+
+import java.util.Map;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import org.apache.cassandra.spark.data.partitioner.ConsistencyLevel;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Unit tests for the {@link DynamicSizing} class
+ */
+class DynamicSizingTest
+{
+    public static final long TEN_GIB = 10L * 1024L * 1024L * 1024L;
+    private static final ReplicationFactor RF = new ReplicationFactor(ReplicationFactor.ReplicationStrategy.NetworkTopologyStrategy,
+                                                                      Map.of("datacenter1", 3));
+
+    /**
+     * Builds a {@link DynamicSizing} backed by a fixed-size {@link TableSizeProvider} and verifies
+     * the number of cores it reports for the given scenario.
+     *
+     * @param scenario the inputs and the expected number of cores
+     */
+    @ParameterizedTest
+    @MethodSource("scenarios")
+    void testSizingScenario(SizingScenario scenario)
+    {
+        // The provider ignores its arguments and always reports the size configured in the scenario
+        TableSizeProvider tableSizeProvider = (keyspace, table, datacenter) -> scenario.tableSizeInBytes;
+        Sizing sizing = new DynamicSizing(tableSizeProvider,
+                                          ConsistencyLevel.LOCAL_QUORUM,
+                                          RF,
+                                          "big-data",
+                                          "customers",
+                                          "datacenter1",
+                                          scenario.maxPartitionSize,
+                                          scenario.numCores);
+        assertThat(sizing.getEffectiveNumberOfCores()).as("Number of cores does not match").isEqualTo(scenario.expectedNumberOfCores);
+    }
+
+    /**
+     * Provides one distinct scenario per behavior of the sizing formula. The expected values assume
+     * that {@code LOCAL_QUORUM} against a replication factor of 3 reads from 2 replicas, i.e.
+     * {@code expected = clamp(ceil(tableSizeInGiB * 2 / maxPartitionSize), 1, numCores)}.
+     *
+     * @return the scenarios to run
+     */
+    static Stream<Arguments> scenarios()
+    {
+        return Stream.of(
+        // Exact division: 10 GiB * 2 / 5 = 4
+        Arguments.arguments(new SizingScenario(1000, 5, TEN_GIB, 4)),
+        // Exact division: 10 GiB * 2 / 1 = 20
+        Arguments.arguments(new SizingScenario(1000, 1, TEN_GIB, 20)),
+        // Fractional result is rounded up: 10 GiB * 2 / 3 = 6.67 ~> 7
+        Arguments.arguments(new SizingScenario(1000, 3, TEN_GIB, 7)),
+        // Result is capped by the number of available cores: min(20, 10) = 10
+        Arguments.arguments(new SizingScenario(10, 1, TEN_GIB, 10)),
+        // Partition size larger than the data to read: 10 GiB * 2 / 100 = 0.2 ~> 1
+        Arguments.arguments(new SizingScenario(1000, 100, TEN_GIB, 1)),
+        // Empty table still gets at least one core
+        Arguments.arguments(new SizingScenario(1000, 5, 0L, 1))
+        );
+    }
+
+    /**
+     * Immutable holder for the inputs of a sizing scenario and the number of cores expected for them.
+     */
+    static class SizingScenario
+    {
+        private final int numCores;
+        private final int maxPartitionSize;
+        private final long tableSizeInBytes;
+        private final int expectedNumberOfCores;
+
+        SizingScenario(int numCores, int maxPartitionSize, long tableSizeInBytes, int expectedNumberOfCores)
+        {
+            this.numCores = numCores;
+            this.maxPartitionSize = maxPartitionSize;
+            this.tableSizeInBytes = tableSizeInBytes;
+            this.expectedNumberOfCores = expectedNumberOfCores;
+        }
+
+        // Used by JUnit as the display name of each parameterized invocation
+        @Override
+        public String toString()
+        {
+            return "Scenario{" +
+                   "numCores=" + numCores +
+                   ", maxPartitionSize=" + maxPartitionSize +
+                   ", tableSizeInBytes=" + tableSizeInBytes +
+                   ", expectedNumberOfCores=" + expectedNumberOfCores +
+                   '}';
+        }
+    }
+}
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.cassandra.spark.common;
+
+import java.util.concurrent.CompletableFuture;
+
+import o.a.c.sidecar.client.shaded.common.response.RingResponse;
+import org.apache.cassandra.clients.Sidecar;
+import org.apache.cassandra.sidecar.client.SidecarClient;
+import org.apache.cassandra.spark.data.ClientConfig;
+import org.apache.cassandra.spark.data.DefaultSizing;
+import org.apache.cassandra.spark.data.DynamicSizing;
+import org.apache.cassandra.spark.data.ReplicationFactor;
+import org.apache.cassandra.spark.data.SidecarTableSizeProvider;
+import org.apache.cassandra.spark.data.Sizing;
+import org.apache.cassandra.spark.data.TableSizeProvider;
+import org.apache.cassandra.spark.data.partitioner.ConsistencyLevel;
+
+import static org.apache.cassandra.spark.data.ClientConfig.SIZING_DEFAULT;
+import static org.apache.cassandra.spark.data.ClientConfig.SIZING_DYNAMIC;
+
+/**
+ * A factory class that creates {@link Sizing} based on the client-supplied configuration
+ */
+public class SizingFactory
+{
+    /**
+     * Private constructor that prevents unnecessary instantiation
+     *
+     * @throws IllegalStateException when called
+     */
+    private SizingFactory()
+    {
+        throw new IllegalStateException(getClass() + " is a static utility class and shall not be instantiated");
+    }
+
+    /**
+     * Returns the {@link Sizing} object based on the {@code sizing} option provided by the user,
+     * or {@link DefaultSizing} as the default sizing
+     *
+     * @param replicationFactor   the replication factor
+     * @param options             the {@link ClientConfig} options
+     * @param consistencyLevel    the ConsistencyLevel to use
+     * @param keyspace            the keyspace
+     * @param table               the table
+     * @param datacenter          the DataCenter to use
+     * @param sidecarClient       the sidecar client instance to use
+     * @param sidecarClientConfig the configuration to use with the sidecar client
+     * @param ringFuture          a future representing the result of getting the current ring from the sidecar
+     * @return the {@link Sizing} object based on the {@code sizing} option provided by the user
+     */
+    public static Sizing create(ReplicationFactor replicationFactor,
+                                ClientConfig options,
+                                ConsistencyLevel consistencyLevel,
+                                String keyspace,
+                                String table,
+                                String datacenter,
+                                SidecarClient sidecarClient,
+                                Sidecar.ClientConfig sidecarClientConfig,
+                                CompletableFuture<RingResponse> ringFuture)
+    {
+        if (SIZING_DYNAMIC.equalsIgnoreCase(options.sizing()))
+        {
+            TableSizeProvider tableSizeProvider = getTableSizeProvider(sidecarClient, sidecarClientConfig, ringFuture);
+            return new DynamicSizing(tableSizeProvider, consistencyLevel, replicationFactor,
+                                     keyspace, table, datacenter,
+                                     options.maxPartitionSize(), options.numCores());
+        }
+        else if (options.sizing() == null || options.sizing().isEmpty() || SIZING_DEFAULT.equalsIgnoreCase(options.sizing()))
+        {
+            return new DefaultSizing(options.numCores());
+        }
+        throw new RuntimeException(String.format("Invalid sizing option provided '%s'", options.sizing()));
+    }
+
+    protected static TableSizeProvider getTableSizeProvider(SidecarClient sidecarClient,
+                                                            Sidecar.ClientConfig sidecarClientConfig,
+                                                            CompletableFuture<RingResponse> ringFuture)
+    {
+        return new SidecarTableSizeProvider(sidecarClient, sidecarClientConfig, ringFuture);
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`1.0.0`
	`2`	`+ * Bulk Reader should dynamically size the Spark job based on estimated table size (CASSANALYTICS-36)`
`2`	`3`	`* Allow getting cassandra role in Spark options for use in Sidecar requests for RBAC (CASSANALYTICS-61)`
`3`	`4`	`* Fix NPE in the deserialized CassandraClusterInfoGroup (CASSANALYTICS-59)`
`4`	`5`	`* Replace NotImplementedException with UnsupportedOperationException in SparkType (CASSANALYTICS-55)`