Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
9770833
Adding readManyByPartitionKey API
FabianMeiswinkel Apr 13, 2026
ac287bc
Merge branch 'main' of https://github.com/Azure/azure-sdk-for-java in…
FabianMeiswinkel Apr 13, 2026
9a5b3e9
Update sdk/cosmos/azure-cosmos-spark_3/src/main/scala/com/azure/cosmo…
FabianMeiswinkel Apr 14, 2026
a8720c3
Update sdk/cosmos/azure-cosmos-spark_3/src/main/scala/com/azure/cosmo…
FabianMeiswinkel Apr 14, 2026
d499da7
Update sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/impleme…
FabianMeiswinkel Apr 14, 2026
c3c542a
Update sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/impleme…
FabianMeiswinkel Apr 14, 2026
4416354
Fixing code review comments
FabianMeiswinkel Apr 14, 2026
3ab3f0d
Merge branch 'main' of https://github.com/Azure/azure-sdk-for-java in…
FabianMeiswinkel Apr 14, 2026
588a755
Update CosmosAsyncContainer.java
FabianMeiswinkel Apr 15, 2026
8c5cdb4
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 15, 2026
f548552
Update ReadManyByPartitionKeyTest.java
FabianMeiswinkel Apr 15, 2026
f68cf02
Fixing test issues
FabianMeiswinkel Apr 15, 2026
8b6c4b1
Update CosmosAsyncContainer.java
FabianMeiswinkel Apr 15, 2026
8ba7f4d
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 15, 2026
56b067a
Reacted to code review feedback
FabianMeiswinkel Apr 16, 2026
fa430e9
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 16, 2026
d9504c9
Fix build issues
FabianMeiswinkel Apr 16, 2026
73151f0
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 16, 2026
681830e
Fixing changelog
FabianMeiswinkel Apr 16, 2026
7f745e6
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 16, 2026
0b8905d
Addressing code review comments
FabianMeiswinkel Apr 16, 2026
22abc78
Addressing code review feedback
FabianMeiswinkel Apr 16, 2026
662b1a4
Update CosmosItemsDataSource.scala
FabianMeiswinkel Apr 17, 2026
c764de9
Update CosmosItemsDataSource.scala
FabianMeiswinkel Apr 17, 2026
e1e6f5a
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 17, 2026
080ce4a
Update RxDocumentClientImpl.java
FabianMeiswinkel Apr 17, 2026
516bbf3
Merge branch 'users/fabianm/readManyByPK' of https://github.com/Azure…
FabianMeiswinkel Apr 17, 2026
b01f875
Fix readManyByPartitionKey retries
FabianMeiswinkel Apr 17, 2026
7130d4a
Fix PK.None
FabianMeiswinkel Apr 17, 2026
93957f3
Update ReadManyByPartitionKeyQueryHelper.java
FabianMeiswinkel Apr 17, 2026
16dd1d6
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 17, 2026
9200f8f
Fix code review feedback
FabianMeiswinkel Apr 17, 2026
4a3ea06
Merge branch 'users/fabianm/readManyByPK' of https://github.com/Azure…
FabianMeiswinkel Apr 17, 2026
e2aa124
Merge branch 'main' into users/fabianm/readManyByPK
FabianMeiswinkel Apr 17, 2026
c34341e
Merge branch 'users/fabianm/readManyByPK' of https://github.com/Azure…
FabianMeiswinkel Apr 17, 2026
c96b6f6
Reacting to code review feedback
FabianMeiswinkel Apr 17, 2026
e306fae
React to code review feedback
FabianMeiswinkel Apr 17, 2026
f34270d
Addressing code review comments
FabianMeiswinkel Apr 17, 2026
55ddac3
Merge branch 'main' of https://github.com/Azure/azure-sdk-for-java in…
FabianMeiswinkel Apr 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sdk/cosmos/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

metastore_db/*
spark-warehouse/*

.temp/
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@
package com.azure.cosmos.spark

import com.azure.cosmos.implementation.TestConfigurations
import com.azure.cosmos.models.{CosmosContainerProperties, CosmosItemRequestOptions, PartitionKey, PartitionKeyDefinition, PartitionKeyDefinitionVersion, PartitionKind, ThroughputProperties}
import com.azure.cosmos.spark.udf.GetCosmosPartitionKeyValue
import com.fasterxml.jackson.databind.node.ObjectNode
import org.apache.spark.sql.functions.expr
import org.apache.spark.sql.types.StringType

import java.util.UUID
import java.util.{ArrayList, UUID}

import scala.collection.JavaConverters._

class SparkE2EQueryITest
extends SparkE2EQueryITestBase {

// scalastyle:off multiple.string.literals
"spark query" can "return proper Cosmos specific query plan on explain with nullable properties" in {
val cosmosEndpoint = TestConfigurations.HOST
val cosmosMasterKey = TestConfigurations.MASTER_KEY
Expand Down Expand Up @@ -67,4 +74,115 @@ class SparkE2EQueryITest
val item = rowsArray(0)
item.getAs[String]("id") shouldEqual id
}
}

"spark readManyByPartitionKey" can "use a matching top-level partition key column without the UDF" in {
  // Arrange: seed one document per partition key value in a container whose PK path is '/pk'.
  val endpoint = TestConfigurations.HOST
  val masterKey = TestConfigurations.MASTER_KEY
  val targetContainer = cosmosClient.getDatabase(cosmosDatabase).getContainer(cosmosContainersWithPkAsPartitionKey)
  val itemOptions = new CosmosItemRequestOptions()

  for (pkValue <- Seq("pkA", "pkB")) {
    val doc = objectMapper.createObjectNode()
    doc.put("id", s"item-$pkValue")
    doc.put("pk", pkValue)
    doc.put("payload", s"value-$pkValue")

    targetContainer.createItem(doc, new PartitionKey(pkValue), itemOptions).block()
  }

  val readCfg = Map(
    "spark.cosmos.accountEndpoint" -> endpoint,
    "spark.cosmos.accountKey" -> masterKey,
    "spark.cosmos.database" -> cosmosDatabase,
    "spark.cosmos.container" -> cosmosContainersWithPkAsPartitionKey,
    "spark.cosmos.read.inferSchema.enabled" -> "true"
  )

  val sparkSession = spark
  import sparkSession.implicits._

  // Act: the input DataFrame's 'pk' column matches the container's top-level partition key
  // path, so no '_partitionKeyIdentity' UDF column is needed.
  val resultRows = CosmosItemsDataSource
    .readManyByPartitionKey(Seq("pkA", "pkB").toDF("pk"), readCfg.asJava)
    .selectExpr("id", "pk", "payload")
    .collect()

  // Assert: both seeded documents come back with their original field values.
  resultRows should have size 2
  resultRows.map(_.getAs[String]("id")).toSet shouldEqual Set("item-pkA", "item-pkB")
  resultRows.map(_.getAs[String]("pk")).toSet shouldEqual Set("pkA", "pkB")
  resultRows.map(_.getAs[String]("payload")).toSet shouldEqual Set("value-pkA", "value-pkB")
}
// Verifies the nested-partition-key contract of readManyByPartitionKey:
// 1. Without a '_partitionKeyIdentity' column the call must fail with an actionable error.
// 2. With the GetCosmosPartitionKeyValue UDF producing that column, the read succeeds.
"spark readManyByPartitionKey" can "require the UDF for nested partition key paths and succeed with it" in {
val cosmosEndpoint = TestConfigurations.HOST
val cosmosMasterKey = TestConfigurations.MASTER_KEY
// Unique container name so concurrent/repeated test runs don't collide.
val containerName = s"nested-pk-${UUID.randomUUID()}"

// Container with a nested partition key path ('/tenant/id') - this cannot be
// resolved from top-level DataFrame columns automatically.
val pkPaths = new ArrayList[String]()
pkPaths.add("/tenant/id")

val pkDefinition = new PartitionKeyDefinition()
pkDefinition.setPaths(pkPaths)
pkDefinition.setKind(PartitionKind.HASH)
pkDefinition.setVersion(PartitionKeyDefinitionVersion.V2)

val containerProperties = new CosmosContainerProperties(containerName, pkDefinition)
cosmosClient
.getDatabase(cosmosDatabase)
.createContainerIfNotExists(containerProperties, ThroughputProperties.createManualThroughput(400))
.block()

try {
val container = cosmosClient.getDatabase(cosmosDatabase).getContainer(containerName)
val requestOptions = new CosmosItemRequestOptions()

// Seed one document per tenant; the PK value lives at the nested path tenant.id.
Seq("tenantA", "tenantB").foreach { tenantId =>
val item = objectMapper.createObjectNode()
item.put("id", s"item-$tenantId")
item.put("payload", s"value-$tenantId")
item.putObject("tenant").put("id", tenantId)

container.createItem(item, new PartitionKey(tenantId), requestOptions).block()
}

val cfg = Map(
"spark.cosmos.accountEndpoint" -> cosmosEndpoint,
"spark.cosmos.accountKey" -> cosmosMasterKey,
"spark.cosmos.database" -> cosmosDatabase,
"spark.cosmos.container" -> containerName,
"spark.cosmos.read.inferSchema.enabled" -> "true"
)

val sparkSession = spark
import sparkSession.implicits._

// Without the UDF-produced column the call must be rejected with guidance.
val missingUdfError = the[IllegalArgumentException] thrownBy {
CosmosItemsDataSource.readManyByPartitionKey(Seq("tenantA").toDF("tenantId"), cfg.asJava)
}

missingUdfError.getMessage should include("Nested paths cannot be resolved from DataFrame columns automatically")
missingUdfError.getMessage should include("_partitionKeyIdentity")

// Register the UDF before it is referenced in the expr() below.
spark.udf.register("GetCosmosPartitionKeyValue", new GetCosmosPartitionKeyValue(), StringType)

// Adding the '_partitionKeyIdentity' column makes the same read succeed.
val inputDf = Seq("tenantA", "tenantB")
.toDF("tenantId")
.withColumn("_partitionKeyIdentity", expr("GetCosmosPartitionKeyValue(tenantId)"))

val rows = CosmosItemsDataSource
.readManyByPartitionKey(inputDf, cfg.asJava)
.selectExpr("id", "tenant.id as tenantId")
.collect()

rows should have size 2
rows.map(_.getAs[String]("id")).toSet shouldEqual Set("item-tenantA", "item-tenantB")
rows.map(_.getAs[String]("tenantId")).toSet shouldEqual Set("tenantA", "tenantB")
} finally {
// Always remove the per-test container, even when assertions fail.
cosmosClient
.getDatabase(cosmosDatabase)
.getContainer(containerName)
.delete()
.block()
}
}

// scalastyle:on multiple.string.literals
}
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ private[spark] object CosmosConfigNames {
val ReadPartitioningFeedRangeFilter = "spark.cosmos.partitioning.feedRangeFilter"
val ReadRuntimeFilteringEnabled = "spark.cosmos.read.runtimeFiltering.enabled"
val ReadManyFilteringEnabled = "spark.cosmos.read.readManyFiltering.enabled"
val ReadManyByPkNullHandling = "spark.cosmos.read.readManyByPk.nullHandling"
val ViewsRepositoryPath = "spark.cosmos.views.repositoryPath"
val DiagnosticsMode = "spark.cosmos.diagnostics"
val DiagnosticsSamplingMaxCount = "spark.cosmos.diagnostics.sampling.maxCount"
Expand Down Expand Up @@ -226,6 +227,7 @@ private[spark] object CosmosConfigNames {
ReadPartitioningFeedRangeFilter,
ReadRuntimeFilteringEnabled,
ReadManyFilteringEnabled,
ReadManyByPkNullHandling,
ViewsRepositoryPath,
DiagnosticsMode,
DiagnosticsSamplingIntervalInSeconds,
Expand Down Expand Up @@ -1042,7 +1044,8 @@ private case class CosmosReadConfig(readConsistencyStrategy: ReadConsistencyStra
throughputControlConfig: Option[CosmosThroughputControlConfig] = None,
runtimeFilteringEnabled: Boolean,
readManyFilteringConfig: CosmosReadManyFilteringConfig,
responseContinuationTokenLimitInKb: Option[Int] = None)
responseContinuationTokenLimitInKb: Option[Int] = None,
readManyByPkTreatNullAsNone: Boolean = false)

private object SchemaConversionModes extends Enumeration {
type SchemaConversionMode = Value
Expand Down Expand Up @@ -1136,6 +1139,20 @@ private object CosmosReadConfig {
helpMessage = " Indicates whether dynamic partition pruning filters will be pushed down when applicable."
)

private val ReadManyByPkNullHandling = CosmosConfigEntry[String](
  key = CosmosConfigNames.ReadManyByPkNullHandling,
  mandatory = false,
  defaultValue = Some("Null"),
  // Validate eagerly: previously any string was accepted and anything other than 'None'
  // silently behaved like 'Null'. Per the help text below, the two modes hash to DIFFERENT
  // physical partitions, so a typo (e.g. 'Nome') would silently return zero rows.
  parseFromStringFunction = value => {
    if (!value.equalsIgnoreCase("Null") && !value.equalsIgnoreCase("None")) {
      throw new IllegalArgumentException(
        s"Invalid value '$value' for configuration '${CosmosConfigNames.ReadManyByPkNullHandling}'. " +
          "Allowed values are 'Null' or 'None'.")
    }
    value
  },
  helpMessage = "Determines how null values in partition key columns are treated for " +
    "readManyByPartitionKey. 'Null' (default) maps null to a JSON null via addNullValue(), which " +
    "is appropriate when the document field exists with an explicit null value. 'None' maps null " +
    "to PartitionKey.NONE via addNoneValue(), which is only supported for single-path partition keys " +
    "and should only be used when the partition key path does not exist at all in the document. " +
    "Hierarchical partition keys reject this mode. These two semantics hash to DIFFERENT physical " +
    "partitions - picking the wrong mode for your data will silently return zero rows."
)

def parseCosmosReadConfig(cfg: Map[String, String]): CosmosReadConfig = {
val forceEventualConsistency = CosmosConfigEntry.parse(cfg, ForceEventualConsistency)
val readConsistencyStrategyOverride = CosmosConfigEntry.parse(cfg, ReadConsistencyStrategyOverride)
Expand All @@ -1158,6 +1175,8 @@ private object CosmosReadConfig {
val throughputControlConfigOpt = CosmosThroughputControlConfig.parseThroughputControlConfig(cfg)
val runtimeFilteringEnabled = CosmosConfigEntry.parse(cfg, ReadRuntimeFilteringEnabled)
val readManyFilteringConfig = CosmosReadManyFilteringConfig.parseCosmosReadManyFilterConfig(cfg)
val readManyByPkNullHandling = CosmosConfigEntry.parse(cfg, ReadManyByPkNullHandling)
val readManyByPkTreatNullAsNone = readManyByPkNullHandling.getOrElse("Null").equalsIgnoreCase("None")
Comment thread
FabianMeiswinkel marked this conversation as resolved.

val effectiveReadConsistencyStrategy = if (readConsistencyStrategyOverride.getOrElse(ReadConsistencyStrategy.DEFAULT) != ReadConsistencyStrategy.DEFAULT) {
readConsistencyStrategyOverride.get
Expand Down Expand Up @@ -1189,7 +1208,8 @@ private object CosmosReadConfig {
throughputControlConfigOpt,
runtimeFilteringEnabled.get,
readManyFilteringConfig,
responseContinuationTokenLimitInKb)
responseContinuationTokenLimitInKb,
readManyByPkTreatNullAsNone)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ private[cosmos] object CosmosConstants {
val Id = "id"
val ETag = "_etag"
val ItemIdentity = "_itemIdentity"
val PartitionKeyIdentity = "_partitionKeyIdentity"
}

object StatusCodes {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
// Licensed under the MIT License.
package com.azure.cosmos.spark

import com.azure.cosmos.models.{CosmosItemIdentity, PartitionKey}
import com.azure.cosmos.models.{CosmosItemIdentity, PartitionKey, PartitionKeyBuilder}
import com.azure.cosmos.spark.CosmosPredicates.assertOnSparkDriver
import com.azure.cosmos.spark.diagnostics.BasicLoggingTrait
import com.azure.cosmos.SparkBridgeInternal
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

import java.util
Expand Down Expand Up @@ -112,4 +113,156 @@ object CosmosItemsDataSource {

readManyReader.readMany(df.rdd, readManyFilterExtraction)
}

/**
 * Convenience overload of readManyByPartitionKey that lets the connector infer the
 * result schema instead of requiring a user-provided one.
 *
 * @param df input DataFrame containing either the container's partition key column(s)
 *           or a '_partitionKeyIdentity' column produced by the GetCosmosPartitionKeyValue UDF
 * @param userConfig Cosmos Spark connector configuration (account, database, container, ...)
 * @return a DataFrame with all items belonging to the referenced partition key values
 */
def readManyByPartitionKey(df: DataFrame, userConfig: java.util.Map[String, String]): DataFrame =
  readManyByPartitionKey(df, userConfig, null)

/**
 * Reads all items belonging to the partition key values referenced by the input DataFrame.
 *
 * The partition key for each input row is resolved in one of two ways (in priority order):
 *  1. A '_partitionKeyIdentity' string column (produced by the GetCosmosPartitionKeyValue UDF) -
 *     required for containers with nested partition key paths.
 *  2. DataFrame columns whose names match ALL of the container's top-level partition key paths.
 *
 * @param df input DataFrame with partition key information per row
 * @param userConfig Cosmos Spark connector configuration (account, database, container, ...)
 * @param userProvidedSchema optional result schema; when null the schema is inferred
 * @return a DataFrame with all items belonging to the referenced partition key values
 * @throws IllegalArgumentException when no partition key extraction can be determined,
 *                                  when a '_partitionKeyIdentity' value is malformed, or
 *                                  when the container uses nested partition key paths and
 *                                  no '_partitionKeyIdentity' column is present
 */
def readManyByPartitionKey(
  df: DataFrame,
  userConfig: java.util.Map[String, String],
  userProvidedSchema: StructType): DataFrame = {

  val readManyReader = new CosmosReadManyByPartitionKeyReader(
    userProvidedSchema,
    userConfig.asScala.toMap)

  // Resolve the null-handling config up front so both the UDF path and the PK-column path honor it.
  val effectiveConfig = CosmosConfig.getEffectiveConfig(
    databaseName = None,
    containerName = None,
    userConfig.asScala.toMap)
  val readConfig = CosmosReadConfig.parseCosmosReadConfig(effectiveConfig)
  val treatNullAsNone = readConfig.readManyByPkTreatNullAsNone

  // Option 1: Look for the _partitionKeyIdentity column (produced by GetCosmosPartitionKeyValue UDF)
  val pkIdentityFieldExtraction = df
    .schema
    .find(field => field.name.equals(CosmosConstants.Properties.PartitionKeyIdentity) && field.dataType.equals(StringType))
    .map(field => (row: Row) => {
      val rawValue = row.getString(row.fieldIndex(field.name))
      CosmosPartitionKeyHelper.tryParsePartitionKey(rawValue, treatNullAsNone)
        .getOrElse(throw new IllegalArgumentException(
          s"Invalid _partitionKeyIdentity value in row: '$rawValue'. " +
            "Expected format: pk([...json...])"))
    })

  // Option 2: Detect PK columns by matching the container's partition key paths against the DataFrame schema
  val pkColumnExtraction: Option[Row => PartitionKey] = if (pkIdentityFieldExtraction.isDefined) {
    None // _partitionKeyIdentity column takes precedence - no need to resolve PK paths
  } else {
    val containerConfig = CosmosContainerConfig.parseCosmosContainerConfig(effectiveConfig)
    val sparkEnvironmentInfo = CosmosClientConfiguration.getSparkEnvironmentInfo(None)
    val calledFrom = "CosmosItemsDataSource.readManyByPartitionKey"

    // Fetch the container's partition key definition; Loan guarantees the cached
    // client items are released even when metadata retrieval fails.
    val pkPaths = Loan(
      List[Option[CosmosClientCacheItem]](
        Some(
          CosmosClientCache(
            CosmosClientConfiguration(
              effectiveConfig,
              readConsistencyStrategy = readConfig.readConsistencyStrategy,
              sparkEnvironmentInfo),
            None,
            calledFrom)),
        ThroughputControlHelper.getThroughputControlClientCacheItem(
          effectiveConfig,
          calledFrom,
          None,
          sparkEnvironmentInfo)
      ))
      .to(clientCacheItems => {
        val container =
          ThroughputControlHelper.getContainer(
            effectiveConfig,
            containerConfig,
            clientCacheItems(0).get,
            clientCacheItems(1))

        val pkDefinition = SparkBridgeInternal
          .getContainerPropertiesFromCollectionCache(container)
          .getPartitionKeyDefinition

        pkDefinition.getPaths.asScala.map(_.stripPrefix("/")).toList
      })

    // Nested PK paths (containing /) cannot be resolved from top-level DataFrame columns.
    // Surface an explicit error so users know to use the UDF-produced _partitionKeyIdentity column.
    if (pkPaths.exists(_.contains("/"))) {
      throw new IllegalArgumentException(
        "Container has nested partition key path(s) " + pkPaths.mkString("[", ",", "]") + ". " +
        "Nested paths cannot be resolved from DataFrame columns automatically - add a " +
        "'_partitionKeyIdentity' column produced by the GetCosmosPartitionKeyValue UDF.")
    }

    // Only use this path when ALL PK path columns exist in the DataFrame schema.
    val dfFieldNames = df.schema.fieldNames.toSet
    val allPkColumnsPresent = pkPaths.forall(path => dfFieldNames.contains(path))

    if (allPkColumnsPresent && pkPaths.nonEmpty) {
      Some((row: Row) => {
        if (pkPaths.size == 1) {
          // Single partition key
          buildPartitionKey(row.getAs[Any](pkPaths.head), treatNullAsNone)
        } else {
          // Hierarchical partition key - build level by level in path order
          val builder = new PartitionKeyBuilder()
          for (path <- pkPaths) {
            addPartitionKeyComponent(builder, row.getAs[Any](path), treatNullAsNone, pkPaths.size)
          }
          builder.build()
        }
      })
    } else {
      None
    }
  }

  val pkExtraction = pkIdentityFieldExtraction
    .orElse(pkColumnExtraction)
    .getOrElse(
      throw new IllegalArgumentException(
        "Cannot determine partition key extraction from the input DataFrame. " +
        "Either add a '_partitionKeyIdentity' column (using the GetCosmosPartitionKeyValue UDF) " +
        "or ensure the DataFrame contains columns matching the container's partition key paths."))

  readManyReader.readManyByPartitionKey(df.rdd, pkExtraction)
}

/**
 * Appends one partition-key component to the builder, translating a JVM value from a
 * Spark Row into the Cosmos partition key type system (String, double, boolean, or null/None).
 */
private def addPartitionKeyComponent(
  builder: PartitionKeyBuilder,
  value: Any,
  treatNullAsNone: Boolean,
  partitionKeyComponentCount: Int): Unit = {
  value match {
    case stringValue: String =>
      builder.add(stringValue)
    case numberValue: Number =>
      // Cosmos numeric PK components are doubles
      builder.add(numberValue.doubleValue())
    case booleanValue: Boolean =>
      builder.add(booleanValue)
    case null =>
      // PartitionKey.NONE is only legal for single-path containers; fail fast otherwise.
      CosmosPartitionKeyHelper.validateNoneHandlingForPartitionKeyComponentCount(
        partitionKeyComponentCount,
        treatNullAsNone)
      if (treatNullAsNone) {
        builder.addNoneValue()
      } else {
        builder.addNullValue()
      }
    case unsupported =>
      // Reject unknown types rather than silently .toString-ing them - the document field
      // was stored with its original type and a stringified value will never match.
      throw new IllegalArgumentException(
        s"Unsupported partition key column type '${unsupported.getClass.getName}' with value '$unsupported'. " +
        "Supported types are String, Number (integral or floating-point), Boolean, and null. " +
        "For other source types, convert the column before calling readManyByPartitionKey or use " +
        "the GetCosmosPartitionKeyValue UDF to produce a '_partitionKeyIdentity' column.")
  }
}

// Builds a single-component PartitionKey from one column value, applying the
// configured null handling ('Null' vs 'None' semantics).
private def buildPartitionKey(value: Any, treatNullAsNone: Boolean): PartitionKey = {
  val pkBuilder = new PartitionKeyBuilder()
  addPartitionKeyComponent(pkBuilder, value, treatNullAsNone, partitionKeyComponentCount = 1)
  pkBuilder.build()
}
}
Loading