From 66667d41a8352535afe2ad1e3472e8f60de3227b Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Tue, 8 Aug 2023 09:28:49 -0700 Subject: [PATCH 01/38] init --- .../spark/sql/pulsar/PulsarSource.scala | 72 ++++++++++++++++++- .../spark/sql/pulsar/PulsarSources.scala | 30 ++++++++ 2 files changed, 99 insertions(+), 3 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 851ddca7..2ad87549 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -13,19 +13,25 @@ */ package org.apache.spark.sql.pulsar -import java.{util => ju} +import org.apache.pulsar.client.admin.PulsarAdmin +import java.{util => ju} import org.apache.pulsar.client.api.MessageId import org.apache.pulsar.client.impl.MessageIdImpl +import org.apache.pulsar.client.internal.DefaultImplementation import org.apache.pulsar.common.schema.SchemaInfo - import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.json.JSONOptionsInRead +import org.apache.spark.sql.connector.read.streaming +import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit, ReadMaxFiles, SupportsAdmissionControl} import org.apache.spark.sql.execution.streaming.{Offset, Source} +import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets import org.apache.spark.sql.types.StructType +import scala.collection.mutable + private[pulsar] class PulsarSource( sqlContext: SQLContext, pulsarHelper: PulsarHelper, @@ -38,7 +44,8 @@ private[pulsar] class PulsarSource( subscriptionNamePrefix: String, jsonOptions: JSONOptionsInRead) extends Source - with Logging { + with Logging + with SupportsAdmissionControl { import PulsarSourceUtils._ @@ -54,6 +61,8 @@ private[pulsar] class PulsarSource( private var currentTopicOffsets: Option[Map[String, MessageId]] = None + private val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(clientConf.get("serviceUrl").toString).build() + private lazy val pulsarSchema: SchemaInfo = pulsarHelper.getPulsarSchema override def schema(): StructType = SchemaUtils.pulsarSourceSchema(pulsarSchema) @@ -67,6 +76,60 @@ private[pulsar] class PulsarSource( Some(latest.asInstanceOf[Offset]) } + override def latestOffset(startingOffset: streaming.Offset, readLimit: ReadLimit): streaming.Offset = { + initialTopicOffsets + val latestOffsets = pulsarHelper.fetchLatestOffsets().topicOffsets + // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit + val existingStartOffsets = getTopicOffsets(startingOffset.asInstanceOf[SpecificPulsarOffset]) + val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) + val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => topicPartition -> MessageId.earliest) + val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake + val offsets = mutable.Map[String, MessageIdImpl]() + + val numPartitions = startPartitionOffsets.size + startPartitionOffsets.keys.foreach { topicPartition => + var readLimit = totalReadLimit / numPartitions + pulsarHelper.fetchLatestOffsetForTopic(topicPartition) + val messageId = startPartitionOffsets.apply(topicPartition) + val ledgerId = getLedgerId(messageId) + val entryId = getEntryId(messageId) + pulsarAdmin.topics().getPartitionedInternalStats(topicPartition).partitions.forEach { (_, partitionMetadata) => + partitionMetadata.ledgers.sort((ledger1, ledger2) => { + (ledger1.ledgerId - ledger2.ledgerId).toInt + }) + partitionMetadata.ledgers.forEach { ledger => + if (ledger.ledgerId >= ledgerId) { + val avgBytesPerEntries = ledger.size / ledger.entries + // approximation of bytes left in ledger to deal with case + // where we are at the middle of the ledger + val bytesLeftInLedger = avgBytesPerEntries * (ledger.entries - entryId) + if (readLimit > bytesLeftInLedger) { + readLimit -= bytesLeftInLedger + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, ledger.entries, -1)) + } else { + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, entryId + readLimit / avgBytesPerEntries, -1)) + readLimit = 0 + } + } + } + } + } + SpecificPulsarOffset(offsets.toMap) + } + + class AdmissionLimits(var bytesToTake: Long) + + object AdmissionLimits { + def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match { + case maxBytes: ReadMaxBytes => Some (new AdmissionLimits(maxBytes.maxBytes) ) + } + + } + override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialTopicOffsets is initialized initialTopicOffsets @@ -169,3 +232,6 @@ private[pulsar] class PulsarSource( } } + +/** A read limit that admits a soft-max of `maxBytes` per micro-batch. */ +case class ReadMaxBytes(maxBytes: Long) extends ReadLimit \ No newline at end of file diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSources.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSources.scala index ec86a488..990578d6 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSources.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSources.scala @@ -120,6 +120,36 @@ private[pulsar] object PulsarSourceUtils extends Logging { } } + def getLedgerId(mid: MessageId): Long = { + mid match { + case bmid: BatchMessageIdImpl => + bmid.getLedgerId + case midi: MessageIdImpl => midi.getLedgerId + case t: TopicMessageIdImpl => getLedgerId(t.getInnerMessageId) + case up: UserProvidedMessageId => up.getLedgerId + } + } + + def getEntryId(mid: MessageId): Long = { + mid match { + case bmid: BatchMessageIdImpl => + bmid.getEntryId + case midi: MessageIdImpl => midi.getEntryId + case t: TopicMessageIdImpl => getEntryId(t.getInnerMessageId) + case up: UserProvidedMessageId => up.getEntryId + } + } + + def getPartitionIndex(mid: MessageId): Int = { + mid match { + case bmid: BatchMessageIdImpl => + bmid.getPartitionIndex + case midi: MessageIdImpl => midi.getPartitionIndex + case t: TopicMessageIdImpl => getPartitionIndex(t.getInnerMessageId) + case up: UserProvidedMessageId => up.getPartitionIndex + } + } + def seekableLatestMid(mid: MessageId): MessageId = { if (messageExists(mid)) mid else MessageId.earliest } From 9d5ceef541d9e182d12fa0514cd6237169235811 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Tue, 8 Aug 2023 09:38:44 -0700 Subject: [PATCH 02/38] correcting bytesLeftInLedger calculation --- .../scala/org/apache/spark/sql/pulsar/PulsarSource.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 2ad87549..5229747d 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -102,7 +102,13 @@ private[pulsar] class PulsarSource( val avgBytesPerEntries = ledger.size / ledger.entries // approximation of bytes left in ledger to deal with case // where we are at the middle of the ledger - val bytesLeftInLedger = avgBytesPerEntries * (ledger.entries - entryId) + val bytesLeftInLedger = avgBytesPerEntries * { + if (ledger.ledgerId == ledgerId) { + ledger.entries - entryId + } else { + ledger.entries + } + } if (readLimit > bytesLeftInLedger) { readLimit -= bytesLeftInLedger offsets += (topicPartition -> DefaultImplementation From 10b944466e3cebcf09e736c46a0387f338ebdcd0 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Tue, 8 Aug 2023 11:43:56 -0700 Subject: [PATCH 03/38] adding check for startoffset --- .../spark/sql/pulsar/PulsarProvider.scala | 1 + .../apache/spark/sql/pulsar/PulsarSource.scala | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala index a0fa5023..737e246c 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala @@ -106,6 +106,7 @@ private[pulsar] class PulsarProvider pulsarHelper.setupCursor(offset) new PulsarSource( + serviceUrl, sqlContext, pulsarHelper, clientConfig, diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 5229747d..a095166a 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -27,12 +27,14 @@ import org.apache.spark.sql.catalyst.json.JSONOptionsInRead import org.apache.spark.sql.connector.read.streaming import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit, ReadMaxFiles, SupportsAdmissionControl} import org.apache.spark.sql.execution.streaming.{Offset, Source} +import org.apache.spark.sql.pulsar.PulsarOptions.ServiceUrlOptionKey import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets import org.apache.spark.sql.types.StructType import scala.collection.mutable private[pulsar] class PulsarSource( + serviceUrl: String, sqlContext: SQLContext, pulsarHelper: PulsarHelper, clientConf: ju.Map[String, Object], @@ -61,7 +63,7 @@ private[pulsar] class PulsarSource( private var currentTopicOffsets: Option[Map[String, MessageId]] = None - private val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(clientConf.get("serviceUrl").toString).build() + private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(serviceUrl).build() private lazy val pulsarSchema: SchemaInfo = pulsarHelper.getPulsarSchema @@ -80,12 +82,16 @@ private[pulsar] class PulsarSource( initialTopicOffsets val latestOffsets = pulsarHelper.fetchLatestOffsets().topicOffsets // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit - val existingStartOffsets = getTopicOffsets(startingOffset.asInstanceOf[SpecificPulsarOffset]) + val existingStartOffsets = if (startingOffset != null) { + getTopicOffsets(startingOffset.asInstanceOf[SpecificPulsarOffset]) + } else { + Map[String, MessageId]() + } + print(s"readLimit: ${readLimit.toString}\n") val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => topicPartition -> MessageId.earliest) val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake - val offsets = mutable.Map[String, MessageIdImpl]() - + val offsets = mutable.Map[String, MessageId]() val numPartitions = startPartitionOffsets.size startPartitionOffsets.keys.foreach { topicPartition => var readLimit = totalReadLimit / numPartitions @@ -131,7 +137,8 @@ private[pulsar] class PulsarSource( object AdmissionLimits { def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match { - case maxBytes: ReadMaxBytes => Some (new AdmissionLimits(maxBytes.maxBytes) ) + case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes)) + case _ : ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue)) } } From 63ac0a90efa5fa03427bd59ab01a1ddf92e97bec Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Tue, 8 Aug 2023 16:24:54 -0700 Subject: [PATCH 04/38] adminUrl correction --- .../spark/sql/pulsar/PulsarOptions.scala | 1 + .../spark/sql/pulsar/PulsarProvider.scala | 17 +++--- .../spark/sql/pulsar/PulsarSource.scala | 53 +++++++++---------- .../pulsar/PulsarMicroBatchSourceSuite.scala | 11 +++- .../sql/pulsar/PulsarSourceSuiteBase.scala | 10 ++++ 5 files changed, 56 insertions(+), 36 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala index d0f51224..8e5c4b1e 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala @@ -36,6 +36,7 @@ private[pulsar] object PulsarOptions { val TopicOptionKeys: Set[String] = Set(TopicSingle, TopicMulti, TopicPattern) val ServiceUrlOptionKey: String = "service.url" + val AdminUrlOptionKey: String = "admin.url" val StartingOffsetsOptionKey: String = "startingOffsets".toLowerCase(Locale.ROOT) val StartingTime: String = "startingTime".toLowerCase(Locale.ROOT) val EndingTime: String = "endingTime".toLowerCase(Locale.ROOT) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala index 737e246c..fb0158de 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala @@ -56,7 +56,7 @@ private[pulsar] class PulsarProvider parameters: Map[String, String]): (String, StructType) = { val caseInsensitiveParams = validateStreamOptions(parameters) - val (clientConfig, _, serviceUrlConfig) = prepareConfForReader(parameters) + val (clientConfig, _, serviceUrlConfig, _) = prepareConfForReader(parameters) val subscriptionNamePrefix = s"spark-pulsar-${UUID.randomUUID}" val inferredSchema = Utils.tryWithResource( @@ -84,7 +84,7 @@ private[pulsar] class PulsarProvider logDebug(s"Creating Pulsar source: $parameters") val caseInsensitiveParams = validateStreamOptions(parameters) - val (clientConfig, readerConfig, serviceUrl) = prepareConfForReader(parameters) + val (clientConfig, readerConfig, serviceUrl, adminUrl) = prepareConfForReader(parameters) logDebug( s"Client config: $clientConfig; Reader config: $readerConfig; Service URL: $serviceUrl") @@ -106,7 +106,7 @@ private[pulsar] class PulsarProvider pulsarHelper.setupCursor(offset) new PulsarSource( - serviceUrl, + adminUrl, sqlContext, pulsarHelper, clientConfig, @@ -126,7 +126,7 @@ private[pulsar] class PulsarProvider val subscriptionNamePrefix = getSubscriptionPrefix(parameters, isBatch = true) - val (clientConfig, readerConfig, serviceUrl) = prepareConfForReader(parameters) + val (clientConfig, readerConfig, serviceUrl, _) = prepareConfForReader(parameters) val (start, end, schema, pSchema) = Utils.tryWithResource( PulsarHelper( serviceUrl, @@ -367,6 +367,10 @@ private[pulsar] object PulsarProvider extends Logging { parameters(ServiceUrlOptionKey) } + private def getAdminUrl(parameters: Map[String, String]): String = { + parameters(AdminUrlOptionKey) + } + private def getAllowDifferentTopicSchemas(parameters: Map[String, String]): Boolean = { parameters.getOrElse(AllowDifferentTopicSchemas, "false").toBoolean } @@ -487,9 +491,10 @@ private[pulsar] object PulsarProvider extends Logging { } private def prepareConfForReader(parameters: Map[String, String]) - : (ju.Map[String, Object], ju.Map[String, Object], String) = { + : (ju.Map[String, Object], ju.Map[String, Object], String, String) = { val serviceUrl = getServiceUrl(parameters) + val adminUrl = getAdminUrl(parameters) var clientParams = getClientParams(parameters) clientParams += (ServiceUrlOptionKey -> serviceUrl) val readerParams = getReaderParams(parameters) @@ -497,7 +502,7 @@ private[pulsar] object PulsarProvider extends Logging { ( paramsToPulsarConf("pulsar.client", clientParams), paramsToPulsarConf("pulsar.reader", readerParams), - serviceUrl) + serviceUrl, adminUrl) } private def prepareConfForProducer(parameters: Map[String, String]) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index a095166a..923b004a 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.pulsar.PulsarOptions.ServiceUrlOptionKey import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets import org.apache.spark.sql.types.StructType +import scala.collection.JavaConverters.asScalaBufferConverter import scala.collection.mutable private[pulsar] class PulsarSource( @@ -80,14 +81,15 @@ private[pulsar] class PulsarSource( override def latestOffset(startingOffset: streaming.Offset, readLimit: ReadLimit): streaming.Offset = { initialTopicOffsets + // implement helper inside PulsarHelper in order to use getTopicPartitions val latestOffsets = pulsarHelper.fetchLatestOffsets().topicOffsets // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit + // start a reader, get to the earliest offset for new topic partitions val existingStartOffsets = if (startingOffset != null) { getTopicOffsets(startingOffset.asInstanceOf[SpecificPulsarOffset]) } else { Map[String, MessageId]() } - print(s"readLimit: ${readLimit.toString}\n") val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => topicPartition -> MessageId.earliest) val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake @@ -95,40 +97,35 @@ private[pulsar] class PulsarSource( val numPartitions = startPartitionOffsets.size startPartitionOffsets.keys.foreach { topicPartition => var readLimit = totalReadLimit / numPartitions - pulsarHelper.fetchLatestOffsetForTopic(topicPartition) val messageId = startPartitionOffsets.apply(topicPartition) val ledgerId = getLedgerId(messageId) val entryId = getEntryId(messageId) - pulsarAdmin.topics().getPartitionedInternalStats(topicPartition).partitions.forEach { (_, partitionMetadata) => - partitionMetadata.ledgers.sort((ledger1, ledger2) => { - (ledger1.ledgerId - ledger2.ledgerId).toInt - }) - partitionMetadata.ledgers.forEach { ledger => - if (ledger.ledgerId >= ledgerId) { - val avgBytesPerEntries = ledger.size / ledger.entries - // approximation of bytes left in ledger to deal with case - // where we are at the middle of the ledger - val bytesLeftInLedger = avgBytesPerEntries * { - if (ledger.ledgerId == ledgerId) { - ledger.entries - entryId - } else { - ledger.entries - } - } - if (readLimit > bytesLeftInLedger) { - readLimit -= bytesLeftInLedger - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, ledger.entries, -1)) +// pulsarAdmin.topics().getPartitionedInternalStats(topicPartition).partitions.forEach { (_, partitionMetadata) => +// partitionMetadata.ledgers.asScala.filter(_.ledgerId < ledgerId).sortBy(_.ledgerId).foreach { ledger => + pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.asScala.filter(_.ledgerId < ledgerId).sortBy(_.ledgerId).foreach { ledger => + val avgBytesPerEntries = ledger.size / ledger.entries + // approximation of bytes left in ledger to deal with case + // where we are at the middle of the ledger + val bytesLeftInLedger = avgBytesPerEntries * { + if (ledger.ledgerId == ledgerId) { + ledger.entries - entryId } else { - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, entryId + readLimit / avgBytesPerEntries, -1)) - readLimit = 0 + ledger.entries } } + if (readLimit > bytesLeftInLedger) { + readLimit -= bytesLeftInLedger + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, ledger.entries, -1)) + } else { + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, entryId + readLimit / avgBytesPerEntries, -1)) + readLimit = 0 + } } - } +// } } SpecificPulsarOffset(offsets.toMap) } diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala index 7b0b51a9..2c700c3a 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala @@ -14,13 +14,12 @@ package org.apache.spark.sql.pulsar import java.util.concurrent.ConcurrentLinkedQueue - import org.apache.pulsar.client.admin.PulsarAdmin import org.apache.spark.SparkException import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming.StreamingExecutionRelation import org.apache.spark.sql.functions.{count, window} -import org.apache.spark.sql.pulsar.PulsarOptions.{ServiceUrlOptionKey, TopicPattern} +import org.apache.spark.sql.pulsar.PulsarOptions.{AdminUrlOptionKey, ServiceUrlOptionKey, TopicPattern} import org.apache.spark.sql.streaming.Trigger.ProcessingTime import org.apache.spark.util.Utils @@ -31,6 +30,7 @@ class PulsarMicroBatchV1SourceSuite extends PulsarMicroBatchSourceSuiteBase { val pulsar = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topic.*") .load() @@ -56,6 +56,7 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(TopicSingle, topic) testStream(reader.load)(makeSureGetOffsetCalled, StopStream, StartStream(), StopStream) @@ -70,6 +71,7 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { .format("pulsar") .option(TopicSingle, topic) .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] @@ -98,6 +100,7 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topicPrefix-.*") .option("failOnDataLoss", "false") @@ -133,6 +136,7 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topicPrefix-.*") .option("failOnDataLoss", "true") .option("startingOffsets", "earliest") @@ -180,6 +184,7 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val pulsar = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(StartingOffsetsOptionKey, "earliest") .option(TopicSingle, topic) .load() @@ -218,6 +223,7 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(TopicSingle, topic) .option(StartingOffsetsOptionKey, "earliest") .option(PollTimeoutMS, "1000") @@ -261,6 +267,7 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { .format("pulsar") .option(TopicSingle, topic) .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .load() val values = pulsar diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala index 8a2ade1e..eaef2e2e 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala @@ -36,6 +36,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topic.*") val pulsar = reader @@ -146,6 +147,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) options.foreach { case (k, v) => reader.option(k, v) } reader.load() } @@ -197,6 +199,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { val pulsar = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(StartingOffsetsOptionKey, "earliest") .option(TopicMulti, topic) .load() @@ -239,6 +242,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, true) .option(TopicSingle, topic) @@ -328,6 +332,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, true) .option(TopicSingle, topic) @@ -348,6 +353,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, true) .option(TopicSingle, topic) @@ -372,6 +378,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "latest") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } @@ -416,6 +423,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val pulsar = reader @@ -459,6 +467,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingTime, time0) .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val pulsar = reader @@ -493,6 +502,7 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, s1) .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val pulsar = reader From eb7309353e41ba6d0d765fce0b43e3a02d3e6030 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Thu, 10 Aug 2023 14:15:16 -0700 Subject: [PATCH 05/38] only MessageId is null tests failing --- .../spark/sql/pulsar/PulsarSource.scala | 78 ++++++++++++------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 923b004a..dca9c798 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.json.JSONOptionsInRead import org.apache.spark.sql.connector.read.streaming import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit, ReadMaxFiles, SupportsAdmissionControl} -import org.apache.spark.sql.execution.streaming.{Offset, Source} +import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset, Source} import org.apache.spark.sql.pulsar.PulsarOptions.ServiceUrlOptionKey import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets import org.apache.spark.sql.types.StructType @@ -48,7 +48,8 @@ private[pulsar] class PulsarSource( jsonOptions: JSONOptionsInRead) extends Source with Logging - with SupportsAdmissionControl { + with SupportsAdmissionControl + { import PulsarSourceUtils._ @@ -86,46 +87,67 @@ private[pulsar] class PulsarSource( // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit // start a reader, get to the earliest offset for new topic partitions val existingStartOffsets = if (startingOffset != null) { - getTopicOffsets(startingOffset.asInstanceOf[SpecificPulsarOffset]) + getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset]) } else { Map[String, MessageId]() } val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) - val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => topicPartition -> MessageId.earliest) + val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => topicPartition -> pulsarHelper.fetchLatestOffsetForTopic(topicPartition)) val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake val offsets = mutable.Map[String, MessageId]() + offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size - startPartitionOffsets.keys.foreach { topicPartition => + val startingOffsetStr = startPartitionOffsets.map { case (k, v) => + val ledgerId = getLedgerId(v) // Assuming getLedgerId is a method of the object + val entryId = getEntryId(v) // Assuming getEntryId is a method of the object + val stats = pulsarAdmin.topics().getInternalStats(k) + val numEntries = stats.numberOfEntries + val numEntriesPerLedger = pulsarAdmin.topics().getInternalStats(k).ledgers.asScala.map{ ledger => + s"[LedgerID: ${ledger.ledgerId}, Size: ${ledger.size}, Entries: ${ledger.entries}]" + }.mkString(", ") + + s"[$k, LedgerId: $ledgerId, " + + s"EntryId: $entryId, " + + s"Entries: ${numEntries}, " + + s"CurrentLedgerEntries: ${stats.currentLedgerEntries}, " + + s"CurrentLedgerSize: ${stats.currentLedgerSize}, " + + s"NumLedgers: ${stats.ledgers.size()}, " + + s"TotalSize: ${stats.totalSize}, " + + s"EntriesPerLedger: ${numEntriesPerLedger}]" + }.mkString(", ") + print(s"STARTOFFSETS: $startingOffsetStr\n") + startPartitionOffsets.keys.filter(topicPartition => { + pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0 + }).foreach { topicPartition => var readLimit = totalReadLimit / numPartitions val messageId = startPartitionOffsets.apply(topicPartition) val ledgerId = getLedgerId(messageId) val entryId = getEntryId(messageId) -// pulsarAdmin.topics().getPartitionedInternalStats(topicPartition).partitions.forEach { (_, partitionMetadata) => -// partitionMetadata.ledgers.asScala.filter(_.ledgerId < ledgerId).sortBy(_.ledgerId).foreach { ledger => - pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.asScala.filter(_.ledgerId < ledgerId).sortBy(_.ledgerId).foreach { ledger => - val avgBytesPerEntries = ledger.size / ledger.entries - // approximation of bytes left in ledger to deal with case - // where we are at the middle of the ledger - val bytesLeftInLedger = avgBytesPerEntries * { - if (ledger.ledgerId == ledgerId) { - ledger.entries - entryId - } else { - ledger.entries - } - } - if (readLimit > bytesLeftInLedger) { - readLimit -= bytesLeftInLedger - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, ledger.entries, -1)) + val stats = pulsarAdmin.topics.getInternalStats(topicPartition) + pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach{ ledger => + ledger.entries = stats.currentLedgerEntries + val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries + // approximation of bytes left in ledger to deal with case + // where we are at the middle of the ledger + val bytesLeftInLedger = avgBytesPerEntries * { + if (ledger.ledgerId == ledgerId) { + ledger.entries - entryId } else { - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, entryId + readLimit / avgBytesPerEntries, -1)) - readLimit = 0 + ledger.entries } } -// } + if (readLimit > bytesLeftInLedger) { + readLimit -= bytesLeftInLedger + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, ledger.entries - 1, -1)) + } else { + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, entryId + readLimit / avgBytesPerEntries, -1)) + readLimit = 0 + } + } } SpecificPulsarOffset(offsets.toMap) } From 6b858d7c68322c62835f4b038846c4408fa2dc71 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 14 Aug 2023 11:49:17 -0700 Subject: [PATCH 06/38] adding pulsaroption --- .../spark/sql/pulsar/PulsarOptions.scala | 1 + .../spark/sql/pulsar/PulsarProvider.scala | 8 ++++++ .../spark/sql/pulsar/PulsarSource.scala | 27 ++++--------------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala index 8e5c4b1e..d9ec02f0 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala @@ -46,6 +46,7 @@ private[pulsar] object PulsarOptions { val SubscriptionPrefix: String = "subscriptionPrefix".toLowerCase(Locale.ROOT) val PredefinedSubscription: String = "predefinedSubscription".toLowerCase(Locale.ROOT) + val MaxBytesPerTrigger: String = "maxBytesPerTrigger".toLowerCase(Locale.ROOT) val PollTimeoutMS: String = "pollTimeoutMs".toLowerCase(Locale.ROOT) val FailOnDataLossOptionKey: String = "failOnDataLoss".toLowerCase(Locale.ROOT) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala index fb0158de..122d3d4b 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala @@ -114,6 +114,7 @@ private[pulsar] class PulsarProvider metadataPath, offset, pollTimeoutMs(caseInsensitiveParams), + maxBytesPerTrigger(caseInsensitiveParams), failOnDataLoss(caseInsensitiveParams), subscriptionNamePrefix, jsonOptions) @@ -385,6 +386,13 @@ private[pulsar] object PulsarProvider extends Logging { (SparkEnv.get.conf.getTimeAsSeconds("spark.network.timeout", "120s") * 1000).toString) .toInt + private def maxBytesPerTrigger(caseInsensitiveParams: Map[String, String]): Long = + caseInsensitiveParams + .getOrElse( + PulsarOptions.MaxBytesPerTrigger, + Long.MaxValue.toString + ).toLong + private def validateGeneralOptions( caseInsensitiveParams: Map[String, String]): Map[String, String] = { if (!caseInsensitiveParams.contains(ServiceUrlOptionKey)) { diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index dca9c798..4ecbc528 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -43,13 +43,13 @@ private[pulsar] class PulsarSource( metadataPath: String, startingOffsets: PerTopicOffset, pollTimeoutMs: Int, + maxBytesPerTrigger: Long, failOnDataLoss: Boolean, subscriptionNamePrefix: String, jsonOptions: JSONOptionsInRead) extends Source with Logging - with SupportsAdmissionControl - { + with SupportsAdmissionControl { import PulsarSourceUtils._ @@ -97,25 +97,6 @@ private[pulsar] class PulsarSource( val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size - val startingOffsetStr = startPartitionOffsets.map { case (k, v) => - val ledgerId = getLedgerId(v) // Assuming getLedgerId is a method of the object - val entryId = getEntryId(v) // Assuming getEntryId is a method of the object - val stats = pulsarAdmin.topics().getInternalStats(k) - val numEntries = stats.numberOfEntries - val numEntriesPerLedger = pulsarAdmin.topics().getInternalStats(k).ledgers.asScala.map{ ledger => - s"[LedgerID: ${ledger.ledgerId}, Size: ${ledger.size}, Entries: ${ledger.entries}]" - }.mkString(", ") - - s"[$k, LedgerId: $ledgerId, " + - s"EntryId: $entryId, " + - s"Entries: ${numEntries}, " + - s"CurrentLedgerEntries: ${stats.currentLedgerEntries}, " + - s"CurrentLedgerSize: ${stats.currentLedgerSize}, " + - s"NumLedgers: ${stats.ledgers.size()}, " + - s"TotalSize: ${stats.totalSize}, " + - s"EntriesPerLedger: ${numEntriesPerLedger}]" - }.mkString(", ") - print(s"STARTOFFSETS: $startingOffsetStr\n") startPartitionOffsets.keys.filter(topicPartition => { pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0 }).foreach { topicPartition => @@ -151,7 +132,9 @@ private[pulsar] class PulsarSource( } SpecificPulsarOffset(offsets.toMap) } - + override def getDefaultReadLimit: ReadLimit = { + ReadMaxBytes.apply(maxBytesPerTrigger) + } class AdmissionLimits(var bytesToTake: Long) object AdmissionLimits { From 43477442c37198f761635db3caa2a3d87750afc2 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Tue, 15 Aug 2023 14:22:27 -0700 Subject: [PATCH 07/38] test case --- .../spark/sql/pulsar/PulsarSource.scala | 25 +++++--- .../pulsar/PulsarAdmissionControlSuite.scala | 57 +++++++++++++++++++ 2 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 4ecbc528..4eb15802 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -13,13 +13,18 @@ */ package org.apache.spark.sql.pulsar -import org.apache.pulsar.client.admin.PulsarAdmin import java.{util => ju} + +import scala.collection.JavaConverters.asScalaBufferConverter +import scala.collection.mutable + +import org.apache.pulsar.client.admin.PulsarAdmin import org.apache.pulsar.client.api.MessageId import org.apache.pulsar.client.impl.MessageIdImpl import org.apache.pulsar.client.internal.DefaultImplementation import org.apache.pulsar.common.schema.SchemaInfo + import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow @@ -31,8 +36,6 @@ import org.apache.spark.sql.pulsar.PulsarOptions.ServiceUrlOptionKey import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets import org.apache.spark.sql.types.StructType -import scala.collection.JavaConverters.asScalaBufferConverter -import scala.collection.mutable private[pulsar] class PulsarSource( serviceUrl: String, @@ -76,11 +79,11 @@ private[pulsar] class PulsarSource( initialTopicOffsets val latest = pulsarHelper.fetchLatestOffsets() currentTopicOffsets = Some(latest.topicOffsets) - logDebug(s"GetOffset: ${latest.topicOffsets.toSeq.map(_.toString).sorted}") Some(latest.asInstanceOf[Offset]) } - override def latestOffset(startingOffset: streaming.Offset, readLimit: ReadLimit): streaming.Offset = { + override def latestOffset(startingOffset: streaming.Offset, + readLimit: ReadLimit): streaming.Offset = { initialTopicOffsets // implement helper inside PulsarHelper in order to use getTopicPartitions val latestOffsets = pulsarHelper.fetchLatestOffsets().topicOffsets @@ -92,7 +95,8 @@ private[pulsar] class PulsarSource( Map[String, MessageId]() } val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) - val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => topicPartition -> pulsarHelper.fetchLatestOffsetForTopic(topicPartition)) + val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition + => topicPartition -> pulsarHelper.fetchLatestOffsetForTopic(topicPartition)) val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets @@ -105,7 +109,8 @@ private[pulsar] class PulsarSource( val ledgerId = getLedgerId(messageId) val entryId = getEntryId(messageId) val stats = pulsarAdmin.topics.getInternalStats(topicPartition) - pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach{ ledger => + pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. + asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach{ ledger => ledger.entries = stats.currentLedgerEntries val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries // approximation of bytes left in ledger to deal with case @@ -123,9 +128,11 @@ private[pulsar] class PulsarSource( .getDefaultImplementation .newMessageId(ledger.ledgerId, ledger.entries - 1, -1)) } else { + val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) + val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead) offsets += (topicPartition -> DefaultImplementation .getDefaultImplementation - .newMessageId(ledger.ledgerId, entryId + readLimit / avgBytesPerEntries, -1)) + .newMessageId(ledger.ledgerId, lastEntryRead, -1)) readLimit = 0 } } @@ -249,4 +256,4 @@ private[pulsar] class PulsarSource( } /** A read limit that admits a soft-max of `maxBytes` per micro-batch. */ -case class ReadMaxBytes(maxBytes: Long) extends ReadLimit \ No newline at end of file +case class ReadMaxBytes(maxBytes: Long) extends ReadLimit diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala new file mode 100644 index 00000000..06d76ddd --- /dev/null +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -0,0 +1,57 @@ +package org.apache.spark.sql.pulsar + +import org.apache.pulsar.client.admin.PulsarAdmin +import org.apache.spark.sql.streaming.Trigger.{Once, ProcessingTime} +import org.apache.spark.util.Utils + +class PulsarAdmissionControlSuite extends PulsarSourceTest { + + import PulsarOptions._ + import testImplicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + } + + /** + * Write unit test to create limits, can construct fake ledger statistics + * Can call latestOffset() directly from the unit test + * + * Just need to verify that each microbatch is <= maxBytesPerTrigger (within some threshold) + * Can send message of specific size in AddPulsarData here + */ + + test("Admission Control") { + val topic = newTopic() + sendMessages(topic, Array("-1")) + require(getLatestOffsets(Set(topic)).size === 1) + sparkContext.setLogLevel("INFO") + val pulsar = spark.readStream + .format("pulsar") + .option(TopicSingle, topic) + .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) + .option(MaxBytesPerTrigger, 120) + .load() + .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = pulsar.map(kv => kv._2.toInt + 1) + + // Each Int adds 38 bytes to message size, so we expect 3 Ints in each message + testStream(mapped)( + StartStream(trigger = ProcessingTime(100)), + makeSureGetOffsetCalled, + AddPulsarData(Set(topic), 1, 2, 3), + CheckLastBatch(2, 3, 4), + AddPulsarData(Set(topic), 4, 5, 6, 7, 8, 9), + CheckLastBatch(8, 9, 10), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 9 + } + ) + + } + +} From b49184471abd542e38dd722ffe7489128f70082d Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Tue, 15 Aug 2023 16:06:49 -0700 Subject: [PATCH 08/38] moving functionality to PulsarHelper --- .../spark/sql/pulsar/PulsarHelper.scala | 66 +++++++++++++++ .../spark/sql/pulsar/PulsarProvider.scala | 7 +- .../spark/sql/pulsar/PulsarSource.scala | 82 ++++--------------- 3 files changed, 85 insertions(+), 70 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 275c4d67..58f3712f 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -22,16 +22,22 @@ import scala.collection.mutable import scala.language.postfixOps import scala.util.control.NonFatal +import org.apache.pulsar.client.admin.PulsarAdmin import org.apache.pulsar.client.api.{MessageId, PulsarClient} import org.apache.pulsar.client.impl.{MessageIdImpl, PulsarClientImpl} import org.apache.pulsar.client.impl.schema.BytesSchema +import org.apache.pulsar.client.internal.DefaultImplementation import org.apache.pulsar.common.api.proto.CommandGetTopicsOfNamespace import org.apache.pulsar.common.naming.TopicName import org.apache.pulsar.common.schema.SchemaInfo import org.apache.pulsar.shade.com.google.common.util.concurrent.Uninterruptibles import org.apache.spark.internal.Logging +import org.apache.spark.sql.connector.read.streaming +import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit} import org.apache.spark.sql.pulsar.PulsarOptions._ +import org.apache.spark.sql.pulsar.PulsarSourceUtils.{getEntryId, getLedgerId} +import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets import org.apache.spark.sql.types.StructType /** @@ -40,6 +46,7 @@ import org.apache.spark.sql.types.StructType */ private[pulsar] case class PulsarHelper( serviceUrl: String, + adminUrl: String, clientConf: ju.Map[String, Object], driverGroupIdPrefix: String, caseInsensitiveParameters: Map[String, String], @@ -55,6 +62,8 @@ private[pulsar] case class PulsarHelper( private var topics: Seq[String] = _ private var topicPartitions: Seq[String] = _ + private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build() + override def close(): Unit = { // do nothing } @@ -207,6 +216,63 @@ private[pulsar] case class PulsarHelper( }.toMap) } + def latestOffsets(startingOffset: streaming.Offset, + admissionLimits: AdmissionLimits): SpecificPulsarOffset = { + // implement helper inside PulsarHelper in order to use getTopicPartitions + val latestOffsets = fetchLatestOffsets().topicOffsets + // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit + // start a reader, get to the earliest offset for new topic partitions + val existingStartOffsets = if (startingOffset != null) { + getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset]) + } else { + Map[String, MessageId]() + } + val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) + val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition + => topicPartition -> fetchLatestOffsetForTopic(topicPartition)) + val totalReadLimit = admissionLimits.bytesToTake + val offsets = mutable.Map[String, MessageId]() + offsets ++= startPartitionOffsets + val numPartitions = startPartitionOffsets.size + startPartitionOffsets.keys.filter(topicPartition => { + pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0 + }).foreach { topicPartition => + var readLimit = totalReadLimit / numPartitions + val messageId = startPartitionOffsets.apply(topicPartition) + val ledgerId = getLedgerId(messageId) + val entryId = getEntryId(messageId) + val stats = pulsarAdmin.topics.getInternalStats(topicPartition) + pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. + asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach { ledger => + ledger.entries = stats.currentLedgerEntries + val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries + // approximation of bytes left in ledger to deal with case + // where we are at the middle of the ledger + val bytesLeftInLedger = avgBytesPerEntries * { + if (ledger.ledgerId == ledgerId) { + ledger.entries - entryId + } else { + ledger.entries + } + } + if (readLimit > bytesLeftInLedger) { + readLimit -= bytesLeftInLedger + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, ledger.entries - 1, -1)) + } else { + val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) + val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead) + offsets += (topicPartition -> DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, lastEntryRead, -1)) + readLimit = 0 + } + } + } + SpecificPulsarOffset(offsets.toMap) + } + def fetchLatestOffsetForTopic(topic: String): MessageId = { val messageId = try { diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala index 122d3d4b..16560009 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala @@ -56,12 +56,13 @@ private[pulsar] class PulsarProvider parameters: Map[String, String]): (String, StructType) = { val caseInsensitiveParams = validateStreamOptions(parameters) - val (clientConfig, _, serviceUrlConfig, _) = prepareConfForReader(parameters) + val (clientConfig, _, serviceUrlConfig, adminUrl) = prepareConfForReader(parameters) val subscriptionNamePrefix = s"spark-pulsar-${UUID.randomUUID}" val inferredSchema = Utils.tryWithResource( PulsarHelper( serviceUrlConfig, + adminUrl, clientConfig, subscriptionNamePrefix, caseInsensitiveParams, @@ -91,6 +92,7 @@ private[pulsar] class PulsarProvider val subscriptionNamePrefix = getSubscriptionPrefix(parameters) val pulsarHelper = PulsarHelper( serviceUrl, + adminUrl, clientConfig, subscriptionNamePrefix, caseInsensitiveParams, @@ -127,10 +129,11 @@ private[pulsar] class PulsarProvider val subscriptionNamePrefix = getSubscriptionPrefix(parameters, isBatch = true) - val (clientConfig, readerConfig, serviceUrl, _) = prepareConfForReader(parameters) + val (clientConfig, readerConfig, serviceUrl, adminUrl) = prepareConfForReader(parameters) val (start, end, schema, pSchema) = Utils.tryWithResource( PulsarHelper( serviceUrl, + adminUrl, clientConfig, subscriptionNamePrefix, caseInsensitiveParams, diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 4eb15802..0469f445 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -68,89 +68,25 @@ private[pulsar] class PulsarSource( private var currentTopicOffsets: Option[Map[String, MessageId]] = None - private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(serviceUrl).build() private lazy val pulsarSchema: SchemaInfo = pulsarHelper.getPulsarSchema override def schema(): StructType = SchemaUtils.pulsarSourceSchema(pulsarSchema) override def getOffset: Option[Offset] = { - // Make sure initialTopicOffsets is initialized - initialTopicOffsets - val latest = pulsarHelper.fetchLatestOffsets() - currentTopicOffsets = Some(latest.topicOffsets) - Some(latest.asInstanceOf[Offset]) + throw new UnsupportedOperationException( + "latestOffset(Offset, ReadLimit) should be called instead of this method") } override def latestOffset(startingOffset: streaming.Offset, readLimit: ReadLimit): streaming.Offset = { initialTopicOffsets - // implement helper inside PulsarHelper in order to use getTopicPartitions - val latestOffsets = pulsarHelper.fetchLatestOffsets().topicOffsets - // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit - // start a reader, get to the earliest offset for new topic partitions - val existingStartOffsets = if (startingOffset != null) { - getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset]) - } else { - Map[String, MessageId]() - } - val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) - val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition - => topicPartition -> pulsarHelper.fetchLatestOffsetForTopic(topicPartition)) - val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake - val offsets = mutable.Map[String, MessageId]() - offsets ++= startPartitionOffsets - val numPartitions = startPartitionOffsets.size - startPartitionOffsets.keys.filter(topicPartition => { - pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0 - }).foreach { topicPartition => - var readLimit = totalReadLimit / numPartitions - val messageId = startPartitionOffsets.apply(topicPartition) - val ledgerId = getLedgerId(messageId) - val entryId = getEntryId(messageId) - val stats = pulsarAdmin.topics.getInternalStats(topicPartition) - pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. - asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach{ ledger => - ledger.entries = stats.currentLedgerEntries - val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries - // approximation of bytes left in ledger to deal with case - // where we are at the middle of the ledger - val bytesLeftInLedger = avgBytesPerEntries * { - if (ledger.ledgerId == ledgerId) { - ledger.entries - entryId - } else { - ledger.entries - } - } - if (readLimit > bytesLeftInLedger) { - readLimit -= bytesLeftInLedger - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, ledger.entries - 1, -1)) - } else { - val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) - val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead) - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, lastEntryRead, -1)) - readLimit = 0 - } - } - } - SpecificPulsarOffset(offsets.toMap) + val admissionLimits = AdmissionLimits(readLimit) + pulsarHelper.latestOffsets(startingOffset, admissionLimits.get) } override def getDefaultReadLimit: ReadLimit = { ReadMaxBytes.apply(maxBytesPerTrigger) } - class AdmissionLimits(var bytesToTake: Long) - - object AdmissionLimits { - def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match { - case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes)) - case _ : ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue)) - } - - } override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialTopicOffsets is initialized @@ -257,3 +193,13 @@ private[pulsar] class PulsarSource( /** A read limit that admits a soft-max of `maxBytes` per micro-batch. */ case class ReadMaxBytes(maxBytes: Long) extends ReadLimit + +class AdmissionLimits(var bytesToTake: Long) + +object AdmissionLimits { + def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match { + case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes)) + case _: ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue)) + } + +} From 423efe402025b268c9f0eb2064001bc91846b443 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Tue, 15 Aug 2023 19:18:12 -0700 Subject: [PATCH 09/38] feedback and refactoring --- .../spark/sql/pulsar/PulsarHelper.scala | 80 +++++++++++-------- .../spark/sql/pulsar/PulsarSource.scala | 12 ++- 2 files changed, 55 insertions(+), 37 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 58f3712f..c9d56b97 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -234,41 +234,11 @@ private[pulsar] case class PulsarHelper( val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size - startPartitionOffsets.keys.filter(topicPartition => { - pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0 - }).foreach { topicPartition => - var readLimit = totalReadLimit / numPartitions - val messageId = startPartitionOffsets.apply(topicPartition) - val ledgerId = getLedgerId(messageId) - val entryId = getEntryId(messageId) - val stats = pulsarAdmin.topics.getInternalStats(topicPartition) - pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. - asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach { ledger => - ledger.entries = stats.currentLedgerEntries - val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries - // approximation of bytes left in ledger to deal with case - // where we are at the middle of the ledger - val bytesLeftInLedger = avgBytesPerEntries * { - if (ledger.ledgerId == ledgerId) { - ledger.entries - entryId - } else { - ledger.entries - } - } - if (readLimit > bytesLeftInLedger) { - readLimit -= bytesLeftInLedger - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, ledger.entries - 1, -1)) - } else { - val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) - val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead) - offsets += (topicPartition -> DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, lastEntryRead, -1)) - readLimit = 0 - } - } + // move all topic partition logic to helper function + startPartitionOffsets.keys.foreach { topicPartition => + val readLimit = totalReadLimit / numPartitions + val startMessageId = startPartitionOffsets.apply(topicPartition) + offsets += (topicPartition -> latestOffsetForTopic(topicPartition, startMessageId, readLimit)) } SpecificPulsarOffset(offsets.toMap) } @@ -286,6 +256,46 @@ private[pulsar] case class PulsarHelper( PulsarSourceUtils.seekableLatestMid(messageId) } + def latestOffsetForTopic(topicPartition: String, + startMessageId: MessageId, + readLimit: Long): MessageId = { + val startLedgerId = getLedgerId(startMessageId) + val startEntryId = getEntryId(startMessageId) + val stats = pulsarAdmin.topics.getInternalStats(topicPartition) + val ledgers = pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. + asScala.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId) + if (ledgers.nonEmpty) { + ledgers.last.size = stats.currentLedgerSize + ledgers.last.entries = stats.currentLedgerEntries + } + var messageId = startMessageId + var readLimitLeft = readLimit + ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => + val avgBytesPerEntries = ledger.size / ledger.entries + // approximation of bytes left in ledger to deal with case + // where we are at the middle of the ledger + val bytesLeftInLedger = if (ledger.ledgerId == startLedgerId) { + avgBytesPerEntries * (ledger.entries - startEntryId - 1) + } else { + ledger.size + } + if (readLimitLeft > bytesLeftInLedger) { + readLimitLeft -= bytesLeftInLedger + messageId = DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) + } else { + val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) + val lastEntryRead = Math.min(ledger.entries - 1, startEntryId + numEntriesToRead) + messageId = DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, lastEntryRead, -1) + readLimitLeft = 0 + } + } + messageId + } + def fetchEarliestOffsets(topics: Seq[String]): Map[String, MessageId] = { if (topics.isEmpty) { Map.empty[String, MessageId] diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 0469f445..3a09d6d1 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -82,10 +82,18 @@ private[pulsar] class PulsarSource( readLimit: ReadLimit): streaming.Offset = { initialTopicOffsets val admissionLimits = AdmissionLimits(readLimit) - pulsarHelper.latestOffsets(startingOffset, admissionLimits.get) + if (admissionLimits.isEmpty) { + pulsarHelper.fetchLatestOffsets() + } else { + pulsarHelper.latestOffsets(startingOffset, admissionLimits.get) + } } override def getDefaultReadLimit: ReadLimit = { + if (maxBytesPerTrigger == Long.MaxValue) { + ReadLimit.allAvailable() + } else { ReadMaxBytes.apply(maxBytesPerTrigger) + } } override def getBatch(start: Option[Offset], end: Offset): DataFrame = { @@ -199,7 +207,7 @@ class AdmissionLimits(var bytesToTake: Long) object AdmissionLimits { def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match { case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes)) - case _: ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue)) + case _: ReadAllAvailable => None } } From 769890e7f149102f7b0c8378ea62aa3f7a90ab3a Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 11:00:58 -0700 Subject: [PATCH 10/38] feedback --- .../spark/sql/pulsar/PulsarHelper.scala | 14 ++++++++------ .../spark/sql/pulsar/PulsarProvider.scala | 19 ++++++++++++++----- .../spark/sql/pulsar/PulsarSource.scala | 11 ++++------- .../pulsar/PulsarAdmissionControlSuite.scala | 5 ++--- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index c9d56b97..cce9d4ba 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -46,7 +46,7 @@ import org.apache.spark.sql.types.StructType */ private[pulsar] case class PulsarHelper( serviceUrl: String, - adminUrl: String, + adminUrl: Option[String], clientConf: ju.Map[String, Object], driverGroupIdPrefix: String, caseInsensitiveParameters: Map[String, String], @@ -62,7 +62,7 @@ private[pulsar] case class PulsarHelper( private var topics: Seq[String] = _ private var topicPartitions: Seq[String] = _ - private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build() + private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl.get).build() override def close(): Unit = { // do nothing @@ -217,9 +217,9 @@ private[pulsar] case class PulsarHelper( } def latestOffsets(startingOffset: streaming.Offset, - admissionLimits: AdmissionLimits): SpecificPulsarOffset = { + totalReadLimit: Long): SpecificPulsarOffset = { // implement helper inside PulsarHelper in order to use getTopicPartitions - val latestOffsets = fetchLatestOffsets().topicOffsets + val topicPartitions = getTopicPartitions // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit // start a reader, get to the earliest offset for new topic partitions val existingStartOffsets = if (startingOffset != null) { @@ -227,10 +227,9 @@ private[pulsar] case class PulsarHelper( } else { Map[String, MessageId]() } - val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet) + val newTopics = topicPartitions.toSet.diff(existingStartOffsets.keySet) val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => topicPartition -> fetchLatestOffsetForTopic(topicPartition)) - val totalReadLimit = admissionLimits.bytesToTake val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size @@ -264,6 +263,9 @@ private[pulsar] case class PulsarHelper( val stats = pulsarAdmin.topics.getInternalStats(topicPartition) val ledgers = pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. asScala.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId) + // The last ledger of the ledgers list doesn't have .size or .entries + // properly populated, and the corresponding info is in currentLedgerSize + // and currentLedgerEntries if (ledgers.nonEmpty) { ledgers.last.size = stats.currentLedgerSize ledgers.last.entries = stats.currentLedgerEntries diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala index 16560009..945b0644 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala @@ -107,8 +107,13 @@ private[pulsar] class PulsarProvider pulsarHelper.offsetForEachTopic(caseInsensitiveParams, LatestOffset, StartOptionKey) pulsarHelper.setupCursor(offset) + val maxBytes = maxBytesPerTrigger(caseInsensitiveParams) + if (adminUrl.isEmpty && maxBytes != 0L) { + throw new IllegalArgumentException("admin.url " + + "must be specified if maxBytesPerTrigger is specified") + } + new PulsarSource( - adminUrl, sqlContext, pulsarHelper, clientConfig, @@ -371,8 +376,12 @@ private[pulsar] object PulsarProvider extends Logging { parameters(ServiceUrlOptionKey) } - private def getAdminUrl(parameters: Map[String, String]): String = { - parameters(AdminUrlOptionKey) + private def getAdminUrl(parameters: Map[String, String]): Option[String] = { + val adminUrl = parameters.getOrElse(AdminUrlOptionKey, "") + adminUrl match { + case "" => None + case s => Option(s) + } } private def getAllowDifferentTopicSchemas(parameters: Map[String, String]): Boolean = { @@ -393,7 +402,7 @@ private[pulsar] object PulsarProvider extends Logging { caseInsensitiveParams .getOrElse( PulsarOptions.MaxBytesPerTrigger, - Long.MaxValue.toString + 0L.toString ).toLong private def validateGeneralOptions( @@ -502,7 +511,7 @@ private[pulsar] object PulsarProvider extends Logging { } private def prepareConfForReader(parameters: Map[String, String]) - : (ju.Map[String, Object], ju.Map[String, Object], String, String) = { + : (ju.Map[String, Object], ju.Map[String, Object], String, Option[String]) = { val serviceUrl = getServiceUrl(parameters) val adminUrl = getAdminUrl(parameters) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 3a09d6d1..39e75c72 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -38,7 +38,6 @@ import org.apache.spark.sql.types.StructType private[pulsar] class PulsarSource( - serviceUrl: String, sqlContext: SQLContext, pulsarHelper: PulsarHelper, clientConf: ju.Map[String, Object], @@ -81,15 +80,13 @@ private[pulsar] class PulsarSource( override def latestOffset(startingOffset: streaming.Offset, readLimit: ReadLimit): streaming.Offset = { initialTopicOffsets - val admissionLimits = AdmissionLimits(readLimit) - if (admissionLimits.isEmpty) { - pulsarHelper.fetchLatestOffsets() - } else { - pulsarHelper.latestOffsets(startingOffset, admissionLimits.get) + readLimit match { + case ReadMaxBytes(maxBytes) => pulsarHelper.latestOffsets(startingOffset, maxBytes) + case _: ReadAllAvailable => pulsarHelper.fetchLatestOffsets() } } override def getDefaultReadLimit: ReadLimit = { - if (maxBytesPerTrigger == Long.MaxValue) { + if (maxBytesPerTrigger == 0L) { ReadLimit.allAvailable() } else { ReadMaxBytes.apply(maxBytesPerTrigger) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 06d76ddd..c9db4b79 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -21,7 +21,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { * Can send message of specific size in AddPulsarData here */ - test("Admission Control") { + test("Check last batch where message size is greater than maxBytesPerTrigger") { val topic = newTopic() sendMessages(topic, Array("-1")) require(getLatestOffsets(Set(topic)).size === 1) @@ -40,7 +40,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { // Each Int adds 38 bytes to message size, so we expect 3 Ints in each message testStream(mapped)( - StartStream(trigger = ProcessingTime(100)), + StartStream(trigger = ProcessingTime(10)), makeSureGetOffsetCalled, AddPulsarData(Set(topic), 1, 2, 3), CheckLastBatch(2, 3, 4), @@ -51,7 +51,6 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { recordsRead == 9 } ) - } } From dbadb7e98109e0e2c7dcc935eb5d64649f546b94 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 11:10:43 -0700 Subject: [PATCH 11/38] dealing with startLedgerId == ledger.id --- .../scala/org/apache/spark/sql/pulsar/PulsarHelper.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index cce9d4ba..7f8e84e5 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -288,7 +288,12 @@ private[pulsar] case class PulsarHelper( .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) } else { val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) - val lastEntryRead = Math.min(ledger.entries - 1, startEntryId + numEntriesToRead) + val lastEntryId = if (ledger.ledgerId == startLedgerId) { + numEntriesToRead - 1 + } else { + startEntryId + numEntriesToRead + } + val lastEntryRead = Math.min(ledger.entries - 1, lastEntryId) messageId = DefaultImplementation .getDefaultImplementation .newMessageId(ledger.ledgerId, lastEntryRead, -1) From 54ea233df614361e15e74c9bbe024f0064a117cb Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 11:18:02 -0700 Subject: [PATCH 12/38] fix --- src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 7f8e84e5..ec63eee8 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -288,7 +288,7 @@ private[pulsar] case class PulsarHelper( .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) } else { val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) - val lastEntryId = if (ledger.ledgerId == startLedgerId) { + val lastEntryId = if (ledger.ledgerId != startLedgerId) { numEntriesToRead - 1 } else { startEntryId + numEntriesToRead From 843f7c971cadf489595e91a7f6c6b385d56301d9 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 11:20:56 -0700 Subject: [PATCH 13/38] check readLimit greater than 0 --- src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index ec63eee8..bdac9989 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -286,7 +286,7 @@ private[pulsar] case class PulsarHelper( messageId = DefaultImplementation .getDefaultImplementation .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) - } else { + } else if (readLimitLeft > 0) { val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) val lastEntryId = if (ledger.ledgerId != startLedgerId) { numEntriesToRead - 1 From 8f51130db2fa366ac91631b53ca808697bd82be4 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 11:40:07 -0700 Subject: [PATCH 14/38] early return if readLimitLeft == 0 --- .../scala/org/apache/spark/sql/pulsar/PulsarHelper.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index bdac9989..be07767a 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -62,6 +62,9 @@ private[pulsar] case class PulsarHelper( private var topics: Seq[String] = _ private var topicPartitions: Seq[String] = _ + // We can do this because pulsarAdmin will only be called if latestOffset is called + // and there should be an exception thrown in PulsarProvider if maxBytes is set, + // and maxBytes is not set private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl.get).build() override def close(): Unit = { @@ -273,6 +276,9 @@ private[pulsar] case class PulsarHelper( var messageId = startMessageId var readLimitLeft = readLimit ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => + if (readLimitLeft == 0) { + return messageId + } val avgBytesPerEntries = ledger.size / ledger.entries // approximation of bytes left in ledger to deal with case // where we are at the middle of the ledger From 0cf4dd79ba8e37ba263057c408e0ba58ac3f2a14 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 11:48:43 -0700 Subject: [PATCH 15/38] increasing processing time --- .../apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index c9db4b79..0b434df4 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -40,7 +40,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { // Each Int adds 38 bytes to message size, so we expect 3 Ints in each message testStream(mapped)( - StartStream(trigger = ProcessingTime(10)), + StartStream(trigger = ProcessingTime(100)), makeSureGetOffsetCalled, AddPulsarData(Set(topic), 1, 2, 3), CheckLastBatch(2, 3, 4), From fb645ec8518ad39dcb7201fcb4bfad0cc5406a1c Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 13:04:23 -0700 Subject: [PATCH 16/38] removing unnecessary code --- .../org/apache/spark/sql/pulsar/PulsarSource.scala | 9 --------- .../sql/pulsar/PulsarMicroBatchSourceSuite.scala | 11 ++--------- .../spark/sql/pulsar/PulsarSourceSuiteBase.scala | 10 ---------- 3 files changed, 2 insertions(+), 28 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 39e75c72..b0d0c6b1 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -199,12 +199,3 @@ private[pulsar] class PulsarSource( /** A read limit that admits a soft-max of `maxBytes` per micro-batch. */ case class ReadMaxBytes(maxBytes: Long) extends ReadLimit -class AdmissionLimits(var bytesToTake: Long) - -object AdmissionLimits { - def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match { - case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes)) - case _: ReadAllAvailable => None - } - -} diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala index 2c700c3a..7b0b51a9 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarMicroBatchSourceSuite.scala @@ -14,12 +14,13 @@ package org.apache.spark.sql.pulsar import java.util.concurrent.ConcurrentLinkedQueue + import org.apache.pulsar.client.admin.PulsarAdmin import org.apache.spark.SparkException import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming.StreamingExecutionRelation import org.apache.spark.sql.functions.{count, window} -import org.apache.spark.sql.pulsar.PulsarOptions.{AdminUrlOptionKey, ServiceUrlOptionKey, TopicPattern} +import org.apache.spark.sql.pulsar.PulsarOptions.{ServiceUrlOptionKey, TopicPattern} import org.apache.spark.sql.streaming.Trigger.ProcessingTime import org.apache.spark.util.Utils @@ -30,7 +31,6 @@ class PulsarMicroBatchV1SourceSuite extends PulsarMicroBatchSourceSuiteBase { val pulsar = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topic.*") .load() @@ -56,7 +56,6 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(TopicSingle, topic) testStream(reader.load)(makeSureGetOffsetCalled, StopStream, StartStream(), StopStream) @@ -71,7 +70,6 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { .format("pulsar") .option(TopicSingle, topic) .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] @@ -100,7 +98,6 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topicPrefix-.*") .option("failOnDataLoss", "false") @@ -136,7 +133,6 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topicPrefix-.*") .option("failOnDataLoss", "true") .option("startingOffsets", "earliest") @@ -184,7 +180,6 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val pulsar = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(StartingOffsetsOptionKey, "earliest") .option(TopicSingle, topic) .load() @@ -223,7 +218,6 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(TopicSingle, topic) .option(StartingOffsetsOptionKey, "earliest") .option(PollTimeoutMS, "1000") @@ -267,7 +261,6 @@ abstract class PulsarMicroBatchSourceSuiteBase extends PulsarSourceSuiteBase { .format("pulsar") .option(TopicSingle, topic) .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .load() val values = pulsar diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala index eaef2e2e..8a2ade1e 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceSuiteBase.scala @@ -36,7 +36,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(TopicPattern, s"$topic.*") val pulsar = reader @@ -147,7 +146,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) options.foreach { case (k, v) => reader.option(k, v) } reader.load() } @@ -199,7 +197,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { val pulsar = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(StartingOffsetsOptionKey, "earliest") .option(TopicMulti, topic) .load() @@ -242,7 +239,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, true) .option(TopicSingle, topic) @@ -332,7 +328,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, true) .option(TopicSingle, topic) @@ -353,7 +348,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, true) .option(TopicSingle, topic) @@ -378,7 +372,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "latest") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } @@ -423,7 +416,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, "earliest") .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val pulsar = reader @@ -467,7 +459,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingTime, time0) .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val pulsar = reader @@ -502,7 +493,6 @@ abstract class PulsarSourceSuiteBase extends PulsarSourceTest { .format("pulsar") .option(StartingOffsetsOptionKey, s1) .option(ServiceUrlOptionKey, serviceUrl) - .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val pulsar = reader From f4a3b39964e8e7005438da5876251cf5a5cfd5e0 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Wed, 16 Aug 2023 16:41:20 -0700 Subject: [PATCH 17/38] checking if consumer is connected + pulsaradmissionhelper --- .../spark/sql/pulsar/PulsarHelper.scala | 6 +- .../pulsar/PulsarAdmissionControlHelper.scala | 88 +++++++++++++++++++ .../pulsar/PulsarAdmissionControlSuite.scala | 16 +++- 3 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index be07767a..1ec4ab4d 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -134,7 +134,9 @@ private[pulsar] case class PulsarHelper( offset.foreach { case (tp, mid) => try { val (subscription, _) = extractSubscription(predefinedSubscription, tp) - CachedConsumer.getOrCreate(tp, subscription, client).seek(mid) + val consumer = CachedConsumer.getOrCreate(tp, subscription, client) + if (!consumer.isConnected) consumer.getLastMessageId + consumer.seek(mid) } catch { case e: Throwable => throw new RuntimeException( @@ -232,7 +234,7 @@ private[pulsar] case class PulsarHelper( } val newTopics = topicPartitions.toSet.diff(existingStartOffsets.keySet) val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition - => topicPartition -> fetchLatestOffsetForTopic(topicPartition)) + => topicPartition -> MessageId.earliest) val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala new file mode 100644 index 00000000..ab01b1e9 --- /dev/null +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala @@ -0,0 +1,88 @@ +package org.apache.spark.sql.pulsar + +import java.{util => ju} +import java.io.Closeable +import java.util.concurrent.TimeUnit +import java.util.regex.Pattern + +import scala.collection.mutable +import scala.language.postfixOps +import scala.util.control.NonFatal + +import org.apache.pulsar.client.admin.PulsarAdmin +import org.apache.pulsar.client.api.{MessageId, PulsarClient} +import org.apache.pulsar.client.impl.{MessageIdImpl, PulsarClientImpl} +import org.apache.pulsar.client.impl.schema.BytesSchema +import org.apache.pulsar.client.internal.DefaultImplementation +import org.apache.pulsar.common.api.proto.CommandGetTopicsOfNamespace +import org.apache.pulsar.common.naming.TopicName +import org.apache.pulsar.common.schema.SchemaInfo +import org.apache.pulsar.shade.com.google.common.util.concurrent.Uninterruptibles + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.connector.read.streaming +import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit} +import org.apache.spark.sql.pulsar.PulsarOptions._ +import org.apache.spark.sql.pulsar.PulsarSourceUtils.{getEntryId, getLedgerId} +import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets +import org.apache.spark.sql.types.StructType + + +class PulsarAdmissionControlHelper(adminUrl: String) { + + import scala.collection.JavaConverters._ + + private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build() + + def latestOffsetForTopic(topicPartition: String, + startMessageId: MessageId, + readLimit: Long): MessageId = { + val startLedgerId = getLedgerId(startMessageId) + val startEntryId = getEntryId(startMessageId) + val stats = pulsarAdmin.topics.getInternalStats(topicPartition) + val ledgers = pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. + asScala.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId) + // The last ledger of the ledgers list doesn't have .size or .entries + // properly populated, and the corresponding info is in currentLedgerSize + // and currentLedgerEntries + if (ledgers.nonEmpty) { + ledgers.last.size = stats.currentLedgerSize + ledgers.last.entries = stats.currentLedgerEntries + } + var messageId = startMessageId + var readLimitLeft = readLimit + ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => + if (readLimitLeft == 0) { + return messageId + } + val avgBytesPerEntries = ledger.size / ledger.entries + // approximation of bytes left in ledger to deal with case + // where we are at the middle of the ledger + val bytesLeftInLedger = if (ledger.ledgerId == startLedgerId) { + avgBytesPerEntries * (ledger.entries - startEntryId - 1) + } else { + ledger.size + } + if (readLimitLeft > bytesLeftInLedger) { + readLimitLeft -= bytesLeftInLedger + messageId = DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) + } else if (readLimitLeft > 0) { + val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) + val lastEntryId = if (ledger.ledgerId != startLedgerId) { + numEntriesToRead - 1 + } else { + startEntryId + numEntriesToRead + } + val lastEntryRead = Math.min(ledger.entries - 1, lastEntryId) + messageId = DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, lastEntryRead, -1) + readLimitLeft = 0 + } + } + messageId + } + +} diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 0b434df4..30de8e6b 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -1,6 +1,7 @@ package org.apache.spark.sql.pulsar import org.apache.pulsar.client.admin.PulsarAdmin +import org.apache.pulsar.client.internal.DefaultImplementation import org.apache.spark.sql.streaming.Trigger.{Once, ProcessingTime} import org.apache.spark.util.Utils @@ -13,6 +14,10 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { super.beforeAll() } + override def afterAll(): Unit = { + super.afterAll() + } + /** * Write unit test to create limits, can construct fake ledger statistics * Can call latestOffset() directly from the unit test @@ -31,7 +36,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .option(TopicSingle, topic) .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) - .option(MaxBytesPerTrigger, 120) + .option(MaxBytesPerTrigger, 150) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] @@ -40,7 +45,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { // Each Int adds 38 bytes to message size, so we expect 3 Ints in each message testStream(mapped)( - StartStream(trigger = ProcessingTime(100)), + StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarData(Set(topic), 1, 2, 3), CheckLastBatch(2, 3, 4), @@ -53,4 +58,11 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { ) } +// test("latest") { +// val topic = newTopic() +// sendMessages(topic, Array("1")) +// +// val adminu: String = adminUrl +// val pulsarHelper = new PulsarAdmissionControlHelper(adminu) +// } } From be7d93c6322e4a1ed2742e67d077fce95ab78f5f Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Fri, 18 Aug 2023 09:11:12 -0700 Subject: [PATCH 18/38] putting latestOffsetForTopic in AdmissionControlHelper --- .../spark/sql/pulsar/PulsarHelper.scala | 110 +++++++++--------- .../pulsar/PulsarAdmissionControlHelper.scala | 88 -------------- .../pulsar/PulsarAdmissionControlSuite.scala | 21 ++-- 3 files changed, 72 insertions(+), 147 deletions(-) delete mode 100644 src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 1ec4ab4d..5722c0a0 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -66,6 +66,7 @@ private[pulsar] case class PulsarHelper( // and there should be an exception thrown in PulsarProvider if maxBytes is set, // and maxBytes is not set private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl.get).build() + private lazy val admissionControlHelper = new PulsarAdmissionControlHelper(pulsarAdmin) override def close(): Unit = { // do nothing @@ -242,7 +243,8 @@ private[pulsar] case class PulsarHelper( startPartitionOffsets.keys.foreach { topicPartition => val readLimit = totalReadLimit / numPartitions val startMessageId = startPartitionOffsets.apply(topicPartition) - offsets += (topicPartition -> latestOffsetForTopic(topicPartition, startMessageId, readLimit)) + offsets += (topicPartition -> admissionControlHelper.latestOffsetForTopic( + topicPartition, startMessageId, readLimit)) } SpecificPulsarOffset(offsets.toMap) } @@ -260,57 +262,6 @@ private[pulsar] case class PulsarHelper( PulsarSourceUtils.seekableLatestMid(messageId) } - def latestOffsetForTopic(topicPartition: String, - startMessageId: MessageId, - readLimit: Long): MessageId = { - val startLedgerId = getLedgerId(startMessageId) - val startEntryId = getEntryId(startMessageId) - val stats = pulsarAdmin.topics.getInternalStats(topicPartition) - val ledgers = pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. - asScala.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId) - // The last ledger of the ledgers list doesn't have .size or .entries - // properly populated, and the corresponding info is in currentLedgerSize - // and currentLedgerEntries - if (ledgers.nonEmpty) { - ledgers.last.size = stats.currentLedgerSize - ledgers.last.entries = stats.currentLedgerEntries - } - var messageId = startMessageId - var readLimitLeft = readLimit - ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => - if (readLimitLeft == 0) { - return messageId - } - val avgBytesPerEntries = ledger.size / ledger.entries - // approximation of bytes left in ledger to deal with case - // where we are at the middle of the ledger - val bytesLeftInLedger = if (ledger.ledgerId == startLedgerId) { - avgBytesPerEntries * (ledger.entries - startEntryId - 1) - } else { - ledger.size - } - if (readLimitLeft > bytesLeftInLedger) { - readLimitLeft -= bytesLeftInLedger - messageId = DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) - } else if (readLimitLeft > 0) { - val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) - val lastEntryId = if (ledger.ledgerId != startLedgerId) { - numEntriesToRead - 1 - } else { - startEntryId + numEntriesToRead - } - val lastEntryRead = Math.min(ledger.entries - 1, lastEntryId) - messageId = DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, lastEntryRead, -1) - readLimitLeft = 0 - } - } - messageId - } - def fetchEarliestOffsets(topics: Seq[String]): Map[String, MessageId] = { if (topics.isEmpty) { Map.empty[String, MessageId] @@ -563,3 +514,58 @@ private[pulsar] case class PulsarHelper( CachedConsumer.getOrCreate(topic, subscriptionName, client).getLastMessageId } } + +class PulsarAdmissionControlHelper(pulsarAdmin: PulsarAdmin) { + + import scala.collection.JavaConverters._ + def latestOffsetForTopic(topicPartition: String, + startMessageId: MessageId, + readLimit: Long): MessageId = { + val startLedgerId = getLedgerId(startMessageId) + val startEntryId = getEntryId(startMessageId) + val stats = pulsarAdmin.topics.getInternalStats(topicPartition) + val ledgers = pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. + asScala.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId) + // The last ledger of the ledgers list doesn't have .size or .entries + // properly populated, and the corresponding info is in currentLedgerSize + // and currentLedgerEntries + if (ledgers.nonEmpty) { + ledgers.last.size = stats.currentLedgerSize + ledgers.last.entries = stats.currentLedgerEntries + } + var messageId = startMessageId + var readLimitLeft = readLimit + ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => + if (readLimitLeft == 0) { + return messageId + } + val avgBytesPerEntries = ledger.size / ledger.entries + // approximation of bytes left in ledger to deal with case + // where we are at the middle of the ledger + val bytesLeftInLedger = if (ledger.ledgerId == startLedgerId) { + avgBytesPerEntries * (ledger.entries - startEntryId - 1) + } else { + ledger.size + } + if (readLimitLeft > bytesLeftInLedger) { + readLimitLeft -= bytesLeftInLedger + messageId = DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) + } else if (readLimitLeft > 0) { + val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) + val lastEntryId = if (ledger.ledgerId != startLedgerId) { + numEntriesToRead - 1 + } else { + startEntryId + numEntriesToRead + } + val lastEntryRead = Math.min(ledger.entries - 1, lastEntryId) + messageId = DefaultImplementation + .getDefaultImplementation + .newMessageId(ledger.ledgerId, lastEntryRead, -1) + readLimitLeft = 0 + } + } + messageId + } +} diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala deleted file mode 100644 index ab01b1e9..00000000 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlHelper.scala +++ /dev/null @@ -1,88 +0,0 @@ -package org.apache.spark.sql.pulsar - -import java.{util => ju} -import java.io.Closeable -import java.util.concurrent.TimeUnit -import java.util.regex.Pattern - -import scala.collection.mutable -import scala.language.postfixOps -import scala.util.control.NonFatal - -import org.apache.pulsar.client.admin.PulsarAdmin -import org.apache.pulsar.client.api.{MessageId, PulsarClient} -import org.apache.pulsar.client.impl.{MessageIdImpl, PulsarClientImpl} -import org.apache.pulsar.client.impl.schema.BytesSchema -import org.apache.pulsar.client.internal.DefaultImplementation -import org.apache.pulsar.common.api.proto.CommandGetTopicsOfNamespace -import org.apache.pulsar.common.naming.TopicName -import org.apache.pulsar.common.schema.SchemaInfo -import org.apache.pulsar.shade.com.google.common.util.concurrent.Uninterruptibles - -import org.apache.spark.internal.Logging -import org.apache.spark.sql.connector.read.streaming -import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit} -import org.apache.spark.sql.pulsar.PulsarOptions._ -import org.apache.spark.sql.pulsar.PulsarSourceUtils.{getEntryId, getLedgerId} -import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets -import org.apache.spark.sql.types.StructType - - -class PulsarAdmissionControlHelper(adminUrl: String) { - - import scala.collection.JavaConverters._ - - private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build() - - def latestOffsetForTopic(topicPartition: String, - startMessageId: MessageId, - readLimit: Long): MessageId = { - val startLedgerId = getLedgerId(startMessageId) - val startEntryId = getEntryId(startMessageId) - val stats = pulsarAdmin.topics.getInternalStats(topicPartition) - val ledgers = pulsarAdmin.topics.getInternalStats(topicPartition).ledgers. - asScala.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId) - // The last ledger of the ledgers list doesn't have .size or .entries - // properly populated, and the corresponding info is in currentLedgerSize - // and currentLedgerEntries - if (ledgers.nonEmpty) { - ledgers.last.size = stats.currentLedgerSize - ledgers.last.entries = stats.currentLedgerEntries - } - var messageId = startMessageId - var readLimitLeft = readLimit - ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => - if (readLimitLeft == 0) { - return messageId - } - val avgBytesPerEntries = ledger.size / ledger.entries - // approximation of bytes left in ledger to deal with case - // where we are at the middle of the ledger - val bytesLeftInLedger = if (ledger.ledgerId == startLedgerId) { - avgBytesPerEntries * (ledger.entries - startEntryId - 1) - } else { - ledger.size - } - if (readLimitLeft > bytesLeftInLedger) { - readLimitLeft -= bytesLeftInLedger - messageId = DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) - } else if (readLimitLeft > 0) { - val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) - val lastEntryId = if (ledger.ledgerId != startLedgerId) { - numEntriesToRead - 1 - } else { - startEntryId + numEntriesToRead - } - val lastEntryRead = Math.min(ledger.entries - 1, lastEntryId) - messageId = DefaultImplementation - .getDefaultImplementation - .newMessageId(ledger.ledgerId, lastEntryRead, -1) - readLimitLeft = 0 - } - } - messageId - } - -} diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 30de8e6b..c95f8766 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -1,7 +1,9 @@ package org.apache.spark.sql.pulsar import org.apache.pulsar.client.admin.PulsarAdmin +import org.apache.pulsar.client.api.MessageId import org.apache.pulsar.client.internal.DefaultImplementation +import org.apache.spark.sql.pulsar.PulsarSourceUtils.{getEntryId, getLedgerId} import org.apache.spark.sql.streaming.Trigger.{Once, ProcessingTime} import org.apache.spark.util.Utils @@ -58,11 +60,16 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { ) } -// test("latest") { -// val topic = newTopic() -// sendMessages(topic, Array("1")) -// -// val adminu: String = adminUrl -// val pulsarHelper = new PulsarAdmissionControlHelper(adminu) -// } + test("latest") { + val topic = newTopic() + sendMessages(topic, Array("-1")) + require(getLatestOffsets(Set(topic)).size === 1) + Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => { + val admissionControlHelper = new PulsarAdmissionControlHelper(admin) + val offset = admissionControlHelper.latestOffsetForTopic(topic, MessageId.earliest, 10) + logInfo(s"MESSAGE ID: [${getLedgerId(offset)}, ${getEntryId(offset)}]\n") + + } + } + } } From 9a4b1a4138bf03bba11006631abbaabeb2737e06 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Fri, 18 Aug 2023 10:29:46 -0700 Subject: [PATCH 19/38] added more tests for admission control --- .../spark/sql/pulsar/PulsarHelper.scala | 3 +- .../pulsar/PulsarAdmissionControlSuite.scala | 59 +++++++++++++++++-- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 5722c0a0..a8ea106c 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -515,7 +515,8 @@ private[pulsar] case class PulsarHelper( } } -class PulsarAdmissionControlHelper(pulsarAdmin: PulsarAdmin) { +class PulsarAdmissionControlHelper(pulsarAdmin: PulsarAdmin) + extends Logging { import scala.collection.JavaConverters._ def latestOffsetForTopic(topicPartition: String, diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index c95f8766..38f8b236 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -45,7 +45,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val mapped = pulsar.map(kv => kv._2.toInt + 1) - // Each Int adds 38 bytes to message size, so we expect 3 Ints in each message + // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, @@ -60,16 +60,63 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { ) } - test("latest") { + test("Only admit first entry of ledger") { val topic = newTopic() - sendMessages(topic, Array("-1")) + val messageIds = sendMessages(topic, Array("1", "2", "3")) + val firstMid = messageIds.head._2 + val firstLedger = getLedgerId(firstMid) + val firstEntry = getEntryId(firstMid) require(getLatestOffsets(Set(topic)).size === 1) Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => { val admissionControlHelper = new PulsarAdmissionControlHelper(admin) - val offset = admissionControlHelper.latestOffsetForTopic(topic, MessageId.earliest, 10) - logInfo(s"MESSAGE ID: [${getLedgerId(offset)}, ${getEntryId(offset)}]\n") - + val offset = admissionControlHelper.latestOffsetForTopic(topic, MessageId.earliest, 1) + assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) } } } + + test("Admit entry in the middle of the ledger") { + val topic = newTopic() + val messageIds = sendMessages(topic, Array("1", "2", "3")) + val firstMid = messageIds.head._2 + val secondMid = messageIds.apply(1)._2 + require(getLatestOffsets(Set(topic)).size === 1) + Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => + val admissionControlHelper = new PulsarAdmissionControlHelper(admin) + val offset = admissionControlHelper.latestOffsetForTopic(topic, firstMid, 1) + assert(getLedgerId(offset) == getLedgerId(secondMid) && getEntryId(offset) == getEntryId(secondMid)) + } + } + + test("Admission Control for multiple topics") { + val topic1 = newTopic() + val topic2 = newTopic() + + val pulsar = spark.readStream + .format("pulsar") + .option(TopicMulti, s"$topic1,$topic2") + .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) + .option(MaxBytesPerTrigger, 300) + .load() + .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = pulsar.map(kv => kv._2.toInt + 1) + + // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message + testStream(mapped)( + StartStream(trigger = ProcessingTime(1000)), + makeSureGetOffsetCalled, + AddPulsarData(Set(topic1), 1, 2, 3), + CheckLastBatch(2, 3, 4), + AddPulsarData(Set(topic2), 4, 5, 6, 7, 8, 9), + CheckLastBatch(8, 9, 10), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 9 + } + ) + } + } From b0a4450e8d2e2262a0c8fb867046e2ec941d07d2 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Fri, 18 Aug 2023 10:42:44 -0700 Subject: [PATCH 20/38] changing where pulsarAdmin is set --- .../spark/sql/pulsar/PulsarHelper.scala | 13 +++++++------ .../pulsar/PulsarAdmissionControlSuite.scala | 19 ++++++++----------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index a8ea106c..d5f58050 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -62,11 +62,10 @@ private[pulsar] case class PulsarHelper( private var topics: Seq[String] = _ private var topicPartitions: Seq[String] = _ - // We can do this because pulsarAdmin will only be called if latestOffset is called + // We can call adminUrl.get because admissionControlHelper will only be called if latestOffset is called // and there should be an exception thrown in PulsarProvider if maxBytes is set, - // and maxBytes is not set - private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl.get).build() - private lazy val admissionControlHelper = new PulsarAdmissionControlHelper(pulsarAdmin) + // and adminUrl is not set + private lazy val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl.get) override def close(): Unit = { // do nothing @@ -235,7 +234,7 @@ private[pulsar] case class PulsarHelper( } val newTopics = topicPartitions.toSet.diff(existingStartOffsets.keySet) val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition - => topicPartition -> MessageId.earliest) + => topicPartition -> MessageId.earliest) val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size @@ -515,9 +514,11 @@ private[pulsar] case class PulsarHelper( } } -class PulsarAdmissionControlHelper(pulsarAdmin: PulsarAdmin) +class PulsarAdmissionControlHelper(adminUrl: String) extends Logging { + private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build() + import scala.collection.JavaConverters._ def latestOffsetForTopic(topicPartition: String, startMessageId: MessageId, diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 38f8b236..fac21b4b 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -67,12 +67,10 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val firstLedger = getLedgerId(firstMid) val firstEntry = getEntryId(firstMid) require(getLatestOffsets(Set(topic)).size === 1) - Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => { - val admissionControlHelper = new PulsarAdmissionControlHelper(admin) - val offset = admissionControlHelper.latestOffsetForTopic(topic, MessageId.earliest, 1) - assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) - } - } + val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) + val offset = admissionControlHelper.latestOffsetForTopic(topic, MessageId.earliest, 1) + assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) + } test("Admit entry in the middle of the ledger") { @@ -81,11 +79,10 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val firstMid = messageIds.head._2 val secondMid = messageIds.apply(1)._2 require(getLatestOffsets(Set(topic)).size === 1) - Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => - val admissionControlHelper = new PulsarAdmissionControlHelper(admin) - val offset = admissionControlHelper.latestOffsetForTopic(topic, firstMid, 1) - assert(getLedgerId(offset) == getLedgerId(secondMid) && getEntryId(offset) == getEntryId(secondMid)) - } + val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) + val offset = admissionControlHelper.latestOffsetForTopic(topic, firstMid, 1) + assert(getLedgerId(offset) == getLedgerId(secondMid) && getEntryId(offset) == getEntryId(secondMid)) + } test("Admission Control for multiple topics") { From dbe65288e317bbdfeb5a8e3ff19f7733ec04b695 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Fri, 18 Aug 2023 12:17:50 -0700 Subject: [PATCH 21/38] test where we add a new topic partition after stream has started --- .../spark/sql/pulsar/PulsarHelper.scala | 10 ++++-- .../pulsar/PulsarAdmissionControlSuite.scala | 33 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index d5f58050..028507e5 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -224,7 +224,7 @@ private[pulsar] case class PulsarHelper( def latestOffsets(startingOffset: streaming.Offset, totalReadLimit: Long): SpecificPulsarOffset = { // implement helper inside PulsarHelper in order to use getTopicPartitions - val topicPartitions = getTopicPartitions + val topicPartitions = fetchLatestOffsets().topicOffsets.keySet // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit // start a reader, get to the earliest offset for new topic partitions val existingStartOffsets = if (startingOffset != null) { @@ -233,8 +233,13 @@ private[pulsar] case class PulsarHelper( Map[String, MessageId]() } val newTopics = topicPartitions.toSet.diff(existingStartOffsets.keySet) + logInfo(s"EXISTING TOPIC PARTITIONS: ${existingStartOffsets.keySet.mkString(",")}\n") + logInfo(s"ALL TOPIC PARTITIONS: ${topicPartitions.mkString(",")}\n") val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition - => topicPartition -> MessageId.earliest) + => { + logInfo(s"SETTING NEW TOPIC PARTITION: $topicPartition\n") + topicPartition -> MessageId.earliest + }) val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size @@ -523,6 +528,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) def latestOffsetForTopic(topicPartition: String, startMessageId: MessageId, readLimit: Long): MessageId = { + logInfo(s"TOPIC PARTITION: $topicPartition\n") val startLedgerId = getLedgerId(startMessageId) val startEntryId = getEntryId(startMessageId) val stats = pulsarAdmin.topics.getInternalStats(topicPartition) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index fac21b4b..4994fc60 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -116,4 +116,37 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { ) } + test("Add new topic after stream as started") { + val topic1 = newTopic() + sendMessages(topic1, Array("-1")) + require(getLatestOffsets(Set(topic1)).size === 1) + + val pulsar = spark.readStream + .format("pulsar") + .option(TopicMulti, topic1) + .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) + .option(MaxBytesPerTrigger, 300) + .load() + .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = pulsar.map(kv => kv._2.toInt + 1) + + val topic2 = newTopic() + // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message + testStream(mapped)( + StartStream(trigger = ProcessingTime(1000)), + makeSureGetOffsetCalled, + AddPulsarData(Set(topic1), 1, 2, 3), + CheckLastBatch(2, 3, 4), + AddPulsarData(Set(topic2), 4, 5, 6, 7, 8, 9), + CheckLastBatch(8, 9, 10), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 9 + } + ) + } + } From b6a114c8ccc69f79a6d2574f70d97235f717eecc Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Fri, 18 Aug 2023 12:18:50 -0700 Subject: [PATCH 22/38] fetchlatest -> gettopicpartitions --- src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 028507e5..19f96ac1 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -224,7 +224,7 @@ private[pulsar] case class PulsarHelper( def latestOffsets(startingOffset: streaming.Offset, totalReadLimit: Long): SpecificPulsarOffset = { // implement helper inside PulsarHelper in order to use getTopicPartitions - val topicPartitions = fetchLatestOffsets().topicOffsets.keySet + val topicPartitions = getTopicPartitions // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit // start a reader, get to the earliest offset for new topic partitions val existingStartOffsets = if (startingOffset != null) { From 2e69b5f9cf0cedf09f835d8e28c68f2fea87cdd3 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Sun, 20 Aug 2023 19:51:41 -0700 Subject: [PATCH 23/38] more tests --- .../spark/sql/pulsar/PulsarHelper.scala | 16 ++- .../spark/sql/pulsar/PulsarSource.scala | 6 +- .../pulsar/PulsarAdmissionControlSuite.scala | 127 ++++++++++++++---- .../spark/sql/pulsar/PulsarSourceTest.scala | 65 +++++++++ 4 files changed, 179 insertions(+), 35 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 19f96ac1..c30df09f 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -62,8 +62,9 @@ private[pulsar] case class PulsarHelper( private var topics: Seq[String] = _ private var topicPartitions: Seq[String] = _ - // We can call adminUrl.get because admissionControlHelper will only be called if latestOffset is called - // and there should be an exception thrown in PulsarProvider if maxBytes is set, + // We can call adminUrl.get because admissionControlHelper + // will only be called if latestOffset is called and there should + // be an exception thrown in PulsarProvider if maxBytes is set, // and adminUrl is not set private lazy val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl.get) @@ -236,7 +237,7 @@ private[pulsar] case class PulsarHelper( logInfo(s"EXISTING TOPIC PARTITIONS: ${existingStartOffsets.keySet.mkString(",")}\n") logInfo(s"ALL TOPIC PARTITIONS: ${topicPartitions.mkString(",")}\n") val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition - => { + => { logInfo(s"SETTING NEW TOPIC PARTITION: $topicPartition\n") topicPartition -> MessageId.earliest }) @@ -247,7 +248,8 @@ private[pulsar] case class PulsarHelper( startPartitionOffsets.keys.foreach { topicPartition => val readLimit = totalReadLimit / numPartitions val startMessageId = startPartitionOffsets.apply(topicPartition) - offsets += (topicPartition -> admissionControlHelper.latestOffsetForTopic( + offsets += (topicPartition -> + admissionControlHelper.latestOffsetForTopicPartition( topicPartition, startMessageId, readLimit)) } SpecificPulsarOffset(offsets.toMap) @@ -525,10 +527,10 @@ class PulsarAdmissionControlHelper(adminUrl: String) private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build() import scala.collection.JavaConverters._ - def latestOffsetForTopic(topicPartition: String, + + def latestOffsetForTopicPartition(topicPartition: String, startMessageId: MessageId, readLimit: Long): MessageId = { - logInfo(s"TOPIC PARTITION: $topicPartition\n") val startLedgerId = getLedgerId(startMessageId) val startEntryId = getEntryId(startMessageId) val stats = pulsarAdmin.topics.getInternalStats(topicPartition) @@ -561,7 +563,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) .getDefaultImplementation .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) } else if (readLimitLeft > 0) { - val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries) + val numEntriesToRead = Math.max(1, readLimitLeft / avgBytesPerEntries) val lastEntryId = if (ledger.ledgerId != startLedgerId) { numEntriesToRead - 1 } else { diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index b0d0c6b1..9b1b3020 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -81,7 +81,11 @@ private[pulsar] class PulsarSource( readLimit: ReadLimit): streaming.Offset = { initialTopicOffsets readLimit match { - case ReadMaxBytes(maxBytes) => pulsarHelper.latestOffsets(startingOffset, maxBytes) + case ReadMaxBytes(maxBytes) => + startingOffset match { + case null => pulsarHelper.latestOffsets(initialTopicOffsets, maxBytes) + case startingOffset => pulsarHelper.latestOffsets(startingOffset, maxBytes) + } case _: ReadAllAvailable => pulsarHelper.fetchLatestOffsets() } } diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 4994fc60..8823f383 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -20,19 +20,11 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { super.afterAll() } - /** - * Write unit test to create limits, can construct fake ledger statistics - * Can call latestOffset() directly from the unit test - * - * Just need to verify that each microbatch is <= maxBytesPerTrigger (within some threshold) - * Can send message of specific size in AddPulsarData here - */ - test("Check last batch where message size is greater than maxBytesPerTrigger") { val topic = newTopic() sendMessages(topic, Array("-1")) require(getLatestOffsets(Set(topic)).size === 1) - sparkContext.setLogLevel("INFO") + val pulsar = spark.readStream .format("pulsar") .option(TopicSingle, topic) @@ -68,7 +60,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val firstEntry = getEntryId(firstMid) require(getLatestOffsets(Set(topic)).size === 1) val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) - val offset = admissionControlHelper.latestOffsetForTopic(topic, MessageId.earliest, 1) + val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, MessageId.earliest, 1) assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) } @@ -80,7 +72,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val secondMid = messageIds.apply(1)._2 require(getLatestOffsets(Set(topic)).size === 1) val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) - val offset = admissionControlHelper.latestOffsetForTopic(topic, firstMid, 1) + val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, firstMid, 1) assert(getLedgerId(offset) == getLedgerId(secondMid) && getEntryId(offset) == getEntryId(secondMid)) } @@ -88,7 +80,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control for multiple topics") { val topic1 = newTopic() val topic2 = newTopic() - + val pulsar = spark.readStream .format("pulsar") .option(TopicMulti, s"$topic1,$topic2") @@ -116,37 +108,118 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { ) } - test("Add new topic after stream as started") { - val topic1 = newTopic() - sendMessages(topic1, Array("-1")) - require(getLatestOffsets(Set(topic1)).size === 1) - val pulsar = spark.readStream + test("Admission Control with one topic-partition") { + val topic = newTopic() + + + Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => + admin.topics().createPartitionedTopic(topic, 1) + require(getLatestOffsets(Set(topic)).size === 1) + } + + val reader = spark.readStream .format("pulsar") - .option(TopicMulti, topic1) .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) - .option(MaxBytesPerTrigger, 300) + .option(FailOnDataLossOptionKey, "false") + .option(MaxBytesPerTrigger, 150) + + val pulsar = reader + .option(TopicSingle, topic) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] + val mapped = pulsar.map(kv => kv._2.toInt) - val mapped = pulsar.map(kv => kv._2.toInt + 1) + testStream(mapped)( + StartStream(trigger = ProcessingTime(1000)), + makeSureGetOffsetCalled, + AddPulsarDataWithPartition(Set(topic), Some(0), 1, 2, 3, 4), + CheckLastBatch(4), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 4 + } + ) + } + + test("Admission Control with multiple topic-partitions") { + val topic = newTopic() + + + Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => + admin.topics().createPartitionedTopic(topic, 2) + require(getLatestOffsets(Set(topic)).size === 2) + } + + val reader = spark.readStream + .format("pulsar") + .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) + .option(FailOnDataLossOptionKey, "false") + .option(MaxBytesPerTrigger, 200) + + val pulsar = reader + .option(TopicSingle, topic) + .load() + .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = pulsar.map(kv => kv._2.toInt) - val topic2 = newTopic() - // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, - AddPulsarData(Set(topic1), 1, 2, 3), - CheckLastBatch(2, 3, 4), - AddPulsarData(Set(topic2), 4, 5, 6, 7, 8, 9), - CheckLastBatch(8, 9, 10), + AddPulsarDataWithPartition(Set(topic), Some(0), 1, 2, 3, 4), + CheckLastBatch(3, 4), + AddPulsarDataWithPartition(Set(topic), Some(1), 5, 6, 7, 8), + CheckLastBatch(7, 8), AssertOnQuery { query => val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 9 + recordsRead == 8 } ) } + test("Add topic-partition after starting stream") { + val topic = newTopic() + + + Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => + admin.topics().createPartitionedTopic(topic, 1) + require(getLatestOffsets(Set(topic)).size === 1) + } + + val reader = spark.readStream + .format("pulsar") + .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) + .option(FailOnDataLossOptionKey, "false") + .option(MaxBytesPerTrigger, 200) + + val pulsar = reader + .option(TopicSingle, topic) + .load() + .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = pulsar.map(kv => kv._2.toInt) + + testStream(mapped)( + StartStream(trigger = ProcessingTime(1000)), + makeSureGetOffsetCalled, + AddPulsarDataWithPartition(Set(topic), Some(0), 1, 2, 3, 4), + CheckLastBatch(1, 2, 3, 4), + ) + + addPartitions(topic, 2) + + testStream(mapped)( + AddPulsarDataWithPartition(Set(topic), Some(1), 5, 6, 7, 8), + CheckLastBatch(7, 8), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 4 + } + ) + } } diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala index 7eb1ea9d..eb14d93f 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala @@ -126,6 +126,71 @@ class PulsarSourceTest extends StreamTest with SharedSparkSession with PulsarTes s"AddPulsarData(topics = $topics, data = $data, message = $message)" } + /** + * Add data to Pulsar with partition specified + * + * `topicAction` can be used to run actions for each topic before inserting data. + */ + case class AddPulsarDataWithPartition( + topics: Set[String], + partition: Option[Int], + data: Int*)( + implicit ensureDataInMultiplePartition: Boolean = false, + concurrent: Boolean = false, + message: String = "", + topicAction: (String, Option[MessageId]) => Unit = (_, _) => {}) + extends AddData { + + override def addData(query: Option[StreamExecution]): (SparkDataStream, Offset) = { + query match { + // Make sure no Spark job is running when deleting a topic + case Some(m: MicroBatchExecution) => m.processAllAvailable() + case _ => + } + + val existingTopics = getAllTopicsSize().toMap + val newTopics = topics.diff(existingTopics.keySet) + for (newTopic <- newTopics) { + topicAction(newTopic, None) + } + for (existingTopicPartitions <- existingTopics) { + topicAction(existingTopicPartitions._1, Some(existingTopicPartitions._2)) + } + + require( + query.nonEmpty, + "Cannot add data when there is no query for finding the active pulsar source") + + val sources = query.get.logicalPlan.collect { + case StreamingExecutionRelation(source: PulsarSource, _) => source + case StreamingExecutionRelation(source: PulsarMicroBatchReader, _) => source + }.distinct + + if (sources.isEmpty) { + throw new Exception( + "Could not find Pulsar source in the StreamExecution logical plan to add data to") + } else if (sources.size > 1) { + throw new Exception( + "Could not select the Pulsar source in the StreamExecution logical plan as there" + + "are multiple Pulsar sources:\n\t" + sources.mkString("\n\t")) + } + val pulsarSource = sources.head + val topic = topics.toSeq(Random.nextInt(topics.size)) + + sendMessages(topic, data.map { + _.toString + }.toArray, partition) + val sizes = getLatestOffsets(topics).toSeq + val offset = SpecificPulsarOffset(sizes: _*) + logInfo(s"Added data, expected offset $offset") + (pulsarSource, offset) + } + + override def toString: String = + s"AddPulsarDataWithPartition(topics = $topics, partition = $partition, " + + s"data = $data, message = $message)" + } + /** * Add data to Pulsar. * From 4d00f88d5f5092e594b3b6d8a4d48b49e1a406b7 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 08:28:01 -0700 Subject: [PATCH 24/38] changing AddPulsarDataWithPartition from Set(topic) -> topic --- .../org/apache/spark/sql/pulsar/PulsarHelper.scala | 8 +++----- .../org/apache/spark/sql/pulsar/PulsarSource.scala | 5 ++++- .../spark/sql/pulsar/PulsarAdmissionControlSuite.scala | 10 +++++----- .../org/apache/spark/sql/pulsar/PulsarSourceTest.scala | 3 ++- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index c30df09f..68a6ee8b 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -234,19 +234,16 @@ private[pulsar] case class PulsarHelper( Map[String, MessageId]() } val newTopics = topicPartitions.toSet.diff(existingStartOffsets.keySet) - logInfo(s"EXISTING TOPIC PARTITIONS: ${existingStartOffsets.keySet.mkString(",")}\n") - logInfo(s"ALL TOPIC PARTITIONS: ${topicPartitions.mkString(",")}\n") val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition => { - logInfo(s"SETTING NEW TOPIC PARTITION: $topicPartition\n") topicPartition -> MessageId.earliest }) val offsets = mutable.Map[String, MessageId]() offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size // move all topic partition logic to helper function + val readLimit = totalReadLimit / numPartitions startPartitionOffsets.keys.foreach { topicPartition => - val readLimit = totalReadLimit / numPartitions val startMessageId = startPartitionOffsets.apply(topicPartition) offsets += (topicPartition -> admissionControlHelper.latestOffsetForTopicPartition( @@ -546,6 +543,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) var messageId = startMessageId var readLimitLeft = readLimit ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => + assert(readLimitLeft >= 0) if (readLimitLeft == 0) { return messageId } @@ -562,7 +560,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) messageId = DefaultImplementation .getDefaultImplementation .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) - } else if (readLimitLeft > 0) { + } else { val numEntriesToRead = Math.max(1, readLimitLeft / avgBytesPerEntries) val lastEntryId = if (ledger.ledgerId != startLedgerId) { numEntriesToRead - 1 diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala index 9b1b3020..8405e653 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala @@ -83,6 +83,9 @@ private[pulsar] class PulsarSource( readLimit match { case ReadMaxBytes(maxBytes) => startingOffset match { + // deals with the case where we add a topic-partition after + // the stream has started, since adding a new topic-partition + // sets startingOffset to null case null => pulsarHelper.latestOffsets(initialTopicOffsets, maxBytes) case startingOffset => pulsarHelper.latestOffsets(startingOffset, maxBytes) } @@ -93,7 +96,7 @@ private[pulsar] class PulsarSource( if (maxBytesPerTrigger == 0L) { ReadLimit.allAvailable() } else { - ReadMaxBytes.apply(maxBytesPerTrigger) + ReadMaxBytes(maxBytesPerTrigger) } } diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 8823f383..0630e389 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -135,7 +135,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, - AddPulsarDataWithPartition(Set(topic), Some(0), 1, 2, 3, 4), + AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), CheckLastBatch(4), AssertOnQuery { query => val recordsRead = query.recentProgress.map(_.numInputRows).sum @@ -170,9 +170,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, - AddPulsarDataWithPartition(Set(topic), Some(0), 1, 2, 3, 4), + AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), CheckLastBatch(3, 4), - AddPulsarDataWithPartition(Set(topic), Some(1), 5, 6, 7, 8), + AddPulsarDataWithPartition(topic, Some(1), 5, 6, 7, 8), CheckLastBatch(7, 8), AssertOnQuery { query => val recordsRead = query.recentProgress.map(_.numInputRows).sum @@ -207,14 +207,14 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, - AddPulsarDataWithPartition(Set(topic), Some(0), 1, 2, 3, 4), + AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), CheckLastBatch(1, 2, 3, 4), ) addPartitions(topic, 2) testStream(mapped)( - AddPulsarDataWithPartition(Set(topic), Some(1), 5, 6, 7, 8), + AddPulsarDataWithPartition(topic, Some(1), 5, 6, 7, 8), CheckLastBatch(7, 8), AssertOnQuery { query => val recordsRead = query.recentProgress.map(_.numInputRows).sum diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala index eb14d93f..166da619 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala @@ -132,7 +132,7 @@ class PulsarSourceTest extends StreamTest with SharedSparkSession with PulsarTes * `topicAction` can be used to run actions for each topic before inserting data. */ case class AddPulsarDataWithPartition( - topics: Set[String], + topic: String, partition: Option[Int], data: Int*)( implicit ensureDataInMultiplePartition: Boolean = false, @@ -141,6 +141,7 @@ class PulsarSourceTest extends StreamTest with SharedSparkSession with PulsarTes topicAction: (String, Option[MessageId]) => Unit = (_, _) => {}) extends AddData { + val topics = Set(topic) override def addData(query: Option[StreamExecution]): (SparkDataStream, Offset) = { query match { // Make sure no Spark job is running when deleting a topic From 93694246c6ff315a92b3dc2835c780406ca0228e Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 09:31:51 -0700 Subject: [PATCH 25/38] adding test case concurrent topic writes --- .../pulsar/PulsarAdmissionControlSuite.scala | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 0630e389..58f4cbc4 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -108,6 +108,36 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { ) } + test("Admission Control for concurrent topic writes") { + val topic1 = newTopic() + val topic2 = newTopic() + + val pulsar = spark.readStream + .format("pulsar") + .option(TopicMulti, s"$topic1,$topic2") + .option(ServiceUrlOptionKey, serviceUrl) + .option(AdminUrlOptionKey, adminUrl) + .option(MaxBytesPerTrigger, 300) + .load() + .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = pulsar.map(kv => kv._2.toInt + 1) + + // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message + testStream(mapped)( + StartStream(trigger = ProcessingTime(1000)), + makeSureGetOffsetCalled, + AddPulsarData(Set(topic1, topic2), 1, 2, 3), + CheckLastBatch(2, 3, 4), + AddPulsarData(Set(topic1, topic2), 4, 5, 6, 7, 8, 9), + CheckLastBatch(8, 9, 10), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 9 + } + ) + } test("Admission Control with one topic-partition") { val topic = newTopic() From 3103b661067e9618b6d1327af45a1e8135a8e0a7 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 10:11:01 -0700 Subject: [PATCH 26/38] changing getAdminUrl and reducing offsets, startpartitionoffsets redundancy --- .../scala/org/apache/spark/sql/pulsar/PulsarHelper.scala | 1 - .../scala/org/apache/spark/sql/pulsar/PulsarProvider.scala | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 68a6ee8b..327b6479 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -239,7 +239,6 @@ private[pulsar] case class PulsarHelper( topicPartition -> MessageId.earliest }) val offsets = mutable.Map[String, MessageId]() - offsets ++= startPartitionOffsets val numPartitions = startPartitionOffsets.size // move all topic partition logic to helper function val readLimit = totalReadLimit / numPartitions diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala index 945b0644..9a699382 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala @@ -377,11 +377,7 @@ private[pulsar] object PulsarProvider extends Logging { } private def getAdminUrl(parameters: Map[String, String]): Option[String] = { - val adminUrl = parameters.getOrElse(AdminUrlOptionKey, "") - adminUrl match { - case "" => None - case s => Option(s) - } + parameters.get(AdminUrlOptionKey) } private def getAllowDifferentTopicSchemas(parameters: Map[String, String]): Boolean = { From 4712cf4c51e35e66fdc8e042871c0653b4787663 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 11:09:02 -0700 Subject: [PATCH 27/38] setting partition index in messageId correctly --- .../spark/sql/pulsar/PulsarHelper.scala | 9 ++++++-- .../spark/sql/pulsar/PulsarOptions.scala | 2 ++ .../pulsar/PulsarAdmissionControlSuite.scala | 22 ++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 327b6479..580e0837 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -539,6 +539,11 @@ class PulsarAdmissionControlHelper(adminUrl: String) ledgers.last.size = stats.currentLedgerSize ledgers.last.entries = stats.currentLedgerEntries } + val partitionIndex = if(topicPartition.contains(PartitionSuffix)) { + topicPartition.split(PartitionSuffix)(1).toInt + } else { + -1 + } var messageId = startMessageId var readLimitLeft = readLimit ledgers.filter(_.entries != 0).sortBy(_.ledgerId).foreach { ledger => @@ -558,7 +563,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) readLimitLeft -= bytesLeftInLedger messageId = DefaultImplementation .getDefaultImplementation - .newMessageId(ledger.ledgerId, ledger.entries - 1, -1) + .newMessageId(ledger.ledgerId, ledger.entries - 1, partitionIndex) } else { val numEntriesToRead = Math.max(1, readLimitLeft / avgBytesPerEntries) val lastEntryId = if (ledger.ledgerId != startLedgerId) { @@ -569,7 +574,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) val lastEntryRead = Math.min(ledger.entries - 1, lastEntryId) messageId = DefaultImplementation .getDefaultImplementation - .newMessageId(ledger.ledgerId, lastEntryRead, -1) + .newMessageId(ledger.ledgerId, lastEntryRead, partitionIndex) readLimitLeft = 0 } } diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala index d9ec02f0..1e3ca711 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala @@ -37,6 +37,8 @@ private[pulsar] object PulsarOptions { val ServiceUrlOptionKey: String = "service.url" val AdminUrlOptionKey: String = "admin.url" + val MaxEntriesPerLedger: String = + "admin.managedLedgerMaxEntriesPerLedger".toLowerCase(Locale.ROOT) val StartingOffsetsOptionKey: String = "startingOffsets".toLowerCase(Locale.ROOT) val StartingTime: String = "startingTime".toLowerCase(Locale.ROOT) val EndingTime: String = "endingTime".toLowerCase(Locale.ROOT) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 58f4cbc4..f5334e74 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -24,12 +24,13 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val topic = newTopic() sendMessages(topic, Array("-1")) require(getLatestOffsets(Set(topic)).size === 1) - + sparkContext.setLogLevel("INFO") val pulsar = spark.readStream .format("pulsar") .option(TopicSingle, topic) .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) + .option(FailOnDataLossOptionKey, "true") .option(MaxBytesPerTrigger, 150) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") @@ -80,12 +81,13 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control for multiple topics") { val topic1 = newTopic() val topic2 = newTopic() - + sparkContext.setLogLevel("INFO") val pulsar = spark.readStream .format("pulsar") .option(TopicMulti, s"$topic1,$topic2") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) + .option(FailOnDataLossOptionKey, "true") .option(MaxBytesPerTrigger, 300) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") @@ -111,12 +113,13 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control for concurrent topic writes") { val topic1 = newTopic() val topic2 = newTopic() - + sparkContext.setLogLevel("INFO") val pulsar = spark.readStream .format("pulsar") .option(TopicMulti, s"$topic1,$topic2") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) + .option(FailOnDataLossOptionKey, "true") .option(MaxBytesPerTrigger, 300) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") @@ -147,12 +150,12 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { admin.topics().createPartitionedTopic(topic, 1) require(getLatestOffsets(Set(topic)).size === 1) } - + sparkContext.setLogLevel("INFO") val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) - .option(FailOnDataLossOptionKey, "false") + .option(FailOnDataLossOptionKey, "true") .option(MaxBytesPerTrigger, 150) val pulsar = reader @@ -182,12 +185,12 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { admin.topics().createPartitionedTopic(topic, 2) require(getLatestOffsets(Set(topic)).size === 2) } - + sparkContext.setLogLevel("INFO") val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) - .option(FailOnDataLossOptionKey, "false") + .option(FailOnDataLossOptionKey, "true") .option(MaxBytesPerTrigger, 200) val pulsar = reader @@ -214,17 +217,16 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Add topic-partition after starting stream") { val topic = newTopic() - Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => admin.topics().createPartitionedTopic(topic, 1) require(getLatestOffsets(Set(topic)).size === 1) } - + sparkContext.setLogLevel("INFO") val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) - .option(FailOnDataLossOptionKey, "false") + .option(FailOnDataLossOptionKey, "true") .option(MaxBytesPerTrigger, 200) val pulsar = reader From a3acee5dffd266905fbe65ba4219814e279d7b87 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 11:10:23 -0700 Subject: [PATCH 28/38] removing info logs --- .../sql/pulsar/PulsarAdmissionControlSuite.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index f5334e74..8361a5a1 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -24,7 +24,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val topic = newTopic() sendMessages(topic, Array("-1")) require(getLatestOffsets(Set(topic)).size === 1) - sparkContext.setLogLevel("INFO") + val pulsar = spark.readStream .format("pulsar") .option(TopicSingle, topic) @@ -81,7 +81,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control for multiple topics") { val topic1 = newTopic() val topic2 = newTopic() - sparkContext.setLogLevel("INFO") + val pulsar = spark.readStream .format("pulsar") .option(TopicMulti, s"$topic1,$topic2") @@ -113,7 +113,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control for concurrent topic writes") { val topic1 = newTopic() val topic2 = newTopic() - sparkContext.setLogLevel("INFO") + val pulsar = spark.readStream .format("pulsar") .option(TopicMulti, s"$topic1,$topic2") @@ -150,7 +150,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { admin.topics().createPartitionedTopic(topic, 1) require(getLatestOffsets(Set(topic)).size === 1) } - sparkContext.setLogLevel("INFO") + val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) @@ -185,7 +185,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { admin.topics().createPartitionedTopic(topic, 2) require(getLatestOffsets(Set(topic)).size === 2) } - sparkContext.setLogLevel("INFO") + val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) @@ -221,7 +221,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { admin.topics().createPartitionedTopic(topic, 1) require(getLatestOffsets(Set(topic)).size === 1) } - sparkContext.setLogLevel("INFO") + val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) From bf3dfe652d91d086b9999008492f8a59fa53944c Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 11:19:32 -0700 Subject: [PATCH 29/38] removing maxEntriesPerLedger option --- src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala index 1e3ca711..d9ec02f0 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarOptions.scala @@ -37,8 +37,6 @@ private[pulsar] object PulsarOptions { val ServiceUrlOptionKey: String = "service.url" val AdminUrlOptionKey: String = "admin.url" - val MaxEntriesPerLedger: String = - "admin.managedLedgerMaxEntriesPerLedger".toLowerCase(Locale.ROOT) val StartingOffsetsOptionKey: String = "startingOffsets".toLowerCase(Locale.ROOT) val StartingTime: String = "startingTime".toLowerCase(Locale.ROOT) val EndingTime: String = "endingTime".toLowerCase(Locale.ROOT) From 12deb6bfef2eb7007f3ed6c9c575d4b0ea339b0b Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 11:51:24 -0700 Subject: [PATCH 30/38] adding maxEntriesPerLedger in test --- .../scala/org/apache/spark/sql/pulsar/PulsarHelper.scala | 2 +- .../spark/sql/pulsar/PulsarAdmissionControlSuite.scala | 2 ++ src/test/scala/org/apache/spark/sql/pulsar/PulsarTest.scala | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 580e0837..fe8bdc50 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -539,7 +539,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) ledgers.last.size = stats.currentLedgerSize ledgers.last.entries = stats.currentLedgerEntries } - val partitionIndex = if(topicPartition.contains(PartitionSuffix)) { + val partitionIndex = if (topicPartition.contains(PartitionSuffix)) { topicPartition.split(PartitionSuffix)(1).toInt } else { -1 diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 8361a5a1..ea5d18cb 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -12,7 +12,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { import PulsarOptions._ import testImplicits._ + private val maxEntriesPerLedger = "managedLedgerMaxEntriesPerLedger" override def beforeAll(): Unit = { + brokerConfigs.put(maxEntriesPerLedger, "1") super.beforeAll() } diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarTest.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarTest.scala index 5b034b6b..827aebd9 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarTest.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarTest.scala @@ -20,6 +20,7 @@ import java.time.{Clock, Duration} import java.util.{Map => JMap} import scala.collection.JavaConverters._ +import scala.collection.mutable import scala.reflect.ClassTag import org.scalatest.concurrent.Eventually.{eventually, timeout} @@ -54,11 +55,15 @@ trait PulsarTest extends BeforeAndAfterAll with BeforeAndAfterEach { var serviceUrl: String = null var adminUrl: String = null + val brokerConfigs = mutable.Map[String, String]() private val logger: Logger = LoggerFactory.getLogger("pulsar-spark-test-logger") override def beforeAll(): Unit = { pulsarContainer = new PulsarContainer(parse("apachepulsar/pulsar:" + CURRENT_VERSION)) pulsarContainer.withStartupTimeout(Duration.ofMinutes(5)) + brokerConfigs.foreach( kv => + pulsarContainer.withEnv("PULSAR_PREFIX_" + kv._1, kv._2) + ) pulsarContainer.start() @@ -80,6 +85,7 @@ trait PulsarTest extends BeforeAndAfterAll with BeforeAndAfterEach { if (pulsarContainer != null) { pulsarContainer.stop() pulsarContainer.close() + brokerConfigs.clear() } } From b922024c351a20b88b6af13506e04d3cb2a96e65 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 15:23:03 -0700 Subject: [PATCH 31/38] maxEntriesPerLedger works --- .../pulsar/PulsarAdmissionControlSuite.scala | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index ea5d18cb..481698f2 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -13,8 +13,12 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { import testImplicits._ private val maxEntriesPerLedger = "managedLedgerMaxEntriesPerLedger" + private val ledgerRolloverTime = "managedLedgerMinLedgerRolloverTimeMinutes" + private val sizeOfInt = 49 + override def beforeAll(): Unit = { - brokerConfigs.put(maxEntriesPerLedger, "1") + brokerConfigs.put(maxEntriesPerLedger, "3") + brokerConfigs.put(ledgerRolloverTime, "0") super.beforeAll() } @@ -26,21 +30,21 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val topic = newTopic() sendMessages(topic, Array("-1")) require(getLatestOffsets(Set(topic)).size === 1) - + val pulsar = spark.readStream .format("pulsar") .option(TopicSingle, topic) .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, 150) + .option(MaxBytesPerTrigger, sizeOfInt * 3) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] val mapped = pulsar.map(kv => kv._2.toInt + 1) - // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message + testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, @@ -83,21 +87,21 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control for multiple topics") { val topic1 = newTopic() val topic2 = newTopic() - + val pulsar = spark.readStream .format("pulsar") .option(TopicMulti, s"$topic1,$topic2") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, 300) + .option(MaxBytesPerTrigger, sizeOfInt * 6) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] val mapped = pulsar.map(kv => kv._2.toInt + 1) - // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message + testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, @@ -115,21 +119,21 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control for concurrent topic writes") { val topic1 = newTopic() val topic2 = newTopic() - + val pulsar = spark.readStream .format("pulsar") .option(TopicMulti, s"$topic1,$topic2") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, 300) + .option(MaxBytesPerTrigger, sizeOfInt * 6) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] val mapped = pulsar.map(kv => kv._2.toInt + 1) - // Each Int adds 49 bytes to message size, so we expect 3 Ints in each message + testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, @@ -147,18 +151,18 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control with one topic-partition") { val topic = newTopic() - + Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => admin.topics().createPartitionedTopic(topic, 1) require(getLatestOffsets(Set(topic)).size === 1) } - + val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, 150) + .option(MaxBytesPerTrigger, sizeOfInt * 3) val pulsar = reader .option(TopicSingle, topic) @@ -182,18 +186,17 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control with multiple topic-partitions") { val topic = newTopic() - Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => admin.topics().createPartitionedTopic(topic, 2) require(getLatestOffsets(Set(topic)).size === 2) } - + val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, 200) + .option(MaxBytesPerTrigger, sizeOfInt * 4) val pulsar = reader .option(TopicSingle, topic) @@ -223,13 +226,13 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { admin.topics().createPartitionedTopic(topic, 1) require(getLatestOffsets(Set(topic)).size === 1) } - + val reader = spark.readStream .format("pulsar") .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, 200) + .option(MaxBytesPerTrigger, sizeOfInt * 4) val pulsar = reader .option(TopicSingle, topic) From eeb0b456eb723c8924e1c2f25c4a2786781f7fb9 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 15:23:51 -0700 Subject: [PATCH 32/38] spacing --- .../spark/sql/pulsar/PulsarAdmissionControlSuite.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index 481698f2..a146ab91 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -43,7 +43,6 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .as[(String, String)] val mapped = pulsar.map(kv => kv._2.toInt + 1) - testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), @@ -69,7 +68,6 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, MessageId.earliest, 1) assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) - } test("Admit entry in the middle of the ledger") { @@ -101,7 +99,6 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val mapped = pulsar.map(kv => kv._2.toInt + 1) - testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, @@ -133,7 +130,6 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val mapped = pulsar.map(kv => kv._2.toInt + 1) - testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, @@ -151,7 +147,6 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { test("Admission Control with one topic-partition") { val topic = newTopic() - Utils.tryWithResource(PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()) { admin => admin.topics().createPartitionedTopic(topic, 1) require(getLatestOffsets(Set(topic)).size === 1) From c115f6f67330d264b03c50a09245b707ac17e5dd Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 16:05:34 -0700 Subject: [PATCH 33/38] checking numInputRows per microbatch in query --- .../spark/sql/pulsar/PulsarHelper.scala | 2 +- .../pulsar/PulsarAdmissionControlSuite.scala | 88 ++++++++++--------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index fe8bdc50..2b150179 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -564,7 +564,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) messageId = DefaultImplementation .getDefaultImplementation .newMessageId(ledger.ledgerId, ledger.entries - 1, partitionIndex) - } else { + } else if (readLimitLeft >= avgBytesPerEntries) { val numEntriesToRead = Math.max(1, readLimitLeft / avgBytesPerEntries) val lastEntryId = if (ledger.ledgerId != startLedgerId) { numEntriesToRead - 1 diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index a146ab91..a47b7dde 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -14,7 +14,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { private val maxEntriesPerLedger = "managedLedgerMaxEntriesPerLedger" private val ledgerRolloverTime = "managedLedgerMinLedgerRolloverTimeMinutes" - private val sizeOfInt = 49 + private val sizeOfInt = 50 override def beforeAll(): Unit = { brokerConfigs.put(maxEntriesPerLedger, "3") @@ -26,6 +26,30 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { super.afterAll() } + test("Only admit first entry of ledger") { + val topic = newTopic() + val messageIds = sendMessages(topic, Array("1", "2", "3")) + val firstMid = messageIds.head._2 + val firstLedger = getLedgerId(firstMid) + val firstEntry = getEntryId(firstMid) + require(getLatestOffsets(Set(topic)).size === 1) + val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) + val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, MessageId.earliest, sizeOfInt) + assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) + } + + test("Admit entry in the middle of the ledger") { + val topic = newTopic() + val messageIds = sendMessages(topic, Array("1", "2", "3")) + val firstMid = messageIds.head._2 + val secondMid = messageIds.apply(1)._2 + require(getLatestOffsets(Set(topic)).size === 1) + val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) + val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, firstMid, sizeOfInt) + assert(getLedgerId(offset) == getLedgerId(secondMid) && getEntryId(offset) == getEntryId(secondMid)) + + } + test("Check last batch where message size is greater than maxBytesPerTrigger") { val topic = newTopic() sendMessages(topic, Array("-1")) @@ -43,7 +67,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .as[(String, String)] val mapped = pulsar.map(kv => kv._2.toInt + 1) - + testStream(mapped)( StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, @@ -52,36 +76,13 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { AddPulsarData(Set(topic), 4, 5, 6, 7, 8, 9), CheckLastBatch(8, 9, 10), AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 9 + query.recentProgress.map(microBatch => + microBatch.numInputRows == 0 || microBatch.numInputRows == 3 + ).forall(_ == true) } ) } - test("Only admit first entry of ledger") { - val topic = newTopic() - val messageIds = sendMessages(topic, Array("1", "2", "3")) - val firstMid = messageIds.head._2 - val firstLedger = getLedgerId(firstMid) - val firstEntry = getEntryId(firstMid) - require(getLatestOffsets(Set(topic)).size === 1) - val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) - val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, MessageId.earliest, 1) - assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) - } - - test("Admit entry in the middle of the ledger") { - val topic = newTopic() - val messageIds = sendMessages(topic, Array("1", "2", "3")) - val firstMid = messageIds.head._2 - val secondMid = messageIds.apply(1)._2 - require(getLatestOffsets(Set(topic)).size === 1) - val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) - val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, firstMid, 1) - assert(getLedgerId(offset) == getLedgerId(secondMid) && getEntryId(offset) == getEntryId(secondMid)) - - } - test("Admission Control for multiple topics") { val topic1 = newTopic() val topic2 = newTopic() @@ -107,8 +108,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { AddPulsarData(Set(topic2), 4, 5, 6, 7, 8, 9), CheckLastBatch(8, 9, 10), AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 9 + query.recentProgress.map(microBatch => + microBatch.numInputRows == 0 || microBatch.numInputRows == 3 + ).forall(_ == true) } ) } @@ -138,8 +140,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { AddPulsarData(Set(topic1, topic2), 4, 5, 6, 7, 8, 9), CheckLastBatch(8, 9, 10), AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 9 + query.recentProgress.map(microBatch => + microBatch.numInputRows == 0 || microBatch.numInputRows == 3 + ).forall(_ == true) } ) } @@ -170,11 +173,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), - CheckLastBatch(4), - AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 4 - } + CheckLastBatch(4) ) } @@ -208,8 +207,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { AddPulsarDataWithPartition(topic, Some(1), 5, 6, 7, 8), CheckLastBatch(7, 8), AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 8 + query.recentProgress.map( microBatch => + microBatch.numInputRows == 0 || microBatch.numInputRows == 2 + ).forall(_ == true) } ) } @@ -241,6 +241,11 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { makeSureGetOffsetCalled, AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), CheckLastBatch(1, 2, 3, 4), + AssertOnQuery { query => + query.recentProgress.map(microBatch => + microBatch.numInputRows == 0 || microBatch.numInputRows == 4 + ).forall(_ == true) + } ) addPartitions(topic, 2) @@ -249,8 +254,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { AddPulsarDataWithPartition(topic, Some(1), 5, 6, 7, 8), CheckLastBatch(7, 8), AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 4 + query.recentProgress.map(microBatch => + microBatch.numInputRows == 0 || microBatch.numInputRows == 2 + ).forall(_ == true) } ) } From 8c4a7e245f30cb4cd5ffd4d5f8f417505bab17c9 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Mon, 21 Aug 2023 16:31:16 -0700 Subject: [PATCH 34/38] removing exact checklastbatch --- .../spark/sql/pulsar/PulsarHelper.scala | 2 +- .../pulsar/PulsarAdmissionControlSuite.scala | 48 ++++++++----------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala index 2b150179..fe8bdc50 100644 --- a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala +++ b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala @@ -564,7 +564,7 @@ class PulsarAdmissionControlHelper(adminUrl: String) messageId = DefaultImplementation .getDefaultImplementation .newMessageId(ledger.ledgerId, ledger.entries - 1, partitionIndex) - } else if (readLimitLeft >= avgBytesPerEntries) { + } else { val numEntriesToRead = Math.max(1, readLimitLeft / avgBytesPerEntries) val lastEntryId = if (ledger.ledgerId != startLedgerId) { numEntriesToRead - 1 diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala index a47b7dde..6dc416f0 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarAdmissionControlSuite.scala @@ -14,7 +14,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { private val maxEntriesPerLedger = "managedLedgerMaxEntriesPerLedger" private val ledgerRolloverTime = "managedLedgerMinLedgerRolloverTimeMinutes" - private val sizeOfInt = 50 + private val approxSizeOfInt = 50 override def beforeAll(): Unit = { brokerConfigs.put(maxEntriesPerLedger, "3") @@ -34,7 +34,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val firstEntry = getEntryId(firstMid) require(getLatestOffsets(Set(topic)).size === 1) val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) - val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, MessageId.earliest, sizeOfInt) + val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, MessageId.earliest, approxSizeOfInt) assert(getLedgerId(offset) == firstLedger && getEntryId(offset) == firstEntry) } @@ -45,7 +45,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { val secondMid = messageIds.apply(1)._2 require(getLatestOffsets(Set(topic)).size === 1) val admissionControlHelper = new PulsarAdmissionControlHelper(adminUrl) - val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, firstMid, sizeOfInt) + val offset = admissionControlHelper.latestOffsetForTopicPartition(topic, firstMid, approxSizeOfInt) assert(getLedgerId(offset) == getLedgerId(secondMid) && getEntryId(offset) == getEntryId(secondMid)) } @@ -61,7 +61,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, sizeOfInt * 3) + .option(MaxBytesPerTrigger, approxSizeOfInt * 3) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] @@ -72,12 +72,10 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarData(Set(topic), 1, 2, 3), - CheckLastBatch(2, 3, 4), AddPulsarData(Set(topic), 4, 5, 6, 7, 8, 9), - CheckLastBatch(8, 9, 10), AssertOnQuery { query => query.recentProgress.map(microBatch => - microBatch.numInputRows == 0 || microBatch.numInputRows == 3 + microBatch.numInputRows <= 4 ).forall(_ == true) } ) @@ -93,7 +91,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, sizeOfInt * 6) + .option(MaxBytesPerTrigger, approxSizeOfInt * 6) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] @@ -104,12 +102,10 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarData(Set(topic1), 1, 2, 3), - CheckLastBatch(2, 3, 4), AddPulsarData(Set(topic2), 4, 5, 6, 7, 8, 9), - CheckLastBatch(8, 9, 10), AssertOnQuery { query => query.recentProgress.map(microBatch => - microBatch.numInputRows == 0 || microBatch.numInputRows == 3 + microBatch.numInputRows <= 4 ).forall(_ == true) } ) @@ -125,7 +121,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, sizeOfInt * 6) + .option(MaxBytesPerTrigger, approxSizeOfInt * 6) .load() .selectExpr("CAST(__key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] @@ -136,12 +132,10 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarData(Set(topic1, topic2), 1, 2, 3), - CheckLastBatch(2, 3, 4), AddPulsarData(Set(topic1, topic2), 4, 5, 6, 7, 8, 9), - CheckLastBatch(8, 9, 10), AssertOnQuery { query => query.recentProgress.map(microBatch => - microBatch.numInputRows == 0 || microBatch.numInputRows == 3 + microBatch.numInputRows <= 4 ).forall(_ == true) } ) @@ -160,7 +154,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, sizeOfInt * 3) + .option(MaxBytesPerTrigger, approxSizeOfInt * 3) val pulsar = reader .option(TopicSingle, topic) @@ -173,7 +167,11 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), - CheckLastBatch(4) + AssertOnQuery { query => + query.recentProgress.map(microBatch => + microBatch.numInputRows <= 4 + ).forall(_ == true) + } ) } @@ -190,7 +188,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, sizeOfInt * 4) + .option(MaxBytesPerTrigger, approxSizeOfInt * 4) val pulsar = reader .option(TopicSingle, topic) @@ -203,12 +201,10 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), - CheckLastBatch(3, 4), AddPulsarDataWithPartition(topic, Some(1), 5, 6, 7, 8), - CheckLastBatch(7, 8), AssertOnQuery { query => - query.recentProgress.map( microBatch => - microBatch.numInputRows == 0 || microBatch.numInputRows == 2 + query.recentProgress.map(microBatch => + microBatch.numInputRows <= 3 ).forall(_ == true) } ) @@ -227,7 +223,7 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { .option(ServiceUrlOptionKey, serviceUrl) .option(AdminUrlOptionKey, adminUrl) .option(FailOnDataLossOptionKey, "true") - .option(MaxBytesPerTrigger, sizeOfInt * 4) + .option(MaxBytesPerTrigger, approxSizeOfInt * 4) val pulsar = reader .option(TopicSingle, topic) @@ -240,10 +236,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { StartStream(trigger = ProcessingTime(1000)), makeSureGetOffsetCalled, AddPulsarDataWithPartition(topic, Some(0), 1, 2, 3, 4), - CheckLastBatch(1, 2, 3, 4), AssertOnQuery { query => query.recentProgress.map(microBatch => - microBatch.numInputRows == 0 || microBatch.numInputRows == 4 + microBatch.numInputRows <= 4 ).forall(_ == true) } ) @@ -252,10 +247,9 @@ class PulsarAdmissionControlSuite extends PulsarSourceTest { testStream(mapped)( AddPulsarDataWithPartition(topic, Some(1), 5, 6, 7, 8), - CheckLastBatch(7, 8), AssertOnQuery { query => query.recentProgress.map(microBatch => - microBatch.numInputRows == 0 || microBatch.numInputRows == 2 + microBatch.numInputRows <= 3 ).forall(_ == true) } ) From 6e7b93ca91f01125e3de8ebc96e3de594ef7ce5f Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Thu, 24 Aug 2023 09:29:37 -0700 Subject: [PATCH 35/38] updating README --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 55ec8e75..8dc7322d 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,16 @@ A possible solution to remove duplicates when reading the written data could be The Pulsar `serviceHttpUrl` configuration. + + `maxBytesPerTrigger` + A number string in unit of number of bytes + No + None + Streaming and Batch + A soft limit of the maximum number of bytes we want to process per microbatch. If this is specified, `admin.url` also needs to be specified. + + + `predefinedSubscription` A Subscription name string From c69d34baea18e12a05f755972b185837cc442961 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Thu, 24 Aug 2023 09:31:31 -0700 Subject: [PATCH 36/38] updating readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 8dc7322d..856c4fac 100644 --- a/README.md +++ b/README.md @@ -229,14 +229,13 @@ A possible solution to remove duplicates when reading the written data could be `maxBytesPerTrigger` - A number string in unit of number of bytes + A long value in unit of number of bytes No None Streaming and Batch A soft limit of the maximum number of bytes we want to process per microbatch. If this is specified, `admin.url` also needs to be specified. - `predefinedSubscription` A Subscription name string From 305fb2cae13346c224406b322b8815294eb52746 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Thu, 24 Aug 2023 09:40:47 -0700 Subject: [PATCH 37/38] changing admin.url in readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 856c4fac..a60f5fb0 100644 --- a/README.md +++ b/README.md @@ -219,12 +219,12 @@ A possible solution to remove duplicates when reading the written data could be - `admin.url` (Deprecated) + `admin.url` A service HTTP URL of your Pulsar cluster No None Streaming and Batch - The Pulsar `serviceHttpUrl` configuration. + The Pulsar `serviceHttpUrl` configuration. Only needed when `maxBytesPerTrigger` is specified From 3fdc0b7b75d280a289f92b82054c7acc3712a567 Mon Sep 17 00:00:00 2001 From: Eric Marnadi Date: Thu, 24 Aug 2023 11:28:19 -0700 Subject: [PATCH 38/38] build errors --- .../scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala index 166da619..749871d4 100644 --- a/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala +++ b/src/test/scala/org/apache/spark/sql/pulsar/PulsarSourceTest.scala @@ -163,8 +163,8 @@ class PulsarSourceTest extends StreamTest with SharedSparkSession with PulsarTes "Cannot add data when there is no query for finding the active pulsar source") val sources = query.get.logicalPlan.collect { - case StreamingExecutionRelation(source: PulsarSource, _) => source - case StreamingExecutionRelation(source: PulsarMicroBatchReader, _) => source + case StreamingExecutionRelation(source: PulsarSource, _, _) => source + case StreamingExecutionRelation(source: PulsarMicroBatchReader, _, _) => source }.distinct if (sources.isEmpty) {