moving functionality to PulsarHelper

ericm-db · ericm-db · commit e7e87b699fb9 · 2023-08-15T16:06:49.000-07:00
diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarHelper.scala
@@ -22,16 +22,22 @@ import scala.collection.mutable
 import scala.language.postfixOps
 import scala.util.control.NonFatal
 
+import org.apache.pulsar.client.admin.PulsarAdmin
 import org.apache.pulsar.client.api.{MessageId, PulsarClient}
 import org.apache.pulsar.client.impl.{MessageIdImpl, PulsarClientImpl}
 import org.apache.pulsar.client.impl.schema.BytesSchema
+import org.apache.pulsar.client.internal.DefaultImplementation
 import org.apache.pulsar.common.api.proto.CommandGetTopicsOfNamespace
 import org.apache.pulsar.common.naming.TopicName
 import org.apache.pulsar.common.schema.SchemaInfo
 import org.apache.pulsar.shade.com.google.common.util.concurrent.Uninterruptibles
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.sql.connector.read.streaming
+import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit}
 import org.apache.spark.sql.pulsar.PulsarOptions._
+import org.apache.spark.sql.pulsar.PulsarSourceUtils.{getEntryId, getLedgerId}
+import org.apache.spark.sql.pulsar.SpecificPulsarOffset.getTopicOffsets
 import org.apache.spark.sql.types.StructType
 
 /**
@@ -40,6 +46,7 @@ import org.apache.spark.sql.types.StructType
  */
 private[pulsar] case class PulsarHelper(
     serviceUrl: String,
+    adminUrl: String,
     clientConf: ju.Map[String, Object],
     driverGroupIdPrefix: String,
     caseInsensitiveParameters: Map[String, String],
@@ -55,6 +62,8 @@ private[pulsar] case class PulsarHelper(
   private var topics: Seq[String] = _
   private var topicPartitions: Seq[String] = _
 
+  private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(adminUrl).build()
+
   override def close(): Unit = {
     // do nothing
   }
@@ -207,6 +216,63 @@ private[pulsar] case class PulsarHelper(
     }.toMap)
   }
 
+  def latestOffsets(startingOffset: streaming.Offset,
+                    admissionLimits: AdmissionLimits): SpecificPulsarOffset = {
+    // implement helper inside PulsarHelper in order to use getTopicPartitions
+    val latestOffsets = fetchLatestOffsets().topicOffsets
+    // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit
+    // start a reader, get to the earliest offset for new topic partitions
+    val existingStartOffsets = if (startingOffset != null) {
+      getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset])
+    } else {
+      Map[String, MessageId]()
+    }
+    val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet)
+    val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition
+    => topicPartition -> fetchLatestOffsetForTopic(topicPartition))
+    val totalReadLimit = admissionLimits.bytesToTake
+    val offsets = mutable.Map[String, MessageId]()
+    offsets ++= startPartitionOffsets
+    val numPartitions = startPartitionOffsets.size
+    startPartitionOffsets.keys.filter(topicPartition => {
+      pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0
+    }).foreach { topicPartition =>
+      var readLimit = totalReadLimit / numPartitions
+      val messageId = startPartitionOffsets.apply(topicPartition)
+      val ledgerId = getLedgerId(messageId)
+      val entryId = getEntryId(messageId)
+      val stats = pulsarAdmin.topics.getInternalStats(topicPartition)
+      pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.
+        asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach { ledger =>
+        ledger.entries = stats.currentLedgerEntries
+        val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries
+        // approximation of bytes left in ledger to deal with case
+        // where we are at the middle of the ledger
+        val bytesLeftInLedger = avgBytesPerEntries * {
+          if (ledger.ledgerId == ledgerId) {
+            ledger.entries - entryId
+          } else {
+            ledger.entries
+          }
+        }
+        if (readLimit > bytesLeftInLedger) {
+          readLimit -= bytesLeftInLedger
+          offsets += (topicPartition -> DefaultImplementation
+            .getDefaultImplementation
+            .newMessageId(ledger.ledgerId, ledger.entries - 1, -1))
+        } else {
+          val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries)
+          val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead)
+          offsets += (topicPartition -> DefaultImplementation
+            .getDefaultImplementation
+            .newMessageId(ledger.ledgerId, lastEntryRead, -1))
+          readLimit = 0
+        }
+      }
+    }
+    SpecificPulsarOffset(offsets.toMap)
+  }
+
   def fetchLatestOffsetForTopic(topic: String): MessageId = {
     val messageId =
       try {
diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarProvider.scala
@@ -56,12 +56,13 @@ private[pulsar] class PulsarProvider
       parameters: Map[String, String]): (String, StructType) = {
 
     val caseInsensitiveParams = validateStreamOptions(parameters)
-    val (clientConfig, _, serviceUrlConfig, _) = prepareConfForReader(parameters)
+    val (clientConfig, _, serviceUrlConfig, adminUrl) = prepareConfForReader(parameters)
 
     val subscriptionNamePrefix = s"spark-pulsar-${UUID.randomUUID}"
     val inferredSchema = Utils.tryWithResource(
       PulsarHelper(
         serviceUrlConfig,
+        adminUrl,
         clientConfig,
         subscriptionNamePrefix,
         caseInsensitiveParams,
@@ -91,6 +92,7 @@ private[pulsar] class PulsarProvider
     val subscriptionNamePrefix = getSubscriptionPrefix(parameters)
     val pulsarHelper = PulsarHelper(
       serviceUrl,
+      adminUrl,
       clientConfig,
       subscriptionNamePrefix,
       caseInsensitiveParams,
@@ -127,10 +129,11 @@ private[pulsar] class PulsarProvider
 
     val subscriptionNamePrefix = getSubscriptionPrefix(parameters, isBatch = true)
 
-    val (clientConfig, readerConfig, serviceUrl, _) = prepareConfForReader(parameters)
+    val (clientConfig, readerConfig, serviceUrl, adminUrl) = prepareConfForReader(parameters)
     val (start, end, schema, pSchema) = Utils.tryWithResource(
       PulsarHelper(
         serviceUrl,
+        adminUrl,
         clientConfig,
         subscriptionNamePrefix,
         caseInsensitiveParams,
diff --git a/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala b/src/main/scala/org/apache/spark/sql/pulsar/PulsarSource.scala
@@ -68,89 +68,25 @@ private[pulsar] class PulsarSource(
 
   private var currentTopicOffsets: Option[Map[String, MessageId]] = None
 
-  private lazy val pulsarAdmin = PulsarAdmin.builder().serviceHttpUrl(serviceUrl).build()
 
   private lazy val pulsarSchema: SchemaInfo = pulsarHelper.getPulsarSchema
 
   override def schema(): StructType = SchemaUtils.pulsarSourceSchema(pulsarSchema)
 
   override def getOffset: Option[Offset] = {
-    // Make sure initialTopicOffsets is initialized
-    initialTopicOffsets
-    val latest = pulsarHelper.fetchLatestOffsets()
-    currentTopicOffsets = Some(latest.topicOffsets)
-    Some(latest.asInstanceOf[Offset])
+    throw new UnsupportedOperationException(
+      "latestOffset(Offset, ReadLimit) should be called instead of this method")
   }
 
   override def latestOffset(startingOffset: streaming.Offset,
                             readLimit: ReadLimit): streaming.Offset = {
     initialTopicOffsets
-    // implement helper inside PulsarHelper in order to use getTopicPartitions
-    val latestOffsets = pulsarHelper.fetchLatestOffsets().topicOffsets
-    // add new partitions from PulsarAdmin, set to earliest entry and ledger id based on limit
-    // start a reader, get to the earliest offset for new topic partitions
-    val existingStartOffsets = if (startingOffset != null) {
-      getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset])
-    } else {
-      Map[String, MessageId]()
-    }
-    val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet)
-    val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition
-      => topicPartition -> pulsarHelper.fetchLatestOffsetForTopic(topicPartition))
-    val totalReadLimit = AdmissionLimits(readLimit).get.bytesToTake
-    val offsets = mutable.Map[String, MessageId]()
-    offsets ++= startPartitionOffsets
-    val numPartitions = startPartitionOffsets.size
-    startPartitionOffsets.keys.filter(topicPartition => {
-      pulsarAdmin.topics.getInternalStats(topicPartition).currentLedgerEntries > 0
-    }).foreach { topicPartition =>
-      var readLimit = totalReadLimit / numPartitions
-      val messageId = startPartitionOffsets.apply(topicPartition)
-      val ledgerId = getLedgerId(messageId)
-      val entryId = getEntryId(messageId)
-      val stats = pulsarAdmin.topics.getInternalStats(topicPartition)
-      pulsarAdmin.topics.getInternalStats(topicPartition).ledgers.
-        asScala.filter(_.ledgerId >= ledgerId).sortBy(_.ledgerId).foreach{ ledger =>
-        ledger.entries = stats.currentLedgerEntries
-        val avgBytesPerEntries = stats.currentLedgerSize / stats.currentLedgerEntries
-        // approximation of bytes left in ledger to deal with case
-        // where we are at the middle of the ledger
-        val bytesLeftInLedger = avgBytesPerEntries * {
-          if (ledger.ledgerId == ledgerId) {
-            ledger.entries - entryId
-          } else {
-            ledger.entries
-          }
-        }
-        if (readLimit > bytesLeftInLedger) {
-          readLimit -= bytesLeftInLedger
-          offsets += (topicPartition -> DefaultImplementation
-            .getDefaultImplementation
-            .newMessageId(ledger.ledgerId, ledger.entries - 1, -1))
-        } else {
-          val numEntriesToRead = Math.max(1, readLimit / avgBytesPerEntries)
-          val lastEntryRead = Math.min(ledger.entries - 1, entryId + numEntriesToRead)
-          offsets += (topicPartition -> DefaultImplementation
-            .getDefaultImplementation
-            .newMessageId(ledger.ledgerId, lastEntryRead, -1))
-          readLimit = 0
-        }
-      }
-    }
-    SpecificPulsarOffset(offsets.toMap)
+    val admissionLimits = AdmissionLimits(readLimit)
+    pulsarHelper.latestOffsets(startingOffset, admissionLimits.get)
   }
   override def getDefaultReadLimit: ReadLimit = {
       ReadMaxBytes.apply(maxBytesPerTrigger)
   }
-  class AdmissionLimits(var bytesToTake: Long)
-
-  object AdmissionLimits {
-    def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match {
-      case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes))
-      case _  : ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue))
-    }
-
-  }
 
   override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
     // Make sure initialTopicOffsets is initialized
@@ -257,3 +193,13 @@ private[pulsar] class PulsarSource(
 
 /** A read limit that admits a soft-max of `maxBytes` per micro-batch. */
 case class ReadMaxBytes(maxBytes: Long) extends ReadLimit
+
+class AdmissionLimits(var bytesToTake: Long)
+
+object AdmissionLimits {
+  def apply(limit: ReadLimit): Option[AdmissionLimits] = limit match {
+    case maxBytes: ReadMaxBytes => Some(new AdmissionLimits(maxBytes.maxBytes))
+    case _: ReadAllAvailable => Some(new AdmissionLimits(Int.MaxValue))
+  }
+
+}