@@ -22,16 +22,22 @@ import scala.collection.mutable
2222import scala .language .postfixOps
2323import scala .util .control .NonFatal
2424
25+ import org .apache .pulsar .client .admin .PulsarAdmin
2526import org .apache .pulsar .client .api .{MessageId , PulsarClient }
2627import org .apache .pulsar .client .impl .{MessageIdImpl , PulsarClientImpl }
2728import org .apache .pulsar .client .impl .schema .BytesSchema
29+ import org .apache .pulsar .client .internal .DefaultImplementation
2830import org .apache .pulsar .common .api .proto .CommandGetTopicsOfNamespace
2931import org .apache .pulsar .common .naming .TopicName
3032import org .apache .pulsar .common .schema .SchemaInfo
3133import org .apache .pulsar .shade .com .google .common .util .concurrent .Uninterruptibles
3234
3335import org .apache .spark .internal .Logging
36+ import org .apache .spark .sql .connector .read .streaming
37+ import org .apache .spark .sql .connector .read .streaming .{ReadAllAvailable , ReadLimit }
3438import org .apache .spark .sql .pulsar .PulsarOptions ._
39+ import org .apache .spark .sql .pulsar .PulsarSourceUtils .{getEntryId , getLedgerId }
40+ import org .apache .spark .sql .pulsar .SpecificPulsarOffset .getTopicOffsets
3541import org .apache .spark .sql .types .StructType
3642
3743/**
@@ -40,6 +46,7 @@ import org.apache.spark.sql.types.StructType
4046 */
4147private [pulsar] case class PulsarHelper (
4248 serviceUrl : String ,
49+ adminUrl : String ,
4350 clientConf : ju.Map [String , Object ],
4451 driverGroupIdPrefix : String ,
4552 caseInsensitiveParameters : Map [String , String ],
@@ -55,6 +62,8 @@ private[pulsar] case class PulsarHelper(
// Topic names used by this helper; declared without a value here and assigned elsewhere.
5562 private var topics : Seq [String ] = _
// Topic-partition names used by this helper; declared without a value here and assigned elsewhere.
5663 private var topicPartitions : Seq [String ] = _
5764
// Admin client pointed at `adminUrl`; being a lazy val, it is only built on first access.
// NOTE(review): nothing visible here closes this client - confirm whether it needs releasing.
65+ private lazy val pulsarAdmin = PulsarAdmin .builder().serviceHttpUrl(adminUrl).build()
66+
/** Deliberately a no-op: closing this helper performs no work. */
override def close(): Unit = ()
@@ -207,6 +216,63 @@ private[pulsar] case class PulsarHelper(
207216 }.toMap)
208217 }
209218
/**
 * Computes the end offsets for the next batch by advancing each topic partition from its start
 * offset by an equal share of the admission byte budget.
 *
 * Partitions present in `startingOffset` resume from the offset recorded there. Partitions that
 * are only known from [[fetchLatestOffsets]] start from [[fetchLatestOffsetForTopic]]. For each
 * partition the ledgers at or after the start ledger are walked in ledger-id order, and the
 * byte budget is converted into a number of entries using the ledger's average entry size.
 * The walk stops at the first ledger that cannot be consumed completely, so the returned offset
 * never moves past the point where the budget ran out. At least one entry is always admitted
 * for that ledger so the stream keeps making progress.
 *
 * @param startingOffset  offset the batch starts from; `null` is treated as "no known offsets"
 * @param admissionLimits source of the total byte budget (`bytesToTake`) for this batch
 * @return the per-partition end offsets; partitions with nothing to read keep their start offset
 */
def latestOffsets(
    startingOffset: streaming.Offset,
    admissionLimits: AdmissionLimits): SpecificPulsarOffset = {
  // Kept inside PulsarHelper so the topic-partition discovery of fetchLatestOffsets is reused.
  val latestOffsets = fetchLatestOffsets().topicOffsets
  val existingStartOffsets = if (startingOffset != null) {
    getTopicOffsets(startingOffset.asInstanceOf[org.apache.spark.sql.execution.streaming.Offset])
  } else {
    Map[String, MessageId]()
  }
  // Partitions that have no start offset yet begin at their current latest offset.
  val newTopics = latestOffsets.keySet.diff(existingStartOffsets.keySet)
  val startPartitionOffsets = existingStartOffsets ++ newTopics.map(topicPartition =>
    topicPartition -> fetchLatestOffsetForTopic(topicPartition))
  val totalReadLimit = admissionLimits.bytesToTake
  val numPartitions = startPartitionOffsets.size
  val offsets = mutable.Map[String, MessageId]()
  offsets ++= startPartitionOffsets

  startPartitionOffsets.foreach { case (topicPartition, messageId) =>
    // One stats request per partition: every decision below sees the same snapshot.
    val stats = pulsarAdmin.topics.getInternalStats(topicPartition)
    val allLedgers = stats.ledgers.asScala
    val startLedgerId = getLedgerId(messageId)
    val startEntryId = getEntryId(messageId)
    // NOTE(review): assumes the ledger with the highest id is the one still open for writes,
    // whose counters are reported through currentLedgerEntries / currentLedgerSize rather than
    // through its own entry in `ledgers` - confirm against the broker version in use.
    val openLedgerId = if (allLedgers.isEmpty) -1L else allLedgers.map(_.ledgerId).max
    val pendingLedgers =
      allLedgers.filter(_.ledgerId >= startLedgerId).sortBy(_.ledgerId).iterator

    var readLimit = totalReadLimit / numPartitions
    var limitReached = false
    while (!limitReached && pendingLedgers.hasNext) {
      val ledger = pendingLedgers.next()
      val isOpenLedger = ledger.ledgerId == openLedgerId
      // Read the counters into locals instead of mutating the stats object.
      val ledgerEntries = if (isOpenLedger) stats.currentLedgerEntries else ledger.entries
      val ledgerSize = if (isOpenLedger) stats.currentLedgerSize else ledger.size
      // Ledgers without entries contribute nothing and would break the average below.
      if (ledgerEntries > 0) {
        // Clamp to one byte so the entry-count division can never divide by zero.
        val avgBytesPerEntry = Math.max(1L, ledgerSize / ledgerEntries)
        val isStartLedger = ledger.ledgerId == startLedgerId
        // Approximation of the bytes left in the ledger, to deal with the case where the
        // start offset sits in the middle of it.
        val entriesLeft = if (isStartLedger) ledgerEntries - startEntryId else ledgerEntries
        val bytesLeftInLedger = avgBytesPerEntry * entriesLeft
        if (readLimit > bytesLeftInLedger) {
          // The whole remainder of this ledger fits: consume it and move on to the next one.
          readLimit -= bytesLeftInLedger
          offsets += (topicPartition -> DefaultImplementation
            .getDefaultImplementation
            .newMessageId(ledger.ledgerId, ledgerEntries - 1, -1))
        } else {
          val numEntriesToRead = Math.max(1L, readLimit / avgBytesPerEntry)
          // Only the start ledger is entered part-way; later ledgers are read from entry 0.
          val targetEntryId =
            if (isStartLedger) startEntryId + numEntriesToRead else numEntriesToRead - 1
          val lastEntryRead = Math.min(ledgerEntries - 1, targetEntryId)
          offsets += (topicPartition -> DefaultImplementation
            .getDefaultImplementation
            .newMessageId(ledger.ledgerId, lastEntryRead, -1))
          // Budget exhausted: stop here so later ledgers cannot overwrite this offset.
          limitReached = true
        }
      }
    }
  }
  SpecificPulsarOffset(offsets.toMap)
}
275+
210276 def fetchLatestOffsetForTopic (topic : String ): MessageId = {
211277 val messageId =
212278 try {
0 commit comments