Skip to content

Commit cc95d38

Browse files
authored
added CapacityError to be used for failing task launch promises after some number of unmatched offer cycles (#40)
1 parent 49b0cde commit cc95d38

6 files changed

Lines changed: 203 additions & 32 deletions

File tree

src/main/resources/reference.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
mesos-actor {
33
agent-stats-ttl = 1 minutes //minimum time that agent stats remain valid without an update; increase time in case offer cycle is slow; decrease time to ignore offer data that may be stale
44
agent-stats-pruning-period = 5 seconds //time period between pruning of expired agent stats
5+
//fail-pending-offer-cycles = 2 //fail tasks that are not matched within this number of offer cycles
56
}

src/main/scala/com/adobe/api/platform/runtime/mesos/MesosClient.scala

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import org.apache.mesos.v1.scheduler.Protos.Call
4343
import org.apache.mesos.v1.scheduler.Protos.Call._
4444
import org.apache.mesos.v1.scheduler.Protos.Event
4545
import pureconfig._
46+
import pureconfig.loadConfigOrThrow
4647
import scala.collection.JavaConverters._
4748
import scala.collection.mutable
4849
import scala.collection.mutable.Buffer
@@ -119,7 +120,7 @@ case class CommandURIDef(uri: URI, extract: Boolean = true, cache: Boolean = fal
119120

120121
//task states
121122
sealed abstract class TaskState()
122-
case class SubmitPending(reqs: TaskDef, promise: Promise[Running]) extends TaskState
123+
case class SubmitPending(reqs: TaskDef, promise: Promise[Running], offerCycles: Int = 1) extends TaskState
123124
case class Submitted(pending: SubmitPending,
124125
taskInfo: TaskInfo,
125126
offer: OfferID,
@@ -137,10 +138,19 @@ case class DeletePending(taskId: String, promise: Promise[Deleted]) extends Task
137138
case class Deleted(taskId: String, taskStatus: TaskStatus) extends TaskState
138139
case class Failed(taskId: String, agentId: String) extends TaskState
139140

140-
case class MesosActorConfig(agentStatsTTL: FiniteDuration, agentStatsPruningPeriod: FiniteDuration)
141+
case class MesosActorConfig(agentStatsTTL: FiniteDuration,
142+
agentStatsPruningPeriod: FiniteDuration,
143+
failPendingOfferCycles: Option[Int])
141144
case class AgentStats(mem: Double, cpu: Double, ports: Int, expiration: Instant)
142145

143146
case class MesosAgentStats(stats: Map[String, AgentStats])
147+
148+
case class CapacityFailure(requiredMem: Float,
149+
requiredCpu: Float,
150+
requiredPorts: Int,
151+
remainingResources: List[(Float, Float, Int)])
152+
extends MesosException("cluster does not have capacity")
153+
144154
//TODO: mesos authentication
145155
trait MesosClientActor extends Actor with ActorLogging with MesosClientConnection {
146156
implicit val ec: ExecutionContext = context.dispatcher
@@ -167,7 +177,7 @@ trait MesosClientActor extends Actor with ActorLogging with MesosClientConnectio
167177
var agentOfferHistory = Map.empty[String, AgentStats] // Map[<agent hostname> -> <stats>] track the most recent offer stats per agent hostname
168178
val listener: Option[ActorRef]
169179

170-
val config = loadConfigOrThrow[MesosActorConfig]("mesos-actor")
180+
val config: MesosActorConfig
171181

172182
if (autoSubscribe) {
173183
log.info(s"auto-subscribing ${self} to mesos master at ${master}")
@@ -191,13 +201,14 @@ trait MesosClientActor extends Actor with ActorLogging with MesosClientConnectio
191201
}
192202
case object PruneStats
193203

194-
val statsPruner =
204+
override def preStart() = {
195205
actorSystem.scheduler.schedule(30.seconds, config.agentStatsPruningPeriod, context.actorOf(Props(new Actor {
196206
override def receive: Receive = {
197207
case PruneStats =>
198208
context.parent ! PruneStats //client actor needs to handle PruneStats to avoid concurrent update to stats map
199209
}
200210
})), PruneStats)
211+
}
201212
//cache the framework id, so that in case this actor restarts we can reconnect
202213
if (MesosClient.frameworkID.isEmpty) MesosClient.frameworkID = Some(FrameworkID.newBuilder().setValue(id()).build())
203214
private val frameworkID = MesosClient.frameworkID.get
@@ -234,7 +245,7 @@ trait MesosClientActor extends Actor with ActorLogging with MesosClientConnectio
234245
tasks.get(taskID.getValue) match {
235246
case Some(taskDetails) =>
236247
taskDetails match {
237-
case SubmitPending(taskDef, promise) => {
248+
case SubmitPending(taskDef, promise, _) => {
238249
log.info(s"deleting unlaunched task ${taskDef.taskId}")
239250
tasks.remove(taskDef.taskId)
240251
}
@@ -434,7 +445,8 @@ trait MesosClientActor extends Actor with ActorLogging with MesosClientConnectio
434445

435446
log.debug(s"agent offer stats: {}", agentOfferHistory)
436447

437-
val matchedTasks = taskMatcher.matchTasksToOffers(role, pending, event.getOffersList.asScala.toList, taskBuilder)
448+
val (matchedTasks, remaining) =
449+
taskMatcher.matchTasksToOffers(role, pending, event.getOffersList.asScala.toList, taskBuilder)
438450

439451
val matchedCount = matchedTasks.foldLeft(0)(_ + _._2.size)
440452
log.info(s"matched ${matchedCount} tasks to ${matchedTasks.size} offers out of ${pending.size} pending tasks")
@@ -474,7 +486,7 @@ trait MesosClientActor extends Actor with ActorLogging with MesosClientConnectio
474486
matchedTasks.foreach(entry => {
475487
entry._2.foreach(task => {
476488
tasks(task._1.getTaskId.getValue) match {
477-
case s @ SubmitPending(reqs, promise) =>
489+
case s @ SubmitPending(reqs, promise, _) =>
478490
//dig the hostname out of the offer whose agent id matches the agent id in the task info
479491
val hostname =
480492
event.getOffersList.asScala.find(p => p.getAgentId == task._1.getAgentId).get.getHostname
@@ -493,6 +505,27 @@ trait MesosClientActor extends Actor with ActorLogging with MesosClientConnectio
493505
}
494506

495507
}
508+
//generate failures for pending tasks that did not fit any offers
509+
config.failPendingOfferCycles.foreach { maxOfferCycles =>
510+
val submitPending = tasks.collect { case (_, s: SubmitPending) => s }
511+
if (submitPending.nonEmpty) {
512+
submitPending.foreach { task =>
513+
//println(s"task offerCycles:${task.offerCycles} ${maxOfferCycles}")
514+
if (task.offerCycles > maxOfferCycles) {
515+
log.warning(s"failing task ${task.reqs.taskId} after ${task.offerCycles} unmatching offer cycles")
516+
task.promise.failure(
517+
new CapacityFailure(
518+
task.reqs.mem.toFloat,
519+
task.reqs.cpus.toFloat,
520+
task.reqs.ports.size,
521+
remaining.values.toList))
522+
tasks.remove(task.reqs.taskId)
523+
} else {
524+
tasks.update(task.reqs.taskId, task.copy(offerCycles = task.offerCycles + 1)) //increase the offer cycles this task has seen
525+
}
526+
}
527+
}
528+
}
496529

497530
//store a reference of last memory offer (total) from each agent
498531
val newOfferStats = MesosClient.getOfferStats(config, role, agentOfferMap)
@@ -691,7 +724,8 @@ class MesosClient(val id: () => String,
691724
val tasks: TaskStore,
692725
val refuseSeconds: Double,
693726
val heartbeatMaxFailures: Int,
694-
val listener: Option[ActorRef])
727+
val listener: Option[ActorRef],
728+
val config: MesosActorConfig)
695729
extends MesosClientActor
696730
with MesosClientHttpConnection {}
697731

@@ -710,7 +744,8 @@ object MesosClient {
710744
taskStore: TaskStore,
711745
refuseSeconds: Double = 5.0,
712746
heartbeatMaxFailures: Int = 2,
713-
listener: Option[ActorRef] = None): Props =
747+
listener: Option[ActorRef] = None,
748+
config: MesosActorConfig = loadConfigOrThrow[MesosActorConfig]("mesos-actor")): Props =
714749
Props(
715750
new MesosClient(
716751
id,
@@ -724,7 +759,8 @@ object MesosClient {
724759
taskStore,
725760
refuseSeconds,
726761
heartbeatMaxFailures,
727-
listener))
762+
listener,
763+
config))
728764

729765
//TODO: allow task persistence/reconcile
730766

src/main/scala/com/adobe/api/platform/runtime/mesos/TaskMatcher.scala

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,16 @@ import org.apache.mesos.v1.Protos.TaskInfo
2323
import org.apache.mesos.v1.Protos.Value
2424
import org.apache.mesos.v1.Protos.Value.Ranges
2525
import scala.collection.JavaConverters._
26+
import scala.collection.mutable.Buffer
2627
import scala.collection.mutable.ListBuffer
2728

2829
trait TaskMatcher {
2930
def matchTasksToOffers(role: String, t: Iterable[TaskDef], o: Iterable[Offer], builder: TaskBuilder)(
30-
implicit logger: LoggingAdapter): Map[OfferID, Seq[(TaskInfo, Seq[Int])]]
31+
implicit logger: LoggingAdapter): (Map[OfferID, Seq[(TaskInfo, Seq[Int])]], Map[OfferID, (Float, Float, Int)])
3132
}
3233
class DefaultTaskMatcher(isValid: Offer => Boolean = _ => true) extends TaskMatcher {
3334
override def matchTasksToOffers(role: String, t: Iterable[TaskDef], o: Iterable[Offer], builder: TaskBuilder)(
34-
implicit logger: LoggingAdapter): Map[OfferID, Seq[(TaskInfo, Seq[Int])]] = {
35+
implicit logger: LoggingAdapter): (Map[OfferID, Seq[(TaskInfo, Seq[Int])]], Map[OfferID, (Float, Float, Int)]) = {
3536
//we can launch many tasks on a single offer
3637

3738
var tasksInNeed: ListBuffer[TaskDef] = t.to[ListBuffer]
@@ -40,7 +41,7 @@ class DefaultTaskMatcher(isValid: Offer => Boolean = _ => true) extends TaskMatc
4041
_.getResourcesList.asScala
4142
.find(_.getName == "cpus")
4243
.map(_.getScalar.getValue))
43-
44+
var remaining: Map[OfferID, (Float, Float, Int)] = Map.empty
4445
sortedOffers.foreach(offer => {
4546
try {
4647
//for testing worst case scenario...
@@ -68,6 +69,17 @@ class DefaultTaskMatcher(isValid: Offer => Boolean = _ => true) extends TaskMatc
6869
var remainingOfferMem = scalarResources("mem")
6970
var usedPorts = ListBuffer[Int]()
7071
var acceptedTasks = ListBuffer[(TaskInfo, Seq[Int])]()
72+
//check for a good fit
73+
//collect ranges from ports resources
74+
val offerPortsRanges = offer.getResourcesList.asScala
75+
.filter(res => res.getName == "ports")
76+
.filter(_.getRole == role) //ignore resources with other roles
77+
.map(res => res.getRanges)
78+
79+
//before matching, consider all resources remaining
80+
remaining = remaining + (offer.getId -> (remainingOfferMem.toFloat, remainingOfferCpus.toFloat, countPorts(
81+
offerPortsRanges) - usedPorts.size))
82+
7183
tasksInNeed.map(task => {
7284

7385
val taskCpus = task.cpus
@@ -86,12 +98,6 @@ class DefaultTaskMatcher(isValid: Offer => Boolean = _ => true) extends TaskMatc
8698
}
8799
})
88100
logger.debug(s"constraintChecks ${constraintChecks}")
89-
//check for a good fit
90-
//collect ranges from ports resources
91-
val offerPortsRanges = offer.getResourcesList.asScala
92-
.filter(res => res.getName == "ports")
93-
.filter(_.getRole == role) //ignore resources with other roles
94-
.map(res => res.getRanges)
95101
//pluck the number of ports needed for this task
96102
val hostPorts = pluckPorts(offerPortsRanges, task.ports.size, usedPorts)
97103
val matchedResources = remainingOfferCpus > taskCpus &&
@@ -105,12 +111,13 @@ class DefaultTaskMatcher(isValid: Offer => Boolean = _ => true) extends TaskMatc
105111
logger.info(
106112
s"offer did not match resource requirements cpu:${taskCpus} (${remainingOfferCpus}), mem: ${taskMem} (${remainingOfferMem}), ports: ${task.ports.size} (${hostPorts.size})")
107113
} else {
108-
109114
//mark resources as used
110115
remainingOfferCpus -= taskCpus
111116
remainingOfferMem -= taskMem
112117
usedPorts ++= hostPorts
113-
118+
//update remaining for all resource usage
119+
remaining = remaining + (offer.getId -> (remainingOfferMem.toFloat, remainingOfferCpus.toFloat, countPorts(
120+
offerPortsRanges) - usedPorts.size))
114121
//build port mappings
115122
val portMappings =
116123
if (task.ports.isEmpty) List()
@@ -180,7 +187,7 @@ class DefaultTaskMatcher(isValid: Offer => Boolean = _ => true) extends TaskMatc
180187
case t: Exception => logger.error(s"task matching failed, ignoring offer ${offer.getId} ${t}")
181188
}
182189
})
183-
result
190+
(result, remaining)
184191
}
185192

186193
def pluckPorts(rangesList: Iterable[org.apache.mesos.v1.Protos.Value.Ranges],
@@ -201,4 +208,7 @@ class DefaultTaskMatcher(isValid: Offer => Boolean = _ => true) extends TaskMatc
201208
})
202209
ports.toList
203210
}
211+
def countPorts(portRanges: Buffer[Ranges]) = {
212+
portRanges.foldLeft(0)(_ + _.getRangeList.asScala.foldLeft(0)((a, b) => a + b.getEnd.toInt - b.getBegin.toInt + 1))
213+
}
204214
}

src/test/resources/emptyoffer.json

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
{
2+
"offers": [{
3+
"id": {
4+
"value": "7168e411-c3e4-4e29-b292-9b12eda4aaca-O58"
5+
},
6+
"frameworkId": {
7+
"value": "sample-11fdf320-311f-42b0-a284-dbc38bb8d191"
8+
},
9+
"agentId": {
10+
"value": "db6b062d-84e3-4a2e-a8c5-98ffa944a304-S0"
11+
},
12+
"hostname": "192.168.99.100",
13+
"resources": [{
14+
"name": "ports",
15+
"type": "RANGES",
16+
"ranges": {
17+
"range": [{
18+
"begin": "11001",
19+
"end": "11199"
20+
}]
21+
},
22+
"role": "*",
23+
"allocationInfo": {
24+
"role": "*"
25+
}
26+
}, {
27+
"name": "cpus",
28+
"type": "SCALAR",
29+
"scalar": {
30+
"value": 0.0
31+
},
32+
"role": "*",
33+
"allocationInfo": {
34+
"role": "*"
35+
}
36+
}, {
37+
"name": "mem",
38+
"type": "SCALAR",
39+
"scalar": {
40+
"value": 0.0
41+
},
42+
"role": "*",
43+
"allocationInfo": {
44+
"role": "*"
45+
}
46+
}, {
47+
"name": "disk",
48+
"type": "SCALAR",
49+
"scalar": {
50+
"value": 0.0
51+
},
52+
"role": "*",
53+
"allocationInfo": {
54+
"role": "*"
55+
}
56+
}],
57+
"attributes": [{
58+
"name": "att1",
59+
"type": "TEXT",
60+
"text": {
61+
"value": "att1valueslave1"
62+
}
63+
}, {
64+
"name": "att2",
65+
"type": "TEXT",
66+
"text": {
67+
"value": "att2valueslave1"
68+
}
69+
}],
70+
"url": {
71+
"scheme": "http",
72+
"address": {
73+
"hostname": "192.168.99.100",
74+
"ip": "192.168.99.100",
75+
"port": 5051
76+
},
77+
"path": "/slave(1)"
78+
},
79+
"allocationInfo": {
80+
"role": "*"
81+
}
82+
}]
83+
}

0 commit comments

Comments
 (0)