spark源码研究—作业提交

现在的位置: 首页 > 综合 > 正文

spark源码研究—作业提交

2019年05月23日 ⁄ 综合 ⁄ 共 15895字 ⁄ 字号小中大 ⁄ 评论关闭

注意看标红的地方，那是代码的执行流程；有些通信的地方没有标注，因为通信实在太多了。

DAG：

<span style="font-size:14px;"><span style="font-size:18px;"> private[scheduler] def handleJobSubmitted(jobId: Int,
      finalRDD: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      allowLocal: Boolean,
      callSite: CallSite,
      listener: JobListener,
      properties: Properties = null)
  {
    // Builds the final stage for the submitted job, then either runs the job locally or
    // registers it as active and submits its final stage.
    var finalStage: Stage = null
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.
      finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
    } catch {
      case e: Exception =>
        // Stage creation failed: report the failure to the job's listener and give up.
        logWarning("Creating new stage failed due to exception - job: " + jobId, e)
        listener.jobFailed(e)
        return
    }
    if (finalStage != null) {
   <span style="color:#ff0000;">   val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)</span>
      clearCacheLocs()
      logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
        job.jobId, callSite.shortForm, partitions.length, allowLocal))
      logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
      logInfo("Parents of final stage: " + finalStage.parents)
      logInfo("Missing parents: " + getMissingParentStages(finalStage))
      // Local execution requires all four conditions: the feature is enabled, the caller
      // allows it, the final stage has no parents, and exactly one partition is requested.
      val shouldRunLocally =
        localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
      if (shouldRunLocally) {
        // Compute very short actions like first() or take() with no parent stages locally.
        // The job-start event is posted with an empty stage-id array in this case.
        listenerBus.post(SparkListenerJobStart(job.jobId, Array[Int](), properties))
        runLocally(job)
      } else {
        // Record the job as active, link it to its final stage, announce the job start with
        // the stage ids registered for this job, then submit the final stage.
        jobIdToActiveJob(jobId) = job
        activeJobs += job
        finalStage.resultOfJob = Some(job)
        listenerBus.post(SparkListenerJobStart(job.jobId, jobIdToStageIds(jobId).toArray,
          properties))
   <span style="color:#ff0000;">     submitStage(finalStage)</span>
      }
    }
    // Reached on every path except the stage-creation failure above, which returns early.
    submitWaitingStages()
  }</span></span>

再来：

<span style="font-size:14px;"><span style="font-size:18px;">/** Submits stage, but first recursively submits any missing parents. */
  private def submitStage(stage: Stage) {
    // Submits `stage` on behalf of its active job: missing parent stages are submitted first
    // (recursively) and this stage is parked in waitingStages; when no parent is missing its
    // tasks are submitted right away. A stage without an active job is aborted.
    val jobId = activeJobForStage(stage)
    if (jobId.isDefined) {
      logDebug("submitStage(" + stage + ")")
      // Nothing to do when the stage is already waiting, running or marked as failed.
      if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
        val missing = getMissingParentStages(stage).sortBy(_.id)<span style="color:#ff0000;">//author's note: getMissingParentStages is worth comparing with submitStage</span>
        logDebug("missing: " + missing)
        if (missing == Nil) {<span style="color:#ff0000;">//no parent stage is missing, so this stage can run now (author's note: e.g. it is the first stage)</span>
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
     <span style="color:#ff0000;">     submitMissingTasks(stage, jobId.get)</span>
        } else {
          // Submit the missing parents in ascending id order, then wait for them.
          for (parent <- missing) {
            submitStage(parent)
          }
          waitingStages += stage
        }
      }
    } else {
      abortStage(stage, "No active job for stage " + stage.id)
    }
  }
</span></span>

再来：这个方法负责将stage转化为tasks。task是并行运行的，task的个数由该stage中需要计算的partition个数决定

<span style="font-size:14px;"><span style="font-size:18px;">  private def submitMissingTasks(stage: Stage, jobId: Int) {
    // Turns the still-uncomputed partitions of `stage` into tasks and hands them to the task
    // scheduler as one TaskSet; aborts the stage if the task payload cannot be serialized.
    logDebug("submitMissingTasks(" + stage + ")")
    // Get our pending tasks and remember them in our pendingTasks entry
    stage.pendingTasks.clear()// author's note: why clear here? This submission may be a resubmission after only some partitions failed, so only the partitions that still need computing are run

    // First figure out the indexes of partition ids to compute.
  <span style="color:#ff0000;">  val partitionsToCompute: Seq[Int] = {// author's note (not shown in this code): shuffle output is written to separate files
      if (stage.isShuffleMap) {
        (0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)// partitions with no output location recorded yet, i.e. not yet processed
      } else {
        val job = stage.resultOfJob.get
        (0 until job.numPartitions).filter(id => !job.finished(id))
      }
    }</span>

    // NOTE(review): the guard tests `jobId` but the lookup uses `stage.jobId` -- confirm the
    // two always refer to the same job, otherwise the lookup can miss.
    val properties = if (jobIdToActiveJob.contains(jobId)) {
      jobIdToActiveJob(stage.jobId).properties
    } else {
      // this stage will be assigned to "default" pool
      null
    }

    runningStages += stage
    // SparkListenerStageSubmitted should be posted before testing whether tasks are
    // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
    // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
    // event.
    stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
    listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

    // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
    // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
    // the serialized copy of the RDD and for each task we will deserialize it, which means each
    // task gets a different copy of the RDD. This provides stronger isolation between tasks that
    // might modify state of objects referenced in their closures. This is necessary in Hadoop
    // where the JobConf/Configuration object is not thread-safe.
  <span style="color:#ff0000;">  var taskBinary: Broadcast[Array[Byte]] = null// broadcast of the serialized (rdd, shuffleDep) or (rdd, func) for this stage
    try {
      // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
      // For ResultTask, serialize and broadcast (rdd, func).
      val taskBinaryBytes: Array[Byte] =
        if (stage.isShuffleMap) {
          closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()// author's note: one broadcast per shuffle stage submission; this payload is not itself the runnable task
        } else {
          closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()
        }
      taskBinary = sc.broadcast(taskBinaryBytes)</span>
    } catch {
      // In the case of a failure during serialization, abort the stage.
      case e: NotSerializableException =>
        abortStage(stage, "Task not serializable: " + e.toString)
        runningStages -= stage
        return
      case NonFatal(e) =>
        abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
        runningStages -= stage
        return
    }

 <span style="color:#ff0000;">   val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {
      partitionsToCompute.map { id =>
        val locs = getPreferredLocs(stage.rdd, id)
        val part = stage.rdd.partitions(id)
        new ShuffleMapTask(stage.id, taskBinary, part, locs)// one ShuffleMapTask per partition to compute, each with its own partition and preferred locations; only two kinds of task are built here: ResultTask and ShuffleMapTask
      }
    } else {
      val job = stage.resultOfJob.get
      partitionsToCompute.map { id =>
        // `id` indexes the job's requested partitions; `p` is the matching RDD partition index.
        val p: Int = job.partitions(id)
        val part = stage.rdd.partitions(p)
        val locs = getPreferredLocs(stage.rdd, p)
        new ResultTask(stage.id, taskBinary, part, locs, id)
      }
    }</span>

    if (tasks.size > 0) {
      // Preemptively serialize a task to make sure it can be serialized. We are catching this
      // exception here because it would be fairly hard to catch the non-serializable exception
      // down the road, where we have several different implementations for local scheduler and
      // cluster schedulers.
      //
      // We've already serialized RDDs and closures in taskBinary, but here we check for all other
      // objects such as Partition.
      try {
        closureSerializer.serialize(tasks.head)
      } catch {
        case e: NotSerializableException =>
          abortStage(stage, "Task not serializable: " + e.toString)
          runningStages -= stage
          return
        case NonFatal(e) => // Other exceptions, such as IllegalArgumentException from Kryo.
          abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
          runningStages -= stage
          return
      }

      logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
<span style="color:#ff0000;">      stage.pendingTasks ++= tasks// these are the tasks that actually get run; author's note: they run in parallel</span>
      logDebug("New pending tasks: " + stage.pendingTasks)
    <span style="background-color: rgb(102, 102, 102);">  <span style="color:#ff0000;">taskScheduler</span>.submitTasks(
        new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))//<span style="color:#ff0000;">author's note: control now moves into TaskSchedulerImpl</span></span>
      stage.latestInfo.submissionTime = Some(clock.getTime())
    } else {
      // Because we posted SparkListenerStageSubmitted earlier, we should post
      // SparkListenerStageCompleted here in case there are no tasks to run.
      listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
      logDebug("Stage " + stage + " is actually done; %b %d %d".format(
        stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
      runningStages -= stage
    }
  }</span></span>

再来：

<span style="font-size:14px;"> override def submitTasks(taskSet: TaskSet) {
    // Registers a TaskSetManager for `taskSet` and then asks the backend to revive offers.
    val tasks = taskSet.tasks// author's note: an array of tasks that share the same execution logic and differ only in where their data lives
    logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
    // The manager is created and registered while holding this scheduler's lock.
    this.synchronized {
      val manager = new TaskSetManager(this, taskSet, maxTaskFailures)
      activeTaskSets(taskSet.id) = manager
      schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
    }
    // Called after the synchronized block has been left.
<span style="color:#ff0000;">    backend.reviveOffers()</span>
  }</span>

再来：

<span style="font-size:14px;">   // Make fake resource offers on all executors
    def makeOffers() {
      // Builds one WorkerOffer per entry of executorDataMap (id, host, free cores), passes
      // them to scheduler.resourceOffers and launches whatever tasks come back.
      launchTasks(<span style="color:#ff0000;">scheduler</span>.resourceOffers(executorDataMap.map { case (id, executorData) =>// author's note: the part highlighted in red (scheduler) is the TaskSchedulerImpl
        new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
      }.toSeq))
    }</span>

再来：

<span style="font-size:14px;"> /**
   * <span style="color:#ff0000;">Called by cluster manager to offer resources on slaves</span>. We respond by asking our active task
   * sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
   * that tasks are balanced across the cluster.
   */
  def resourceOffers(offers: Seq[WorkerOffer]):<span style="color:#ff0000;"> Seq[Seq[TaskDescription]]</span> = synchronized {//他返回的是TaskDescription，而这个又是什么呢，看下面
    // Mark each slave as alive and remember its hostname
    // Also track if new executor is added
    var newExecAvail = false
    for (o <- offers) {
      executorIdToHost(o.executorId) = o.host
      if (!executorsByHost.contains(o.host)) {
        executorsByHost(o.host) = new HashSet[String]()
        executorAdded(o.executorId, o.host)
        newExecAvail = true
      }
      for (rack <- getRackForHost(o.host)) {
        hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
      }
    }</span>

再来:

<span style="font-size:14px;">private[spark] class TaskDescription(
    val taskId: Long,
    val executorId: String,
    val name: String,
    val index: Int,    // Index within this task's TaskSet
 <span style="color:#ff0000;">   _serializedTask: ByteBuffer</span>)// author's note: a ByteBuffer, i.e. the task in already-serialized form
  extends Serializable {

  // Because ByteBuffers are not serializable, wrap the task in a SerializableBuffer
  private val buffer = new SerializableBuffer(_serializedTask)

  // Exposes the wrapped buffer's value as the serialized task.
  def serializedTask: ByteBuffer = buffer.value

  // Only the task id and the index are included in the string form.
  override def toString: String = "TaskDescription(TID=%d, index=%d)".format(taskId, index)
}</span>

再来：

<span style="font-size:14px;">// Launch tasks returned by a set of resource offers
    def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
      // Serializes each task description and sends it to its executor, unless the serialized
      // form is too large, in which case the owning task set is aborted instead.
      for (task <- tasks.flatten) {
        val ser = SparkEnv.get.closureSerializer.newInstance()
        val serializedTask = ser.serialize(task)
        // Too large when the serialized size reaches the frame size minus the reserved bytes.
        if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
          val taskSetId = scheduler.taskIdToTaskSetId(task.taskId)
          // Abort the task set only if it is still registered as active.
          scheduler.activeTaskSets.get(taskSetId).foreach { taskSet =>
            try {
              var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
                "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
                "spark.akka.frameSize or using broadcast variables for large values."
              msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
                AkkaUtils.reservedSizeBytes)
              taskSet.abort(msg)
            } catch {
              case e: Exception => logError("Exception in error callback", e)
            }
          }
        }
        else {
          // Deduct the task's cores from the executor's free cores before sending the task.
          val executorData = executorDataMap(task.executorId)
          executorData.freeCores -= scheduler.CPUS_PER_TASK
     <span style="color:#ff0000;">     executorData.executorActor ! LaunchTask(new SerializableBuffer(serializedTask))// dispatch the task </span>
        }
      }
    }</span>

至于在executor中是怎么运行的，一句话就是反序列化rdd，进行计算

当一个task运行完成的时候：

Task执行是通过TaskRunner来运行，它需要通过ExecutorBackend和Driver通信，通信消息是StatusUpdate：

1、Task运行之前，告诉Driver当前Task的状态为TaskState.RUNNING。

2、Task运行之后，告诉Driver当前Task的状态为TaskState.FINISHED，并返回计算结果。

3、如果Task运行过程中发生错误，告诉Driver当前Task的状态为TaskState.FAILED，并返回错误原因。

4、如果Task在中途被Kill掉了，告诉Driver当前Task的状态为TaskState.KILLED。

下面讲的是运行成功的状态，图太大了，所以插到了最后面。

1、Task运行结束之后，调用ExecutorBackend的statusUpdate方法，把结果返回。结果超过10M，就把结果保存在blockManager处，返回blockId，需要的时候通过blockId到blockManager认领。

2、ExecutorBackend直接向Driver发送StatusUpdate返回Task的信息。

3、Driver（这里具体指的是SchedulerBackend）接收到StatusUpdate消息之后，调用TaskScheduler的statusUpdate方法，然后准备给ExecutorBackend发送下一批Task。

4、TaskScheduler通过TaskId找到管理这个Task的TaskSetManager（负责管理一批Task的类），从TaskSetManager里面删掉这个Task，并把Task插入到TaskResultGetter（负责获取Task结果的类）的成功队列里。

5、TaskResultGetter获取到结果之后，调用TaskScheduler的handleSuccessfulTask方法把结果返回。

6、TaskScheduler调用TaskSetManager的handleSuccessfulTask方法，处理成功的Task。

7、TaskSetManager调用DAGScheduler的taskEnded方法，告诉DAGScheduler这个Task运行结束了，如果这个时候Task全部成功了，就会结束TaskSetManager。

8、DAGScheduler在taskEnded方法里触发CompletionEvent事件，CompletionEvent分ResultTask和ShuffleMapTask来处理。

再来：

<span style="font-size:14px;">  private[scheduler] def handleTaskCompletion(event: CompletionEvent) {
    val task = event.task
    val stageId = task.stageId
    val taskType = Utils.getFormattedClassName(task)

    // The success case is dealt with separately below, since we need to compute accumulator
    // updates before posting.
    if (event.reason != Success) {
      val attemptId = stageIdToStage.get(task.stageId).map(_.latestInfo.attemptId).getOrElse(-1)
      listenerBus.post(SparkListenerTaskEnd(stageId, attemptId, taskType, event.reason,event.taskInfo, event.taskMetrics))
    }

    if (!stageIdToStage.contains(task.stageId)) {
      // Skip all the actions if the stage has been cancelled.
      return
    }
    val stage = stageIdToStage(task.stageId)

    def markStageAsFinished(stage: Stage, errorMessage: Option[String] = None) = {
      val serviceTime = stage.latestInfo.submissionTime match {
        case Some(t) => "%.03f".format((clock.getTime() - t) / 1000.0)
        case _ => "Unknown"
      }
      if (errorMessage.isEmpty) {
        logInfo("%s (%s) finished in %s s".format(stage, stage.name, serviceTime))
        stage.latestInfo.completionTime = Some(clock.getTime())
      } else {
        stage.latestInfo.stageFailed(errorMessage.get)
        logInfo("%s (%s) failed in %s s".format(stage, stage.name, serviceTime))
      }
      listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
      runningStages -= stage
    }


    event.reason match {
      case Success =>
        if (event.accumUpdates != null) {
          try {
            Accumulators.add(event.accumUpdates)
            event.accumUpdates.foreach { case (id, partialValue) =>
              val acc = Accumulators.originals(id).asInstanceOf[Accumulable[Any, Any]]
              // To avoid UI cruft, ignore cases where value wasn't updated
              if (acc.name.isDefined && partialValue != acc.zero) {
                val name = acc.name.get
                val stringPartialValue = Accumulators.stringifyPartialValue(partialValue)
                val stringValue = Accumulators.stringifyValue(acc.value)
                stage.latestInfo.accumulables(id) = AccumulableInfo(id, name, stringValue)
                event.taskInfo.accumulables +=
                  AccumulableInfo(id, name, Some(stringPartialValue), stringValue)
              }
            }
          } catch {
            // If we see an exception during accumulator update, just log the error and move on.
            case e: Exception =>
              logError(s"Failed to update accumulators for $task", e)
          }
        }
        listenerBus.post(SparkListenerTaskEnd(stageId, stage.latestInfo.attemptId, taskType,
          event.reason, event.taskInfo, event.taskMetrics))
        stage.pendingTasks -= task
        task match {
          case rt: ResultTask[_, _] =>
            stage.resultOfJob match {
              case Some(job) =>
                if (!job.finished(rt.outputId)) {
                  job.finished(rt.outputId) = true
                  job.numFinished += 1
                  // If the whole job has finished, remove it
                  if (job.numFinished == job.numPartitions) {
                    markStageAsFinished(stage)
                    cleanupStateForJobAndIndependentStages(job)
                    listenerBus.post(SparkListenerJobEnd(job.jobId, JobSucceeded))
                  }

                  // taskSucceeded runs some user code that might throw an exception. Make sure
                  // we are resilient against that.
                  try {
                    job.listener.taskSucceeded(rt.outputId, event.result)
                  } catch {
                    case e: Exception =>
                      // TODO: Perhaps we want to mark the stage as failed?
                      job.listener.jobFailed(new SparkDriverExecutionException(e))
                  }
                }
              case None =>
                logInfo("Ignoring result from " + rt + " because its job has finished")
            }

          case smt: ShuffleMapTask =>
            val status = event.result.asInstanceOf[MapStatus]
            val execId = status.location.executorId
            logDebug("ShuffleMapTask finished on " + execId)
            if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
              logInfo("Ignoring possibly bogus ShuffleMapTask completion from " + execId)
            } else {
              stage.addOutputLoc(smt.partitionId, status)
            }
            if (runningStages.contains(stage) && stage.pendingTasks.isEmpty) {
              markStageAsFinished(stage)
              logInfo("looking for newly runnable stages")
              logInfo("running: " + runningStages)
              logInfo("waiting: " + waitingStages)
              logInfo("failed: " + failedStages)
              if (stage.shuffleDep.isDefined) {
                // We supply true to increment the epoch number here in case this is a
                // recomputation of the map outputs. In that case, some nodes may have cached
                // locations with holes (from when we detected the error) and will need the
                // epoch incremented to refetch them.
                // TODO: Only increment the epoch number if this is not the first time
                //       we registered these map outputs.
                mapOutputTracker.registerMapOutputs(
                  stage.shuffleDep.get.shuffleId,
                  stage.outputLocs.map(list => if (list.isEmpty) null else list.head).toArray,
                  changeEpoch = true)
              }
              clearCacheLocs()
              if (stage.outputLocs.exists(_ == Nil)) {
                // Some tasks had failed; let's resubmit this stage
                // TODO: Lower-level scheduler should also deal with this
                logInfo("Resubmitting " + stage + " (" + stage.name +
                  ") because some of its tasks had failed: " +
                  stage.outputLocs.zipWithIndex.filter(_._1 == Nil).map(_._2).mkString(", "))
                submitStage(stage)
              } else {
                val newlyRunnable = new ArrayBuffer[Stage]
                for (stage <- waitingStages) {
                  logInfo("Missing parents for " + stage + ": " + getMissingParentStages(stage))
                }
                for (stage <- waitingStages if getMissingParentStages(stage) == Nil) {
                  newlyRunnable += stage
                }
                waitingStages --= newlyRunnable
                runningStages ++= newlyRunnable
                for {
                  stage <- newlyRunnable.sortBy(_.id)
                  jobId <- activeJobForStage(stage)
                } {
           <span style="color:#ff0000;">       logInfo("Submitting " + stage + " (" + stage.rdd + "), which is now runnable")
                  submitMissingTasks(stage, jobId)//</span>
                }
              }
            }
          }</span>

【上篇】tomcat原理及实现—–tomcat模块DIY（第一篇）
【下篇】spark作业的生命周期——-就是这个意思

作者: retainer

该日志由 retainer 于5年前发表在综合分类下，最后更新于 2019年05月23日.
转载请注明: spark源码研究—作业提交 | 学步园 +复制链接

抱歉!评论已关闭.

学步园

spark源码研究—作业提交

作者: retainer

书签

最新文章New

本站推荐

返回首页