spark原始碼分析, 任務反序列化及執行

阿新 • • 發佈：2020-08-24

1. ==> 接收訊息, org.apache.spark.executor.CoarseGrainedExecutorBackend#receive

    // Handles a LaunchTask message: decodes the task description carried by the message
    // and hands it to the executor.
    case LaunchTask(data) =>
      if (executor != null) {
        val description = TaskDescription.decode(data.value)
        logInfo(s"Got assigned task ${description.taskId}")
        executor.launchTask(this, description)
      } else {
        // No executor is available to run the task, so exit with code 1.
        exitExecutor(1, "Received LaunchTask command but executor was null")
      }

2. ==>org.apache.spark.executor.Executor#launchTask

  // Running tasks, keyed by task id.
  private val runningTasks = new ConcurrentHashMap[Long, TaskRunner]

  /**
   * Wraps the task description in a TaskRunner, records the runner in `runningTasks`
   * under the task id, and submits it to the thread pool.
   *
   * @param context         backend passed through to the TaskRunner
   * @param taskDescription description of the task to launch
   */
  def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = {
    val runner = new TaskRunner(context, taskDescription)
    runningTasks.put(taskDescription.taskId, runner)
    threadPool.execute(runner)
  }

3. ==>org.apache.spark.executor.Executor.TaskRunner#run

/**
 * Runs one task on the current thread (abridged excerpt of TaskRunner#run).
 *
 * Steps visible in this block: record the thread id and set the thread name, fetch the
 * task's file/JAR dependencies, deserialize the task, run it, serialize its result, and
 * report TaskState.FINISHED to the executor backend together with the serialized result.
 *
 * NOTE(review): several names used here (ser, env, task, taskId, taskName, threadName,
 * threwException, maxResultSize, maxDirectResultSize, execBackend) are not declared in
 * this excerpt; presumably they are members of the enclosing classes -- confirm against
 * the full source.
 */
override def run(): Unit = {
      threadId = Thread.currentThread.getId
      Thread.currentThread.setName(threadName)
      val threadMXBean  
= ManagementFactory.getThreadMXBean
      val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId)
  
// Download the dependencies (files and JARs) listed in the task description.
        updateDependencies(taskDescription.addedFiles, taskDescription.addedJars)
// Deserialize the task description's payload to obtain the actual task.
        task = ser.deserialize[Task[Any]](taskDescription.serializedTask, Thread.currentThread.getContextClassLoader)
        task.localProperties = taskDescription.properties
        task.setTaskMemoryManager(taskMemoryManager)

      
    // Run the task; the second block releases the task's block-manager locks and cleans up
    // its allocated memory.
    val value = Utils.tryWithSafeFinally {
              val res = task.run(
                taskAttemptId = taskId,
                attemptNumber = taskDescription.attemptNumber,
                metricsSystem = env.metricsSystem)
              // Reached only when task.run returned without throwing.
              threwException = false
              res
            } {
              val releasedLocks = env.blockManager.releaseAllLocksForTask(taskId)
              val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
            }
    // Process the execution result: serialize the value, recording timestamps around it.
    val resultSer = env.serializer.newInstance()
    val beforeSerialization = System.currentTimeMillis()
    val valueBytes = resultSer.serialize(value)
    val afterSerialization = System.currentTimeMillis()       

    // Note: accumulator updates must be collected after TaskMetrics is updated
    val accumUpdates = task.collectAccumulatorUpdates()
    // TODO: do not serialize value twice
    val directResult = new DirectTaskResult(valueBytes, accumUpdates)
    val serializedDirectResult = ser.serialize(directResult)
    val resultSize = serializedDirectResult.limit()

    // directSend = sending directly back to the driver
    val serializedResult: ByteBuffer = {
      if (maxResultSize > 0 && resultSize > maxResultSize) {
        // Larger than maxResultSize: log a warning and send only an IndirectTaskResult
        // (block id + size); no bytes are stored in the block manager in this branch.
        logWarning(s"Finished $taskName (TID $taskId). Result is larger than maxResultSize " +
          s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " +
          s"dropping it.")
        ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize))
      } else if (resultSize > maxDirectResultSize) {
        // Larger than maxDirectResultSize: store the bytes in the block manager and send an
        // IndirectTaskResult that references the stored block.
        val blockId = TaskResultBlockId(taskId)
        env.blockManager.putBytes(
          blockId,
          new ChunkedByteBuffer(serializedDirectResult.duplicate()),
          StorageLevel.MEMORY_AND_DISK_SER)
        logInfo(
          s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")
        ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))
      } else {
        // Otherwise the serialized DirectTaskResult itself is sent.
        logInfo(s"Finished $taskName (TID $taskId). $resultSize bytes result sent to driver")
        serializedDirectResult
      }
    }

    // Mark the task finished, then report FINISHED with the serialized result to the backend.
    setTaskFinishedAndClearInterruptStatus()
    execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)
}

==>org.apache.spark.executor.Executor#updateDependencies

  /**
   * Download any missing dependencies if we receive a new set of files and JARs from the
   * SparkContext. Also adds any new JARs we fetched to the class loader.
   *
   * An entry is fetched only when its timestamp is newer than the one already recorded in
   * `currentFiles` / `currentJars` (a missing entry counts as timestamp -1).
   *
   * @param newFiles file name mapped to its timestamp
   * @param newJars  JAR name mapped to its timestamp
   */
  private def updateDependencies(
      newFiles: Map[String, Long],
      newJars: Map[String, Long]): Unit = {
    // Lazy so the Hadoop configuration is only built if something actually has to be fetched.
    lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
    synchronized {
      // Fetch missing dependencies
      for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
        logInfo(s"Fetching $name with timestamp $timestamp")
        // Fetch file with useCache mode, close cache for local mode.
        Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf,
          env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
        currentFiles(name) = timestamp
      }
      for ((name, timestamp) <- newJars) {
        // Last path segment of the JAR's URI; the recorded timestamp is looked up under the
        // full name first and under this local name second.
        val localName = new URI(name).getPath.split("/").last
        val currentTimeStamp = currentJars.get(name)
          .orElse(currentJars.get(localName))
          .getOrElse(-1L)
        if (currentTimeStamp < timestamp) {
          logInfo(s"Fetching $name with timestamp $timestamp")
          // Fetch file with useCache mode, close cache for local mode.
          Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf,
            env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
          currentJars(name) = timestamp
          // Add it to our class loader
          val url = new File(SparkFiles.getRootDirectory(), localName).toURI.toURL
          if (!urlClassLoader.getURLs().contains(url)) {
            logInfo(s"Adding $url to class loader")
            urlClassLoader.addURL(url)
          }
        }
      }
    }
  }

==>org.apache.spark.scheduler.Task#run

 /**
  * Sets up the task's context and then executes the task via `runTask`
  * (abridged excerpt of Task#run).
  *
  * NOTE(review): the `try` below has no `catch`/`finally` in this excerpt; the original
  * source presumably has cleanup there -- confirm against the full source.
  *
  * @param taskAttemptId id of this task attempt; registered with the block manager and
  *                      passed to the task context
  * @param attemptNumber attempt number passed to the task context
  * @param metricsSystem metrics system passed to the task context
  * @return the value produced by `runTask`
  */
 final def run(
      taskAttemptId: Long,
      attemptNumber: Int,
      metricsSystem: MetricsSystem): T = {
    // Register this task attempt with the block manager.
    SparkEnv.get.blockManager.registerTask(taskAttemptId)


    val taskContext = new TaskContextImpl(
      stageId,
      stageAttemptId, // stageAttemptId and stageAttemptNumber are semantically equal
      partitionId,
      taskAttemptId,
      attemptNumber,
      taskMemoryManager,
      localProperties,
      metricsSystem,
      metrics)

    // Barrier tasks get the context wrapped in a BarrierTaskContext.
    context = if (isBarrier) {
      new BarrierTaskContext(taskContext)
    } else {
      taskContext
    }

    TaskContext.setTaskContext(context)
    taskThread = Thread.currentThread()

    // If a kill reason is already set, invoke kill without interrupting the thread.
    if (_reasonIfKilled != null) {
      kill(interruptThread = false, _reasonIfKilled)
    }

    // Build a caller context tagged "TASK" from the app/job/stage/task ids and set it as current.
    new CallerContext(
      "TASK",
      SparkEnv.get.conf.get(APP_CALLER_CONTEXT),
      appId,
      appAttemptId,
      jobId,
      Option(stageId),
      Option(stageAttemptId),
      Option(taskAttemptId),
      Option(attemptNumber)).setCurrentContext()

    try {
    // There are two kinds of task: ResultTask and ShuffleMapTask.
      runTask(context)
    } 
  }

spark原始碼分析, 任務反序列化及執行

1 ==> 接受訊息,org.apache.spark.executor.CoarseGrainedExecutorBackend#receive case LaunchTask(data) =>

spark原始碼分析, 任務提交及序列化

org.apache.spark.scheduler.DAGScheduler#submitMissingTasks 　　=> org.apache.spark.scheduler.TaskSchedulerImpl#submitTasks

【分散式】Zookeeper原始碼分析：Jute序列化

概要 Zookeeper的客戶端和服務端進行網路通訊實現資料傳輸使用了序列化元件Jute，它最初是Hadoop中預設的序列化元件（Record IO）中的序列化元件，後來Hadoop從0.21.0版本開始廢棄了Record IO，而使用Avro這個序列化

序列化與反序列化及初始drf

內容回顧 # 前後端開發模式 -混合開發(前後端不分離)--》bbs---》頁面渲染是使用模板語法(dtl,jsp-->模板語法,php)-->後端執行-->把頁面渲染成純粹的html，css，js---》直接返回給前端---》瀏覽器中展示

企業安全06-Apache Log4j Server 反序列化命令執行漏洞（CVE-2017-5645）

CVE-2017-5645 Apache Log4j Server 反序列化命令執行漏洞（CVE-2017-5645）一、漏洞原理 Apache Log4j是一個用於Java的日誌記錄庫，其支援啟動遠端日誌伺服器。Apache Log4j 2.8.2之前的2.x版本中存在安全漏洞。攻

CVE-2018-2628 Weblogic WLS Core Components 反序列化命令執行漏洞

0X00-引言今晚路燈很亮，好像上面少個人 0X01-環境搭建靶機：CentOS Linux 7 攻擊機：Windows Server 2016 && Kali

Typecho 反序列化漏洞分析及復現

0x00 漏洞簡介 CVE-2018-18753 漏洞概述： typecho 是一款非常簡潔快速部落格 CMS，前臺 install.php 檔案存在反序列化漏洞，通過構造的反序列化字串注入可以執行任意 PHP 程式碼。

序列化與反序列化、def的介紹與快速使用、cbv原始碼分析、APIView與request物件分析

今日內容概要序列化與反序列化 def介紹和快速使用 cbv原始碼流程分析 drf之APIView和Request物件分析

Java序列化反序列化原理及漏洞解決方案

Java序列化 Java 提供了一種物件序列化的機制，該機制中，一個物件可以被表示為一個位元組序列，該位元組序列包括該物件的資料、有關物件的型別的資訊和儲存在物件中資料的型別。

Flutter | Json自動反序列化——json_serializable（附原始碼）

https://www.jianshu.com/p/b307a377c5e8 前言 Google推出flutter這樣一個新的高效能跨平臺（Android，ios）快速開發框架之後，被業界許多開發者所關注。我在接觸了flutter之後發現這個確實是一個好東西，好東西當

JAVA反序列化之URLDNS鏈分析

前言從之前shiro、fastjson等反序列化漏洞剛曝出的時候，就接觸ysoserial的工具利用了，不過這麼久都沒好好去學習過其中的利用鏈，這次先從其中的一個可以說是最簡單的利用鏈URLDNS開始學起。

CVE-2020-9496 apache ofbiz xml-rpc反序列化漏洞分析

0x00 apache ofbiz介紹 OFBiz是一個非常著名的電子商務平臺，是一個非常著名的開源專案，提供了建立基於最新J2EE/XML規範和技術標準，構建大中型企業級、跨平臺、跨資料庫、跨應用伺服器的多層、分散式電子商務類WEB

CommonsCollections2 反序列化利用鏈分析

在 ysoserial中 commons-collections2 是用的 PriorityQueue readObject 作為反序列化的入口，那麼就來看一下 java.util.PriorityQueue.java 的 readObject方法

CommonsCollections3 反序列化利用鏈分析

InstantiateTransformer commons-collections 3.1 中有 InstantiateTransformer 這麼一個類，這個類也實現了 Transformer的transform方法，如下：

Yii框架反序列化RCE利用鏈分析

影響範圍 Yii2 < 2.0.38 測試版本 yii-basic-app-2.0.37.tgz 原理我這邊看了一下,是能夠看懂,但是我是菜雞,反序列化的鏈我構造不出來=。=

Apache Dubbo反序列化漏洞（CVE-2019-17564）復現分析

漏洞描述 Apache Dubbo是一款高效能Java RPC框架，核心功能是方便面向介面的遠端過程呼叫，叢集容錯和負載均衡，以及服務自動註冊與發現。

CVE-2020-26945 mybatis二級快取反序列化的分析與復現

0x01 簡介 MyBatis 本是Apache的一個開源專案iBatis, 2010年這個專案由Apache Software Foundation 遷移到了Google Code，並且改名為MyBatis。MyBatis是一款優秀的持久層框架，它支援定製化SQL、儲存過程以及高階對映

weblogic CVE-2020-2963、CNVD-2020-23019 反序列化漏洞分析與復現

簡介這兩個洞應該都是5月更新的補丁，分析時候無意中發現的。看了一下漏洞挺簡單，就是利用有點苛刻

【筆記】拉勾Java工程師高薪訓練營-第一階段開源框架原始碼解析-模組一持久層框架涉及實現及MyBatis原始碼分析-任務二：Mybatis基礎回顧及高階應用

以下筆記是我看完視訊之後總結整理的，部分較為基礎的知識點也做了補充，如有問題歡迎溝通。

【筆記】拉勾Java工程師高薪訓練營-第一階段開源框架原始碼解析-模組一持久層框架涉及實現及MyBatis原始碼分析-任務三：Mybatis原始碼剖析

以下筆記是我看完視訊之後總結整理的，部分較為基礎的知識點也做了補充，如有問題歡迎溝通。由於本任務大多為程式碼講解，相關內容在講義裡面都有，所以這裡基本上沒有什麼內容。網上有更多更好的材料可以參考學習，

spark原始碼分析, 任務反序列化及執行

相關推薦