• spark DiskBlockManager


    RDD本身persist可以是本地存储,本地存储级别的持久化实现方式如下:

    DiskBlockManager负责管理和维护block和磁盘存储的映射关系:以blockId作为文件名称,如果配置了多个目录,则通过blockId的hash值将文件分发到各个目录。

    包括创建目录,删除,读取文件,以及一些退出删除文件的机制。

    /**
      * Creates and maintains the logical mapping between logical blocks and physical on-disk
      * locations. One block is mapped to one file with a name given by its BlockId.
      *
      * Block files are hashed among the directories listed in spark.local.dir (or in
      * SPARK_LOCAL_DIRS, if it's set): the hash of the file name selects both the local
      * directory and the subdirectory inside it.
      *
      * @param conf              configuration used to resolve the local directories and the number
      *                          of subdirectories created inside each of them
      * @param deleteFilesOnStop whether the local directories are deleted recursively when this
      *                          manager is stopped
      */
    private[spark] class DiskBlockManager(conf: SparkConf, deleteFilesOnStop: Boolean) extends Logging {

      private[spark] val subDirsPerLocalDir: Int = conf.getInt("spark.diskStore.subDirectories", 64)

      /* Create one local directory for each path mentioned in spark.local.dir; then, inside this
       * directory, create multiple subdirectories that we will hash files into, in order to avoid
       * having really large inodes at the top level. */
      private[spark] val localDirs: Array[File] = createLocalDirs(conf)
      if (localDirs.isEmpty) {
        logError("Failed to create any local dir.")
        System.exit(ExecutorExitCode.DISK_STORE_FAILED_TO_CREATE_DIR)
      }
      // The content of subDirs is immutable but the content of subDirs(i) is mutable. And the content
      // of subDirs(i) is protected by the lock of subDirs(i)
      private val subDirs = Array.fill(localDirs.length)(new Array[File](subDirsPerLocalDir))

      private val shutdownHook = addShutdownHook()

      /**
        * Looks up a file by hashing it into one of our local subdirectories, creating the
        * subdirectory on first use.
        *
        * This method should be kept in sync with
        * org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getFile().
        *
        * @param filename name of the file; its hash picks the directory and subdirectory
        * @return the File handle for `filename` (the file itself is not created here)
        * @throws IOException if the subdirectory does not exist and cannot be created
        */
      def getFile(filename: String): File = {
        // Figure out which local directory it hashes to, and which subdirectory in that
        val hash = Utils.nonNegativeHash(filename)
        val dirId = hash % localDirs.length
        val subDirId = (hash / localDirs.length) % subDirsPerLocalDir

        // Create the subdirectory if it doesn't already exist. Slots of subDirs(dirId) start out
        // as null and are filled in lazily under the lock of subDirs(dirId).
        val subDir = subDirs(dirId).synchronized {
          val old = subDirs(dirId)(subDirId)
          if (old != null) {
            old
          } else {
            val newDir = new File(localDirs(dirId), "%02x".format(subDirId))
            if (!newDir.exists() && !newDir.mkdir()) {
              throw new IOException(s"Failed to create local dir in $newDir.")
            }
            subDirs(dirId)(subDirId) = newDir
            newDir
          }
        }

        new File(subDir, filename)
      }

      /** Looks up the file for a block, using the block's name as the file name. */
      def getFile(blockId: BlockId): File = getFile(blockId.name)

      /** Check if disk block manager has a block. */
      def containsBlock(blockId: BlockId): Boolean = getFile(blockId).exists()

      /** List all the files currently stored on disk by the disk manager. */
      def getAllFiles(): Seq[File] = {
        // Get all the files inside the array of array of directories
        subDirs.flatMap { dir =>
          dir.synchronized {
            // Copy the content of dir because it may be modified in other threads
            dir.clone()
          }
        }.filter(_ != null).flatMap { dir =>
          // File.listFiles() returns null if dir is not a directory or an I/O error occurs.
          Option(dir.listFiles()).map(_.toSeq).getOrElse(Seq.empty[File])
        }
      }

      /** List all the blocks currently stored on disk by the disk manager. */
      def getAllBlocks(): Seq[BlockId] = {
        getAllFiles().map(f => BlockId(f.getName))
      }

      /** Produces a unique block id and File suitable for storing local intermediate results. */
      def createTempLocalBlock(): (TempLocalBlockId, File) = {
        var blockId = new TempLocalBlockId(UUID.randomUUID())
        // Retry with a fresh UUID until the id maps to a file that does not exist yet.
        while (getFile(blockId).exists()) {
          blockId = new TempLocalBlockId(UUID.randomUUID())
        }
        (blockId, getFile(blockId))
      }

      /** Produces a unique block id and File suitable for storing shuffled intermediate results. */
      def createTempShuffleBlock(): (TempShuffleBlockId, File) = {
        var blockId = new TempShuffleBlockId(UUID.randomUUID())
        // Retry with a fresh UUID until the id maps to a file that does not exist yet.
        while (getFile(blockId).exists()) {
          blockId = new TempShuffleBlockId(UUID.randomUUID())
        }
        (blockId, getFile(blockId))
      }

      /**
        * Create local directories for storing block data. These directories are
        * located inside configured local directories and won't
        * be deleted on JVM exit when using the external shuffle service.
        *
        * A "blockmgr" directory is created inside every configured root directory; roots where
        * creation fails with an IOException are logged and skipped.
        */
      private def createLocalDirs(conf: SparkConf): Array[File] = {
        Utils.getConfiguredLocalDirs(conf).flatMap { rootDir =>
          try {
            val localDir = Utils.createDirectory(rootDir, "blockmgr")
            logInfo(s"Created local directory at $localDir")
            Some(localDir)
          } catch {
            case e: IOException =>
              logError(s"Failed to create local dir in $rootDir. Ignoring this directory.", e)
              None
          }
        }
      }

      /** Registers a shutdown hook that runs doStop(), and returns the hook's handle. */
      private def addShutdownHook(): AnyRef = {
        logDebug("Adding shutdown hook") // force eager creation of logger
        ShutdownHookManager.addShutdownHook(ShutdownHookManager.TEMP_DIR_SHUTDOWN_PRIORITY + 1) { () =>
          logInfo("Shutdown hook called")
          DiskBlockManager.this.doStop()
        }
      }

      /** Removes the shutdown hook and cleans up local dirs. */
      private[spark] def stop(): Unit = {
        // Remove the shutdown hook.  It causes memory leaks if we leave it around.
        try {
          ShutdownHookManager.removeShutdownHook(shutdownHook)
        } catch {
          case e: Exception =>
            logError(s"Exception while removing shutdown hook.", e)
        }
        doStop()
      }

      /**
        * Recursively deletes the local directories when deleteFilesOnStop is set. Directories that
        * ShutdownHookManager reports as already covered by a shutdown-delete root are skipped, and
        * a failure on one directory is logged without aborting the others.
        */
      private def doStop(): Unit = {
        if (deleteFilesOnStop) {
          localDirs.foreach { localDir =>
            // isDirectory() is true only for an existing directory, so no separate exists() check.
            if (localDir.isDirectory()) {
              try {
                if (!ShutdownHookManager.hasRootAsShutdownDeleteDir(localDir)) {
                  Utils.deleteRecursively(localDir)
                }
              } catch {
                case e: Exception =>
                  logError(s"Exception while deleting local spark dir: $localDir", e)
              }
            }
          }
        }
      }
    }

    具体调用句柄在DiskStore中,调用put方法,将指定的block写到本地。

    /**
      * Stores blocks as files on local disk. The file for each block is resolved through the
      * given DiskBlockManager.
      *
      * @param conf        configuration; supplies "spark.storage.memoryMapThreshold"
      * @param diskManager resolves a block id to its on-disk file
      */
    private[spark] class DiskStore(conf: SparkConf, diskManager: DiskBlockManager) extends Logging {

      // Files shorter than this are read fully into a buffer; larger ones are memory-mapped.
      private val minMemoryMapBytes = conf.getSizeAsBytes("spark.storage.memoryMapThreshold", "2m")

      /** Returns the length in bytes of the file backing the block. */
      def getSize(blockId: BlockId): Long = {
        diskManager.getFile(blockId.name).length
      }

      /**
        * Invokes the provided callback function to write the specific block to disk.
        *
        * If the callback throws, the partially written file is removed before the exception
        * propagates.
        *
        * @param blockId   id of the block to write; also determines the file name
        * @param writeFunc callback that writes the block's content to the given stream
        * @throws IllegalStateException if the block already exists in the disk store.
        */
      def put(blockId: BlockId)(writeFunc: FileOutputStream => Unit): Unit = {
        if (contains(blockId)) {
          throw new IllegalStateException(s"Block $blockId is already present in the disk store")
        }
        logDebug(s"Attempting to put block $blockId")
        val startTime = System.currentTimeMillis
        // Resolve the block's file (named after the block id); this may create its subdirectory.
        val file = diskManager.getFile(blockId)
        val fileOutputStream = new FileOutputStream(file)
        // Stays true unless writeFunc returns normally.
        var threwException: Boolean = true
        try {
          writeFunc(fileOutputStream)
          threwException = false
        } finally {
          try {
            Closeables.close(fileOutputStream, threwException)
          } finally {
            if (threwException) {
              remove(blockId)
            }
          }
        }
        val finishTime = System.currentTimeMillis
        logDebug("Block %s stored as %s file on disk in %d ms".format(
          file.getName,
          Utils.bytesToString(file.length()),
          finishTime - startTime))
      }

      /** Writes the whole content of `bytes` to disk as the given block. */
      def putBytes(blockId: BlockId, bytes: ChunkedByteBuffer): Unit = {
        put(blockId) { fileOutputStream =>
          val channel = fileOutputStream.getChannel
          Utils.tryWithSafeFinally {
            bytes.writeFully(channel)
          } {
            channel.close()
          }
        }
      }

      /**
        * Reads the data of the given block from disk.
        *
        * Files shorter than minMemoryMapBytes are read completely into a newly allocated buffer;
        * larger files are memory-mapped read-only.
        *
        * @throws IOException if end of file is reached before the buffer has been filled
        */
      def getBytes(blockId: BlockId): ChunkedByteBuffer = {
        val file = diskManager.getFile(blockId.name)
        val channel = new RandomAccessFile(file, "r").getChannel
        Utils.tryWithSafeFinally {
          // For small files, directly read rather than memory map
          if (file.length < minMemoryMapBytes) {
            val buf = ByteBuffer.allocate(file.length.toInt)
            channel.position(0)
            while (buf.remaining() != 0) {
              if (channel.read(buf) == -1) {
                throw new IOException("Reached EOF before filling buffer\n" +
                  s"offset=0\nfile=${file.getAbsolutePath}\nbuf.remaining=${buf.remaining}")
              }
            }
            buf.flip()
            new ChunkedByteBuffer(buf)
          } else {
            new ChunkedByteBuffer(channel.map(MapMode.READ_ONLY, 0, file.length))
          }
        } {
          channel.close()
        }
      }

      /**
        * Deletes the file backing the given block.
        *
        * @return true if the file existed and was deleted; false if it did not exist or the
        *         deletion failed (a failed deletion is also logged as a warning)
        */
      def remove(blockId: BlockId): Boolean = {
        val file = diskManager.getFile(blockId.name)
        if (file.exists()) {
          val ret = file.delete()
          if (!ret) {
            logWarning(s"Error deleting ${file.getPath()}")
          }
          ret
        } else {
          false
        }
      }

      /** Returns whether a file for the given block exists on disk. */
      def contains(blockId: BlockId): Boolean = {
        val file = diskManager.getFile(blockId.name)
        file.exists()
      }
    }
  • 相关阅读:
    点击cell后 cell的背景不变,cell上的字体颜色发生改变的功能实现
    各种属性设置
    多列表 ,菜单
    正则表达式
    多个storyboard之间的跳转问题
    关于uicollectionview的个人学习
    uiscrollview的自动布局
    手动自动布局
    关于简单的跳转问题
    深入理解@class和#import的区别
  • 原文地址:https://www.cnblogs.com/fantiantian/p/9493154.html
Copyright © 2020-2023  润新知