• Flink – JobManager.submitJob


    The JobManager is an Akka actor; a job submission arrives as a SubmitJob message:

      case SubmitJob(jobGraph, listeningBehaviour) =>
          val client = sender()
    
          val jobInfo = new JobInfo(client, listeningBehaviour, System.currentTimeMillis(),
            jobGraph.getSessionTimeout)
    
          submitJob(jobGraph, jobInfo)

     

    submitJob does three things:

    Build the ExecutionGraph from the JobGraph
    Restore CheckpointedState or a Savepoint, if there is one
    Submit the ExecutionGraph to the Scheduler for scheduling

     

    ExecutionGraph

    executionGraph = ExecutionGraphBuilder.buildGraph(
      executionGraph, //currentJobs.get(jobGraph.getJobID), the existing ExecutionGraph for this job id, if any
      jobGraph,
      flinkConfiguration, //configuration
      futureExecutor, //Executors.newFixedThreadPool(numberProcessors, new NamedThreadFactory("jobmanager-future-", "-thread-")), a thread pool sized by the number of CPU cores
      ioExecutor, // Executors.newFixedThreadPool(numberProcessors, new NamedThreadFactory("jobmanager-io-", "-thread-"))
      userCodeLoader,  //libraryCacheManager.getClassLoader(jobGraph.getJobID), loads user code from the job's jars
      checkpointRecoveryFactory, //used for createCheckpointStore and createCheckpointIDCounter; comes in standalone and ZooKeeper flavors
      Time.of(timeout.length, timeout.unit),
      restartStrategy, //the job restart strategy
      jobMetrics,
      numSlots, //scheduler.getTotalNumberOfSlots(), the total number of slots across all instances registered with this JobManager
      log.logger)

     

    ExecutionGraphBuilder.buildGraph

     

    New ExecutionGraph

            // create a new execution graph, if none exists so far
            final ExecutionGraph executionGraph;
    
            try {
                executionGraph = (prior != null) ? prior :
                        new ExecutionGraph(
                            futureExecutor,
                            ioExecutor,
                            jobId,
                            jobName,
                            jobGraph.getJobConfiguration(),
                            jobGraph.getSerializedExecutionConfig(),
                            timeout,
                            restartStrategy,
                            jobGraph.getUserJarBlobKeys(),
                            jobGraph.getClasspaths(),
                            classLoader,
                            metrics);
            } catch (IOException e) {
                throw new JobException("Could not create the execution graph.", e);
            }

     

    attachJobGraph, which creates the graph's vertices and edges

            // topologically sort the job vertices and attach the graph to the existing one
            List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
            executionGraph.attachJobGraph(sortedTopology);

     

    ExecutionGraph.attachJobGraph

           for (JobVertex jobVertex : topologiallySorted) {
    
                // create the execution job vertex and attach it to the graph
                ExecutionJobVertex ejv =
                        new ExecutionJobVertex(this, jobVertex, 1, timeout, createTimestamp);
                ejv.connectToPredecessors(this.intermediateResults);
    
                //All job vertices that are part of this graph, ConcurrentHashMap<JobVertexID, ExecutionJobVertex> tasks
                ExecutionJobVertex previousTask = this.tasks.putIfAbsent(jobVertex.getID(), ejv);
    
                for (IntermediateResult res : ejv.getProducedDataSets()) {
                    //All intermediate results that are part of this graph
                    //ConcurrentHashMap<IntermediateDataSetID, IntermediateResult> intermediateResults
                    IntermediateResult previousDataSet = this.intermediateResults.putIfAbsent(res.getId(), res);
                }
    
                //All vertices, in the order in which they were created
                //List<ExecutionJobVertex> verticesInCreationOrder
                this.verticesInCreationOrder.add(ejv);
            }

    Each JobVertex is wrapped into an ExecutionJobVertex.

    This creates, in order: ExecutionJobVertex, ExecutionVertex, Execution; IntermediateResult, IntermediateResultPartition (see the sketch below).
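
    To make the cardinalities concrete: an ExecutionJobVertex with parallelism N holds N ExecutionVertex instances (each owning one current Execution attempt), and every IntermediateResult it produces is split into N IntermediateResultPartitions. A toy model in plain Java (the names mirror Flink's classes but are just strings here, not Flink code):

    // Toy model of the fan-out performed by the ExecutionJobVertex constructor below.
    import java.util.ArrayList;
    import java.util.List;

    public class FanOutSketch {
        public static void main(String[] args) {
            int parallelism = 3; // jobVertex.getParallelism()

            // one ExecutionJobVertex -> N ExecutionVertex, each with Execution attempt 0
            List<String> executionVertices = new ArrayList<>();
            for (int i = 0; i < parallelism; i++) {
                executionVertices.add("ExecutionVertex (" + (i + 1) + "/" + parallelism + ") -> Execution(attempt=0)");
            }

            // one IntermediateResult -> N IntermediateResultPartitions, one per subtask
            List<String> partitions = new ArrayList<>();
            for (int i = 0; i < parallelism; i++) {
                partitions.add("IntermediateResult.partition[" + i + "]");
            }

            executionVertices.forEach(System.out::println);
            partitions.forEach(System.out::println);
        }
    }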

     

    ExecutionJobVertex

    public ExecutionJobVertex(
            ExecutionGraph graph,
            JobVertex jobVertex,
            int defaultParallelism,
            Time timeout,
            long createTimestamp) throws JobException {
    
            if (graph == null || jobVertex == null) {
                throw new NullPointerException();
            }
            
            //the parallelism determines how many ExecutionVertex instances are created
            int vertexParallelism = jobVertex.getParallelism();
            int numTaskVertices = vertexParallelism > 0 ? vertexParallelism : defaultParallelism;
    
            //the ExecutionVertex array
            this.taskVertices = new ExecutionVertex[numTaskVertices];
            
            this.inputs = new ArrayList<>(jobVertex.getInputs().size());
            
            // take the sharing group
            this.slotSharingGroup = jobVertex.getSlotSharingGroup();
            this.coLocationGroup = jobVertex.getCoLocationGroup();
            
            // create the intermediate results
            this.producedDataSets = new IntermediateResult[jobVertex.getNumberOfProducedIntermediateDataSets()]; //the IntermediateResults that will hold this vertex's intermediate output
    
            for (int i = 0; i < jobVertex.getProducedDataSets().size(); i++) {
                final IntermediateDataSet result = jobVertex.getProducedDataSets().get(i);
    
                this.producedDataSets[i] = new IntermediateResult( //wrap the JobGraph's IntermediateDataSet into an IntermediateResult
                        result.getId(),
                        this,
                        numTaskVertices,
                        result.getResultType());
            }
    
            // create all task vertices
            for (int i = 0; i < numTaskVertices; i++) {
                ExecutionVertex vertex = new ExecutionVertex( //create the ExecutionVertex
                        this, i, this.producedDataSets, timeout, createTimestamp, maxPriorAttemptsHistoryLength);
    
                this.taskVertices[i] = vertex;
            }
            
            finishedSubtasks = new boolean[parallelism];
        }

     

    ExecutionVertex

          public ExecutionVertex(
                ExecutionJobVertex jobVertex,
                int subTaskIndex, //index of this subtask; each subtask corresponds to one ExecutionVertex
                IntermediateResult[] producedDataSets,
                Time timeout,
                long createTimestamp,
                int maxPriorExecutionHistoryLength) {
    
            this.jobVertex = jobVertex;
            this.subTaskIndex = subTaskIndex;
            this.taskNameWithSubtask = String.format("%s (%d/%d)",
                    jobVertex.getJobVertex().getName(), subTaskIndex + 1, jobVertex.getParallelism());
    
            this.resultPartitions = new LinkedHashMap<IntermediateResultPartitionID, IntermediateResultPartition>(producedDataSets.length, 1); //holds this subtask's IntermediateResultPartitions
    
            for (IntermediateResult result : producedDataSets) {
                IntermediateResultPartition irp = new IntermediateResultPartition(result, this, subTaskIndex); //create this subtask's partition of the result
                result.setPartition(subTaskIndex, irp);
    
                resultPartitions.put(irp.getPartitionId(), irp);
            }
    
            this.inputEdges = new ExecutionEdge[jobVertex.getJobVertex().getInputs().size()][];
    
            this.priorExecutions = new EvictingBoundedList<>(maxPriorExecutionHistoryLength);
    
            this.currentExecution = new Execution( //create the Execution (attempt 0)
                getExecutionGraph().getFutureExecutor(),
                this,
                0,
                createTimestamp,
                timeout);
    
            this.timeout = timeout;
        }

     

    connectToPredecessors, which wires the vertices together with ExecutionEdges

        public void connectToPredecessors(Map<IntermediateDataSetID, IntermediateResult> intermediateDataSets) throws JobException {
            
            List<JobEdge> inputs = jobVertex.getInputs(); //the inputs of the JobVertex
            
            for (int num = 0; num < inputs.size(); num++) {
                JobEdge edge = inputs.get(num); //the corresponding JobEdge
                
                IntermediateResult ires = intermediateDataSets.get(edge.getSourceId()); //look up the IntermediateResult that is this JobEdge's source
                
                this.inputs.add(ires); //List<IntermediateResult> inputs
                
                int consumerIndex = ires.registerConsumer(); //register this vertex as a consumer of every IntermediateResultPartition of the IntermediateResult
                
                for (int i = 0; i < parallelism; i++) {
                    ExecutionVertex ev = taskVertices[i];
                    ev.connectSource(num, ires, edge, consumerIndex); //build ExecutionEdges from each ExecutionVertex to the concrete IntermediateResultPartitions
                }
            }
        }

    connectSource

    public void connectSource(int inputNumber, IntermediateResult source, JobEdge edge, int consumerNumber) {
    
        final DistributionPattern pattern = edge.getDistributionPattern(); // the edge's distribution pattern
        final IntermediateResultPartition[] sourcePartitions = source.getPartitions(); // the source's partitions
        
        ExecutionEdge[] edges;
        
        switch (pattern) {
            case POINTWISE:
                edges = connectPointwise(sourcePartitions, inputNumber);
                break;
            
            case ALL_TO_ALL:
                edges = connectAllToAll(sourcePartitions, inputNumber);
                break;
            
            default:
                throw new RuntimeException("Unrecognized distribution pattern.");
            
        }
        
        this.inputEdges[inputNumber] = edges;
        
        // add the consumers to the source
        // for now (until the receiver initiated handshake is in place), we need to register the 
        // edges as the execution graph
        for (ExecutionEdge ee : edges) {
            ee.getSource().addConsumer(ee, consumerNumber);
        }
    }
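
    The ALL_TO_ALL branch (used for redistributing connections such as keyBy or rebalance) is trivial: the consuming subtask gets one ExecutionEdge per source partition. A sketch of connectAllToAll, reconstructed to match the structure above (treat the exact shape as approximate):

    private ExecutionEdge[] connectAllToAll(IntermediateResultPartition[] sourcePartitions, int inputNumber) {
        ExecutionEdge[] edges = new ExecutionEdge[sourcePartitions.length];

        // one edge from this subtask to every partition of the source result
        for (int i = 0; i < sourcePartitions.length; i++) {
            edges[i] = new ExecutionEdge(sourcePartitions[i], this, inputNumber);
        }

        return edges;
    }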

    POINTWISE (one-to-one connections such as forward) is the more interesting case; let's look at connectPointwise

    private ExecutionEdge[] connectPointwise(IntermediateResultPartition[] sourcePartitions, int inputNumber) {
        final int numSources = sourcePartitions.length;  //number of source partitions
        final int parallelism = getTotalNumberOfParallelSubtasks(); //parallelism of the consuming subtasks
        
        // simple case same number of sources as targets
        if (numSources == parallelism) { //1:1, the simple case
            return new ExecutionEdge[] { new ExecutionEdge(sourcePartitions[subTaskIndex], this, inputNumber) }; //take the one partition of sourcePartitions whose index equals subTaskIndex
        }
        else if (numSources < parallelism) { //higher consumer parallelism: each source partition feeds several subtasks
        
            int sourcePartition;
            
            // check if the pattern is regular or irregular
            // we use int arithmetics for regular, and floating point with rounding for irregular
            if (parallelism % numSources == 0) { //evenly divisible: e.g. with 2 sources and 6 tasks, the 3rd task still maps to the first source
                // same number of targets per source
                int factor = parallelism / numSources;
                sourcePartition = subTaskIndex / factor;
            }
            else {
                // different number of targets per source
                float factor = ((float) parallelism) / numSources;
                sourcePartition = (int) (subTaskIndex / factor);
            }
            
            return new ExecutionEdge[] { new ExecutionEdge(sourcePartitions[sourcePartition], this, inputNumber) };
        }
        else {
            //......
        }
    }
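
    The index arithmetic is easy to sanity-check in isolation. A standalone reproduction of the numSources < parallelism branch (a hypothetical helper, not Flink code):

    // Standalone check of the POINTWISE subtask -> source-partition mapping.
    public class PointwiseMappingCheck {

        static int sourcePartition(int subTaskIndex, int numSources, int parallelism) {
            if (parallelism % numSources == 0) {
                // same number of targets per source: plain integer arithmetic
                int factor = parallelism / numSources;
                return subTaskIndex / factor;
            } else {
                // different number of targets per source: floating point, truncated
                float factor = ((float) parallelism) / numSources;
                return (int) (subTaskIndex / factor);
            }
        }

        public static void main(String[] args) {
            // 2 source partitions, 6 consuming subtasks:
            // subtasks 0-2 read partition 0, subtasks 3-5 read partition 1
            for (int i = 0; i < 6; i++) {
                System.out.println("subtask " + i + " <- source partition " + sourcePartition(i, 2, 6));
            }
        }
    }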

     

    Configuring checkpoints

                    executionGraph.enableSnapshotCheckpointing(
                        snapshotSettings.getCheckpointInterval(),
                        snapshotSettings.getCheckpointTimeout(),
                        snapshotSettings.getMinPauseBetweenCheckpoints(),
                        snapshotSettings.getMaxConcurrentCheckpoints(),
                        snapshotSettings.getExternalizedCheckpointSettings(),
                        triggerVertices,
                        ackVertices,
                        confirmVertices,
                        checkpointIdCounter,
                        completedCheckpoints,
                        externalizedCheckpointsDir,
                        checkpointStatsTracker);

    This enables checkpointing and starts the CheckpointCoordinator; see the dedicated post on the checkpoint mechanism.
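
    For reference, the values wired in above originate from the user-facing checkpoint configuration. A minimal sketch using the standard StreamExecutionEnvironment / CheckpointConfig setters (the interval values are arbitrary examples):

    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

    public class CheckpointConfigExample {
        public static void main(String[] args) {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.enableCheckpointing(60 * 1000);                           // checkpoint interval: 60s
            env.getCheckpointConfig().setCheckpointTimeout(120 * 1000);   // abort a checkpoint after 120s
            env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500); // at least 500ms between checkpoints
            env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);     // no overlapping checkpoints
        }
    }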

     

    Scheduler

    Now let's look at how the finished ExecutionGraph gets scheduled.

      future { //asynchronously
        try {
          try {
            submittedJobGraphs.putJobGraph(new SubmittedJobGraph(jobGraph, jobInfo)) //record it in submittedJobGraphs
          } catch {
            //...
          }

          jobInfo.notifyClients(
            decorateMessage(JobSubmitSuccess(jobGraph.getJobID))) //tell the client the submission succeeded

          if (leaderElectionService.hasLeadership) {
            executionGraph.scheduleForExecution(scheduler) //schedule
          }
        } catch {
          //...
        }
      }(context.dispatcher)

    executionGraph.scheduleForExecution

        public void scheduleForExecution(SlotProvider slotProvider) throws JobException {
    
            switch (scheduleMode) {
    
                case LAZY_FROM_SOURCES:
                    // simply take the vertices without inputs.
                    for (ExecutionJobVertex ejv : this.tasks.values()) { //ConcurrentHashMap<JobVertexID, ExecutionJobVertex> tasks; "tasks" is a poor name for this map
                        if (ejv.getJobVertex().isInputVertex()) {
                            ejv.scheduleAll(slotProvider, allowQueuedScheduling);
                        }
                    }
                    break;
    
                case EAGER:
                    for (ExecutionJobVertex ejv : getVerticesTopologically()) {
                        ejv.scheduleAll(slotProvider, allowQueuedScheduling);
                    }
                    break;
    
                default:
                    throw new JobException("Schedule mode is invalid.");
            }
        }

    For streaming jobs the schedule mode defaults to EAGER:

    public JobGraph createJobGraph() {
    
            jobGraph = new JobGraph(streamGraph.getJobName());
    
            // make sure that all vertices start immediately
            jobGraph.setScheduleMode(ScheduleMode.EAGER);

     

    ExecutionJobVertex.scheduleAll

        public void scheduleAll(SlotProvider slotProvider, boolean queued) throws NoResourceAvailableException {    
            ExecutionVertex[] vertices = this.taskVertices;
    
            // kick off the tasks
            for (ExecutionVertex ev : vertices) {
                ev.scheduleForExecution(slotProvider, queued);
            }
        }

    ExecutionVertex.scheduleForExecution

    //The current or latest execution attempt of this vertex's task
    public boolean scheduleForExecution(SlotProvider slotProvider, boolean queued) throws NoResourceAvailableException {
        return this.currentExecution.scheduleForExecution(slotProvider, queued);
    }

    Execution.scheduleForExecution

        public boolean scheduleForExecution(SlotProvider slotProvider, boolean queued) throws NoResourceAvailableException {
    
            final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup();
            final CoLocationConstraint locationConstraint = vertex.getLocationConstraint();
            
            if (transitionState(CREATED, SCHEDULED)) {
    
                ScheduledUnit toSchedule = locationConstraint == null ? //build the ScheduledUnit
                    new ScheduledUnit(this, sharingGroup) :
                    new ScheduledUnit(this, sharingGroup, locationConstraint);
    
                final Future<SimpleSlot> slotAllocationFuture = slotProvider.allocateSlot(toSchedule, queued); //ask the slotProvider for a slot
    
                final Future<Void> deploymentFuture = slotAllocationFuture.handle(new BiFunction<SimpleSlot, Throwable, Void>() {
                    @Override
                    public Void apply(SimpleSlot simpleSlot, Throwable throwable) {
                        if (simpleSlot != null) { //slot allocation succeeded
                            try {
                                deployToSlot(simpleSlot); //deploy
                            } catch (Throwable t) {
                                try {
                                    simpleSlot.releaseSlot();
                                } finally {
                                    markFailed(t);
                                }
                            }
                        }
                        else {
                            markFailed(throwable);
                        }
                        return null;
                    }
                });
    
        }

    For the slotProvider, see the Flink - Scheduler post.
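
    Everything the Execution needs from the scheduler is one narrow contract. A sketch of the SlotProvider interface, inferred from the allocateSlot call above (not the verbatim declaration):

    public interface SlotProvider {
        // completes once a slot is available; with queued == false the scheduler
        // fails fast (NoResourceAvailableException) instead of queueing the request
        Future<SimpleSlot> allocateSlot(ScheduledUnit task, boolean allowQueued);
    }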

     

    deployToSlot: at its core, this sends a submitTask request to the TaskManager

        public void deployToSlot(final SimpleSlot slot) throws JobException {
    
            ExecutionState previous = this.state;
            if (previous == SCHEDULED || previous == CREATED) {
                if (!transitionState(previous, DEPLOYING)) { //transition the state to DEPLOYING
                    throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
                }
            }
        
    
            try {
                // good, we are allowed to deploy
                if (!slot.setExecutedVertex(this)) { //bind the slot to this ExecutionVertex
                    throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
                }
                this.assignedResource = slot;
    
                final TaskDeploymentDescriptor deployment = vertex.createDeploymentDescriptor( //create the TaskDeploymentDescriptor
                    attemptId,
                    slot,
                    taskState,
                    attemptNumber);
    
                // register this execution at the execution graph, to receive call backs
                vertex.getExecutionGraph().registerExecution(this);
                
                final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
    
                final Future<Acknowledge> submitResultFuture = taskManagerGateway.submitTask(deployment, timeout); //send the request to the TaskManager's actor
    
                submitResultFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {......}
                    
            }
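
    The transitionState calls seen in scheduleForExecution and deployToSlot are essentially a compare-and-swap on the execution's state field; a simplified sketch of the idea (names are stand-ins, not Flink's exact code):

    import java.util.concurrent.atomic.AtomicReference;

    public class TransitionSketch {
        enum State { CREATED, SCHEDULED, DEPLOYING, RUNNING, FINISHED, FAILED }

        private final AtomicReference<State> state = new AtomicReference<>(State.CREATED);

        // true only for the thread that actually performed the transition,
        // so concurrent deploy/cancel races resolve without locks
        public boolean transitionState(State expected, State target) {
            return state.compareAndSet(expected, target);
        }
    }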