Hive Source Code (7): Submitting the Physical Execution Plan for Execution
Once all the methods covered in the previous posts have finished, execution moves on. Inside the runInternal method of org.apache.hadoop.hive.ql.Driver there is a call to execute(), and that method is the entry point for submitting the physical execution plan.
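For orientation, here is a minimal self-contained sketch of where execute() sits in the driver flow: runInternal first compiles the query into a plan, then hands the plan to execute(). This is my own simplification with illustrative names, not the Hive source.
// Simplified model of the Driver flow described above (illustrative only).
public class DriverFlowSketch {
    static Object compileInternal(String command) {
        // stand-in for query compilation: parse -> analyze -> physical plan
        return "plan for: " + command;
    }
    static void execute(Object plan) {
        // stand-in for Driver.execute(): walk the plan's root tasks and submit them
        System.out.println("submitting root tasks of " + plan);
    }
    static void runInternal(String command) {
        Object plan = compileInternal(command); // everything covered in the earlier posts
        execute(plan);                          // this post's focus
    }
    public static void main(String[] args) {
        runInternal("select count(*) from t");
    }
}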
The execute() method
private void execute() throws CommandProcessorResponse {
......
//when Hive runs, the first thing printed is a Hive Session ID
//later it also prints a Query ID and the total number of jobs
setQueryDisplays(plan.getRootTasks());
int mrJobs = Utilities.getMRTasks(plan.getRootTasks()).size();
int jobs = mrJobs + Utilities.getTezTasks(plan.getRootTasks()).size()
+ Utilities.getSparkTasks(plan.getRootTasks()).size();
if (jobs > 0) {
logMrWarning(mrJobs);
console.printInfo("Query ID = " + queryId); //打印熟悉的字符串
console.printInfo("Total jobs = " + jobs); //打印熟悉的字符串
}
if (SessionState.get() != null) {
SessionState.get().getHiveHistory().setQueryProperty(queryId, Keys.QUERY_NUM_TASKS,
String.valueOf(jobs));
SessionState.get().getHiveHistory().setIdToTableMap(plan.getIdToTableNameMap());
}
String jobname = Utilities.abbreviate(queryStr, maxlen - 6);
......
//DriverContext maintains the runnable queue and the running queue
DriverContext driverCxt = new DriverContext(ctx);
driverCxt.prepare(plan);
ctx.setHDFSCleanup(true);
this.driverCxt = driverCxt;
SessionState.get().setMapRedStats(new LinkedHashMap<>());
SessionState.get().setStackTraces(new HashMap<>());
SessionState.get().setLocalMapRedErrors(new HashMap<>());
for (Task<? extends Serializable> tsk : plan.getRootTasks()) {
assert tsk.getParentTasks() == null || tsk.getParentTasks().isEmpty();
//add the task to the runnable queue
driverCxt.addToRunnable(tsk);
if (metrics != null) {
tsk.updateTaskMetrics(metrics);
}
}
//loop until all tasks have been executed
while (driverCxt.isRunning()) {
Task<? extends Serializable> task;
while ((task = driverCxt.getRunnable(maxthreads)) != null) {
//the code that actually submits the task, analyzed in detail below
TaskRunner runner = launchTask(task, queryId, noName, jobname, jobs, driverCxt);
if (!runner.isRunning()) {
break;
}
}
//poll tasks that have finished out of the running queue
TaskRunner tskRun = driverCxt.pollFinished();
if (tskRun == null) {
continue;
}
.......
//handling of a failed task
if (exitVal != 0) {
Task<? extends Serializable> backupTask = tsk.getAndInitBackupTask();
if (backupTask != null) {
........
//retry with the backup task after a failure
if (DriverContext.isLaunchable(backupTask)) {
driverCxt.addToRunnable(backupTask);
}
continue;
} else
......
driverCxt.finished(tskRun);//bookkeeping for the finished MapReduce task
......
//schedule the child tasks
if (tsk.getChildTasks() != null) {
for (Task<? extends Serializable> child : tsk.getChildTasks()) {
if (DriverContext.isLaunchable(child)) {
driverCxt.addToRunnable(child);
}
}
}
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.RUN_TASKS);
postExecutionCacheActions();
Map<String, MapRedStats> stats = SessionState.get().getMapRedStats();
//print resource usage and task running time
......
}
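Before moving on, the scheduling pattern in execute() is worth isolating: root tasks go into the runnable queue, launched tasks move to the running queue, and when a task finishes, any child whose parents are all done becomes runnable. Below is a minimal self-contained sketch of that idea (my own simplification with illustrative class names, not Hive source; the real code runs tasks on TaskRunner threads rather than synchronously):
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;

// Illustrative model of the runnable/running queues managed by DriverContext.
public class SchedulerSketch {
    static class SimpleTask {
        final String name;
        final List<SimpleTask> children = new ArrayList<>();
        final List<SimpleTask> parents = new ArrayList<>();
        boolean done;
        SimpleTask(String name) { this.name = name; }
        void addChild(SimpleTask c) { children.add(c); c.parents.add(this); }
        boolean isLaunchable() { // all parents finished, like DriverContext.isLaunchable
            return parents.stream().allMatch(p -> p.done);
        }
        void run() { System.out.println("running " + name); done = true; }
    }

    public static void main(String[] args) {
        SimpleTask root = new SimpleTask("Stage-1");
        SimpleTask child = new SimpleTask("Stage-2");
        root.addChild(child);

        Queue<SimpleTask> runnable = new ArrayDeque<>();
        Queue<SimpleTask> running = new ArrayDeque<>();
        runnable.add(root); // root tasks have no parents

        while (!runnable.isEmpty() || !running.isEmpty()) {
            SimpleTask task;
            while ((task = runnable.poll()) != null) { // like launchTask: move to running
                running.add(task);
            }
            SimpleTask finished = running.poll();      // like pollFinished (here: run synchronously)
            if (finished == null) continue;
            finished.run();
            for (SimpleTask c : finished.children) {   // schedule children whose parents are all done
                if (c.isLaunchable() && !runnable.contains(c)) {
                    runnable.add(c);
                }
            }
        }
    }
}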
The launchTask() method
private TaskRunner launchTask(Task<? extends Serializable> tsk, String queryId, boolean noName,
String jobname, int jobs, DriverContext cxt) throws HiveException {
if (SessionState.get() != null) {//record the task in the query history
SessionState.get().getHiveHistory().startTask(queryId, tsk, tsk.getClass().getName());
}
if (tsk.isMapRedTask() && !(tsk instanceof ConditionalTask)) {
if (noName) {//fall back to a default job name
conf.set(MRJobConfig.JOB_NAME, jobname + " (" + tsk.getId() + ")");
}
conf.set(DagUtils.MAPREDUCE_WORKFLOW_NODE_NAME, tsk.getId());
Utilities.setWorkflowAdjacencies(conf, plan);
cxt.incCurJobNo(1);
console.printInfo("Launching Job " + cxt.getCurJobNo() + " out of " + jobs);
}//initialize the task with the conf and driver context
tsk.initialize(queryState, plan, cxt, ctx.getOpContext());
TaskRunner tskRun = new TaskRunner(tsk);
//add the runner to the running queue
cxt.launching(tskRun);
//parallel submission -- an optimization setting (useful e.g. for UNION ALL branches)
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.EXECPARALLEL) && tsk.canExecuteInParallel()) {
if (LOG.isInfoEnabled()){
LOG.info("Starting task [" + tsk + "] in parallel");
}
tskRun.start(); //ultimately still ends up calling runSequential(), just on a separate thread
} else {
if (LOG.isInfoEnabled()){
LOG.info("Starting task [" + tsk + "] in serial mode");
}//the code that actually submits the task
tskRun.runSequential();
}
return tskRun;
}
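The parallel/serial split above comes down to a thread wrapper: TaskRunner extends Thread, so tskRun.start() runs the task on a new thread while tskRun.runSequential() runs it on the caller's thread, and the thread's run() ends up in runSequential() either way. A minimal sketch of that pattern (illustrative, not the actual TaskRunner source):
// Illustrative thread-wrapper pattern behind TaskRunner: start() -> run() -> runSequential().
public class TaskRunnerSketch extends Thread {
    private final Runnable task;
    private volatile int exitVal = -1;

    TaskRunnerSketch(Runnable task) { this.task = task; }

    @Override
    public void run() {           // executed on the new thread when start() is called
        runSequential();
    }

    public void runSequential() { // executed on the calling thread when invoked directly
        try {
            task.run();
            exitVal = 0;
        } catch (Throwable t) {
            exitVal = 1;          // record the failure instead of propagating it
        }
    }

    public static void main(String[] args) throws InterruptedException {
        TaskRunnerSketch parallel = new TaskRunnerSketch(() -> System.out.println("parallel task"));
        parallel.start();         // like hive.exec.parallel=true
        parallel.join();

        TaskRunnerSketch serial = new TaskRunnerSketch(() -> System.out.println("serial task"));
        serial.runSequential();   // like the default serial mode
    }
}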
The runSequential() and executeTask() methods
public void runSequential() {
int exitVal = -101;
try { //the key call
exitVal = tsk.executeTask(ss == null ? null : ss.getHiveHistory());
} catch (Throwable t) {
if (tsk.getException() == null) {
tsk.setException(t);
}
LOG.error("Error in executeTask", t);
}
result.setExitVal(exitVal);
if (tsk.getException() != null) {
result.setTaskError(tsk.getException());
}
}
public int executeTask(HiveHistory hiveHistory) {
try {
this.setStarted();
if (hiveHistory != null) {
hiveHistory.logPlanProgress(queryPlan);
} //the key call: for a MapReduce job this enters MapRedTask and drives the MapReduce work to the end
int retval = execute(driverContext);
this.setDone();
if (hiveHistory != null) {
hiveHistory.logPlanProgress(queryPlan);
}
return retval;
} catch (IOException e) {
throw new RuntimeException("Unexpected error: " + e.getMessage(), e);
}
}
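executeTask() is a template method: the bookkeeping (setStarted/setDone, history logging) lives in the base Task class, while execute(driverContext) is the hook each task type overrides -- MapRedTask in our case, and similarly for the Tez and Spark tasks. A minimal sketch of that shape (illustrative class names, not Hive source):
// Illustrative template-method shape of Task.executeTask() / execute().
abstract class SketchTask {
    private boolean started, done;

    // shared wrapper: bookkeeping around the subclass-specific work
    public final int executeTask() {
        started = true;
        int retval = execute();   // hook implemented by each task type
        done = true;
        return retval;
    }

    protected abstract int execute();
}

class SketchMapRedTask extends SketchTask {
    @Override
    protected int execute() {
        System.out.println("submit a MapReduce job here");
        return 0;
    }
}

public class TemplateMethodSketch {
    public static void main(String[] args) {
        SketchTask task = new SketchMapRedTask();
        System.out.println("exit code = " + task.executeTask());
    }
}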
The execute(DriverContext driverContext) method (MapRedTask)
public int execute(DriverContext driverContext) {
setNumberOfReducers();//set the number of reducers; by default one reducer per 256MB of input
if (!ctx.isLocalOnlyExecutionMode() && //automatic local mode -- skipped here
conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {}
//the key call: it actually invokes the parent class (ExecDriver) method
int ret = super.execute(driverContext); //the method that submits the job -- examined in detail below
//set the input attributes (input format and current database)
//the code below does not submit through MapReduce directly; it builds a command line and runs the job in a child JVM, so it is not the focus here
super.setInputAttributes(conf);
String hadoopExec = conf.getVar(HiveConf.ConfVars.HADOOPBIN);//path to the hadoop executable
String hiveJar = conf.getJar(); //e.g. /opt/hive/hive/lib/hive-exec-3.1.3.jar
String libJars = super.getResource(conf, ResourceType.JAR); //jars added by the user
String libJarsOption = StringUtils.isEmpty(libJars) ? " " : " -libjars " + libJars + " ";
String hiveConfArgs = generateCmdLine(conf, ctx);//a jobconf.xml file is written under the directory; I am not yet sure what it is for
Path planPath = new Path(ctx.getLocalTmpPath(), "plan.xml");
OutputStream out = null;//output stream used to write the plan file
...
String isSilent = "true".equalsIgnoreCase(System.getProperty("test.silent")) ? "-nolog" : ""; //with the -S (silent) option, pass -nolog
String jarCmd = hiveJar + " " + ExecDriver.class.getName() + libJarsOption; //e.g. /opt/hive/hive/lib/hive-exec-3.1.3.jar org.apache.hadoop.hive.ql.exec.mr.ExecDriver
String cmdLine = hadoopExec + " jar " + jarCmd + " -plan " + planPath.toString() + " " + isSilent + " " + hiveConfArgs; //the plan xml plus the options built above
String workDir = (new File(".")).getCanonicalPath();
String files = super.getResource(conf, ResourceType.FILE);//concatenate the added resource files
......//more file arguments appended here
String hadoopOpts;//three system properties (build.dir, build.dir.hive, hive.query.id) are appended as -D options below
StringBuilder sb = new StringBuilder();
Properties p = System.getProperties();
for (String element : HIVE_SYS_PROP) {
if (p.containsKey(element)) {
sb.append(" -D" + element + "=" + p.getProperty(element));
}
}
if (ShimLoader.getHadoopShims().isLocalMode(conf)) {//memory available in local mode
int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
if (hadoopMem == 0) {
variables.remove(HADOOP_MEM_KEY);
} else {
variables.put(HADOOP_MEM_KEY, String.valueOf(hadoopMem));
}
} else {
}
if (variables.containsKey(HADOOP_OPTS_KEY)) {//keep adding to variables, which already holds 50+ default environment entries
variables.put(HADOOP_OPTS_KEY, variables.get(HADOOP_OPTS_KEY)
+ hadoopOpts);
} else {
variables.put(HADOOP_OPTS_KEY, hadoopOpts);
}
if(variables.containsKey(HIVE_DEBUG_RECURSIVE)) {
configureDebugVariablesForChildJVM(variables);
}
env = new String[variables.size()];
int pos = 0;
for (Map.Entry<String, String> entry : variables.entrySet()) {//flatten variables into a String[] env array
String name = entry.getKey();
String value = entry.getValue();
env[pos++] = name + "=" + value;
}
//the command looks roughly like: hadoop jar /opt/hive/hive/lib/hive-exec-3.1.3.jar org.apache.hadoop.hive.ql.exec.mr.ExecDriver -plan ... -- that is the basic format
executor = Runtime.getRuntime().exec(cmdLine, env, new File(workDir));//launch the child JVM with the command line, environment, and working directory
...what follows is just log printing, nothing of further interest
}
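The setNumberOfReducers() call at the top of this method deserves a quick worked example. When the query does not fix the reducer count, Hive estimates it from the total input size using hive.exec.reducers.bytes.per.reducer (the "256MB per reducer" default mentioned above), capped by hive.exec.reducers.max. A minimal sketch of that arithmetic (my own simplification, not the Hive source; the concrete values are illustrative):
// Simplified reducer estimation: ceil(totalInputBytes / bytesPerReducer),
// at least 1, capped at maxReducers.
public class ReducerEstimateSketch {
    static int estimateReducers(long totalInputBytes, long bytesPerReducer, int maxReducers) {
        long byBytes = (totalInputBytes + bytesPerReducer - 1) / bytesPerReducer; // ceiling division
        return (int) Math.max(1, Math.min(maxReducers, byBytes));
    }

    public static void main(String[] args) {
        long bytesPerReducer = 256L * 1024 * 1024; // hive.exec.reducers.bytes.per.reducer = 256MB
        int maxReducers = 1009;                    // hive.exec.reducers.max (illustrative value)
        long input = 10L * 1024 * 1024 * 1024;     // a 10GB input
        // 10GB / 256MB = 40 reducers
        System.out.println(estimateReducers(input, bytesPerReducer, maxReducers));
    }
}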
The execute(DriverContext driverContext) method (ExecDriver, the parent class)
public int execute(DriverContext driverContext) {
......
//below is the familiar part: setting the input/output formats, the partitioner, and the mapper
HiveFileFormatUtils.prepareJobOutput(job);
job.setOutputFormat(HiveOutputFormatImpl.class);
job.setMapRunnerClass(ExecMapRunner.class);
job.setMapperClass(ExecMapper.class);
job.setMapOutputKeyClass(HiveKey.class);
job.setMapOutputValueClass(BytesWritable.class);
try {
String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER);
job.setPartitionerClass(JavaUtils.loadClass(partitioner));
} catch (ClassNotFoundException e) {
throw new RuntimeException(e.getMessage(), e);
}
//propagate split settings (max/min split sizes per file, per node, and per rack)
propagateSplitSettings(job, mWork);
job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
job.setReducerClass(ExecReducer.class);
setInputAttributes(job); //current database and input format attributes
boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job,
HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
job.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, useSpeculativeExecReducers);
String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
if (mWork.isUseBucketizedHiveInputFormat()) { //bucketed table files are handled with a dedicated input format
inpFormat = BucketizedHiveInputFormat.class.getName();
}
try {
job.setInputFormat(JavaUtils.loadClass(inpFormat));//input format class
} catch (ClassNotFoundException e) {
throw new RuntimeException(e.getMessage(), e);
}
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
int returnVal = 0;
boolean noName = StringUtils.isEmpty(job.get(MRJobConfig.JOB_NAME));
if (noName) {
job.set(MRJobConfig.JOB_NAME, "JOB" + Utilities.randGen.nextInt());
}
try{
MapredLocalWork localwork = mWork.getMapRedLocalWork();
if (localwork != null && localwork.hasStagedAlias()) {//local work handling
......
}
work.configureJobConf(job);
List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
Utilities.setInputPaths(job, inputPaths);//set the input paths on the job
//serialize the map/reduce work (the plan) into the job
Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());
if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
try {
handleSampling(ctx, mWork, job);
job.setPartitionerClass(HiveTotalOrderPartitioner.class);
} catch (IllegalStateException e) {
console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
rWork.setNumReduceTasks(1);
job.setNumReduceTasks(1);
} catch (Exception e) {
LOG.error("Sampling error", e);
console.printError(e.toString(),
"\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
rWork.setNumReduceTasks(1);
job.setNumReduceTasks(1);
}
}
jc = new JobClient(job);
Throttle.checkJobTracker(job, LOG);
//initialize table statistics gathering (stats publisher)
if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
StatsPublisher statsPublisher;
StatsFactory factory = StatsFactory.newFactory(job);
if (factory != null) {
statsPublisher = factory.getStatsPublisher();
List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job);
if (rWork != null) {
statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job));
}
StatsCollectionContext sc = new StatsCollectionContext(job);
sc.setStatsTmpDirs(statsTmpDir);
if (!statsPublisher.init(sc)) { // creating stats table if not exists
if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw
new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
}
}
}
}
......
//submit the job
rj = jc.submitJob(job);
.....the rest is of no further interest here
return (returnVal);
}
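Stripped of the Hive specifics, the method above is classic old-API (org.apache.hadoop.mapred) job setup: build a JobConf, set the mapper/reducer/format classes, then hand it to a JobClient. A bare-bones sketch of that submission pattern (generic Hadoop API usage, not Hive code; paths and mapper/reducer classes are placeholders standing in for ExecMapper/ExecReducer and the serialized plan):
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

// Bare-bones old-API submission, mirroring the shape of ExecDriver.execute.
public class OldApiSubmitSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(OldApiSubmitSketch.class);
        job.setJobName("sketch-job");

        job.setMapperClass(IdentityMapper.class);     // Hive plugs in ExecMapper here
        job.setReducerClass(IdentityReducer.class);   // ... and ExecReducer here
        job.setNumReduceTasks(1);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);    // TextInputFormat keys are byte offsets
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("/tmp/sketch/in"));   // placeholder paths
        FileOutputFormat.setOutputPath(job, new Path("/tmp/sketch/out"));

        JobClient jc = new JobClient(job);            // same client type ExecDriver uses
        RunningJob rj = jc.submitJob(job);            // asynchronous submit, like rj = jc.submitJob(job) above
        System.out.println("submitted job " + rj.getID());
    }
}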