• A simple Spark JobServer implemented with a template class


    Before/after comparison:

    Before: executing 13 task nodes took 16 minutes.

    After: the same 13 nodes took 3 minutes.

    The detailed logic is in the code and its comments below.
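
    Before the listing, a quick illustration of the message format the server consumes: each job message names the SparkJob subclass to run ("class") and carries its parameter object ("params", from which "user", "expId" and "nodeId" are read). Those field names are taken from the code below; the concrete class name and values in this sketch are hypothetical, and it uses fastjson directly rather than the project's JSONUtil/JsonUtils helpers, which are not shown in the post.

    import com.alibaba.fastjson.{JSONArray, JSONObject}

    // Hypothetical example of a job message; field names match what ExecuteJobServer reads below.
    object JobMessageExample {
      def main(args: Array[String]): Unit = {
        val params = new JSONObject()
        params.put("user", "zpc")     // owner of the experiment
        params.put("expId", 1001L)    // experiment id
        params.put("nodeId", 7L)      // node id within the experiment

        val job = new JSONObject()
        job.put("class", "xxx.main.jobs.SampleCountJob") // FQCN of a SparkJob subclass (hypothetical name)
        job.put("params", params)

        // args(0) of ExecuteJobServer is a JSON array of such job objects; later jobs arrive
        // one by one over JMS and are enqueued by AddJobToQueueActor.
        val jobs = new JSONArray()
        jobs.add(job)
        println(jobs.toJSONString())
      }
    }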

    import java.util.concurrent.{ExecutorService, Executors, TimeUnit}

    import akka.actor.{ActorSystem, Props}
    import com.alibaba.fastjson.JSONObject
    import xxx.listener.AddJobToQueueActor
    import xxx.listener.bean.{AppStatusMessage, SparkContextStatusMessage}
    import xxx.listener.utils.JSONUtil
    import xxx.listener.utils.JmsUtils._
    import xxx.main.SparkJob
    import xxx.main.utils.JsonUtils
    import com.typesafe.config.ConfigFactory
    import org.apache.commons.lang.StringUtils
    import org.apache.spark.sql.hive.HiveContext
    import org.apache.spark.{Logging, SparkConf, SparkContext}

    import scala.collection.mutable.Queue

    /**
      * Created by zpc on 2016/9/1.
      * JobServer implementation template.
      * Before: every task node was submitted to the Spark cluster as its own application, and starting the
      * SparkContext and HiveContext alone took a lot of time, roughly 40+ seconds.
      * After: nodes belonging to the same user and requesting the same resources are sent over JMS to one
      * shared SparkContext; by default 3 node tasks run in parallel.
      * Implementation:
      * 1. A message placed on the queue describes one task: the SparkJob subclass to execute and its
      *    parameters. Tasks received here carry no dependencies on each other; dependency handling is done
      *    on the sender side, which only sends the next node after its predecessor has finished.
      * 2. On the first submission the task parameters come from args. After startup, a JMS listener is
      *    registered and an actor appends every received task message to the queue.
      * 3. The SparkJob subclasses (the classes that actually implement the node logic) are invoked via
      *    reflection. The SparkContext's default idle timeout is 30 minutes.
      * 4. When a node finishes, a success message is sent to the web side; when it fails, the error log and
      *    a failure message are sent. On exit, a shutdown hook sends an "SC closed" message to the web side.
      *    When the process is killed, failure messages are sent for every task still waiting in the queue or
      *    present in the running set.
      */
    object ExecuteJobServer extends Logging {

      // queue of jobs waiting to be executed
      val jobWaitingQueue = new Queue[String]
      // set of jobs currently being executed
      val jobRunningSet = new scala.collection.mutable.HashSet[JSONObject]
      val timeout_mins = 30
      // timestamp of the last job activity, used for the idle timeout
      var lastRunTime = System.currentTimeMillis()

      // applicationId, user, expId and resource associated with this SparkContext
      var appId: String = ""
      var user: String = ""
      var expId: Long = 0
      var resource: String = ""
      // JSON of the job currently being executed
      var jobJson: JSONObject = null

      def main(args: Array[String]): Unit = {

        // When the process is killed, send failure messages to the web side for running and waiting tasks.
        Runtime.getRuntime().addShutdownHook(new HookMessage())
        // Number of tasks allowed to run concurrently; can be raised if needed, tentatively 3.
        val threadPool: ExecutorService = Executors.newFixedThreadPool(3)
        val sc = initSparkContext()
        val hiveContext = new HiveContext(sc)

        val list = JsonUtils.parseArray(args(0))
        val it = list.iterator
        while (it.hasNext) {
          val jobStr = it.next().toString
          if (expId == 0) {
            val json = JSONUtil.toJSONString(jobStr)
            val param = json.getJSONObject("params")
            appId = sc.applicationId
            user = param.getString("user")
            expId = param.getLongValue("expId")
            var driver_memory = ""
            var num_executors = "spark.executor.instances"
            var executor_memory = ""
            sc.getConf.getAll.foreach(x => {
              if (x._1 != null && "spark.executor.instances".equals(x._1)) {
                num_executors = x._2
              } else if (x._1 != null && "spark.executor.memory".equals(x._1)) {
                executor_memory = x._2.substring(0, x._2.length - 1)
              } else if (x._1 != null && "spark.driver.memory".equals(x._1)) {
                driver_memory = x._2.substring(0, x._2.length - 1)
              }
            })

            resource = driver_memory + num_executors + executor_memory
            logInfo("resource is : " + resource)
            // resource = param.getString("driver-memory") + param.getString("num-executors") + param.getString("executor-memory")
          }
          jobWaitingQueue.enqueue(jobStr)
        }

        /** 1. Start the listener on appId to receive JobMsg messages from the queue.
          * 2. Notify the web side via the queue that the SparkContext has started. **/
        val system = ActorSystem("mlp", ConfigFactory.load())
        val actor = system.actorOf(Props(new AddJobToQueueActor(appId, jobWaitingQueue)))
        createTopicListenerOfContextJobMsg("contextJobMsgListener", actor)
        informSparkContextStatus(true)

        while (jobWaitingQueue.size > 0 || !checkTimeOut()) {
          while (jobWaitingQueue.size > 0) {
            lastRunTime = System.currentTimeMillis()
            val jobStr = jobWaitingQueue.dequeue() //.replace("\\", "")
            logInfo("***** ExecuteJobServer jobStr ***** jobStr: " + jobStr)
            val json = JSONUtil.toJSONString(jobStr)
            jobRunningSet.add(json)
            threadPool.execute(new ThreadSparkJob(json, hiveContext, sc))
            jobJson = json
          }
          Thread.sleep(1000)
        }

        /**
          * The jobWaitingQueue no longer accepts messages at this point.
          */
        threadPool.shutdown()
        var loop = true
        do {
          // block until every task in the pool has finished
          loop = !threadPool.awaitTermination(2, TimeUnit.SECONDS)
        } while (loop)
      }

      def checkTimeOut(): Boolean = {
        val nowTime = System.currentTimeMillis()
        jobRunningSet.isEmpty && (nowTime - lastRunTime) / (1000 * 60) > timeout_mins
      }

      class ThreadSparkJob(json: JSONObject, hiveContext: HiveContext, ctx: SparkContext) extends Runnable {
        override def run() {

          try {
            val classStr = json.get("class").toString
            val argsStr = json.get("params").toString
            val obj: SparkJob = Class.forName(classStr).getMethod("self").invoke(null).asInstanceOf[SparkJob]
            // val obj: SparkJob = Class.forName(classStr).newInstance().asInstanceOf[SparkJob]
            obj.jobServer = true
            obj.failed = false
            obj.setContext(ctx)
            obj.setHiveContext(hiveContext)
            obj.main(Array(argsStr))
            // InformJobSuccess(json)
            logInfo("***** jobRunningSet remove job json***** json: " + json.toJSONString)
            jobRunningSet.remove(json)
            lastRunTime = System.currentTimeMillis()
          } catch {
            case oom: OutOfMemoryError => {
              informJobFailure(oom.toString, json)
              jobRunningSet.remove(json)
              logInfo("***** SparkContext go to stop, reason: " + oom.getMessage)
              hiveContext.sparkContext.stop()
              // on OOM, stop the SparkContext and exit the driver
              System.exit(1)
            }
            case ex: Exception => {
              informJobFailure(ex.toString, json)
              jobRunningSet.remove(json)
              if (ex.toString.contains("stopped SparkContext")) {
                logInfo("***** SparkContext go to stop, reason: " + ex.getMessage)
                hiveContext.sparkContext.stop()
                // the context is already stopped, so exit the driver
                System.exit(1)
              }
            }
          }
        }
        def informJobFailure(errMsg: String, json: JSONObject) = {
          if (json != null) {
            val params = json.getJSONObject("params")
            val user = StringUtils.trimToEmpty(params.getString("user"))
            val expId = params.getLongValue("expId")
            val nodeId = params.getLongValue("nodeId")
            val message = new AppStatusMessage(user, expId, nodeId, "FAILURE", errMsg)
            logInfo("***** send informJobFailure message*****: expId: " + expId + ", nodeId: " + nodeId)
            jobStatusTemplate send message
          }
        }
      }

      def initSparkContext(): SparkContext = {
        val conf = new SparkConf().setAppName("cbt-mlaas")
        new SparkContext(conf)
      }

      class HookMessage extends Thread {
        override def run() {
          var shouldInformStop = false
          informSparkContextStatus(false)
          while (jobWaitingQueue.size > 0) {
            val jobStr = jobWaitingQueue.dequeue() //.replace("\\", "")
            val json = JSONUtil.toJSONString(jobStr)
            informJobFailureInHook("SparkContext stopped, inform waiting job failed!", json)
            shouldInformStop = true
          }
          jobRunningSet.foreach(json => {
            informJobFailureInHook("SparkContext stopped, inform running job failed!", json)
            shouldInformStop = true
          })
          if (shouldInformStop) {
            informExpStop("SparkContext stopped, inform exp failed!", jobJson)
          }
        }
        def informJobFailureInHook(errMsg: String, json: JSONObject) = {
          if (json != null) {
            val params = json.getJSONObject("params")
            val user = StringUtils.trimToEmpty(params.getString("user"))
            val expId = params.getLongValue("expId")
            val nodeId = params.getLongValue("nodeId")
            val message = new AppStatusMessage(user, expId, nodeId, "FAILURE", errMsg)
            logInfo("***** send informJobFailure message*****: expId: " + expId + ", nodeId: " + nodeId)
            jobStatusTemplate send message
          }
        }
        def informExpStop(errMsg: String, json: JSONObject) = {
          if (json != null) {
            val params = json.getJSONObject("params")
            val user = StringUtils.trimToEmpty(params.getString("user"))
            val expId = params.getLongValue("expId")
            val nodeId = params.getLongValue("nodeId")
            val message = new AppStatusMessage(user, expId, nodeId, "STOP", errMsg)
            logInfo("***** send informExpStop message*****: expId: " + expId + ", nodeId: " + nodeId)
            jobStatusTemplate send message
          }
        }
      }

      def informSparkContextStatus(start: Boolean) = {
        val msg = new SparkContextStatusMessage(appId, start, user, expId, resource)
        logInfo("***** send sparkContext start message*****: appId: " + appId + ", start: " + start)
        sparkContextStatusTemplate send msg
      }

    }
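
    The xxx.main.SparkJob base class and its node subclasses are not shown in the post. Purely as a hedged sketch, inferred from how ExecuteJobServer drives a job via reflection above (getMethod("self"), the jobServer/failed flags, setContext/setHiveContext, then main with the params JSON as args(0)), a node might look roughly like this; the base-class internals, the SampleCountJob object and its query are assumptions:

    import org.apache.spark.SparkContext
    import org.apache.spark.sql.hive.HiveContext

    // Assumed shape of the base class; the real xxx.main.SparkJob is not shown in the post.
    abstract class SparkJob {
      var jobServer: Boolean = false   // true when driven by ExecuteJobServer instead of a standalone submit
      var failed: Boolean = false      // a job can flag its own failure here
      private var sc: SparkContext = _
      private var hc: HiveContext = _
      def setContext(ctx: SparkContext): Unit = { sc = ctx }
      def setHiveContext(ctx: HiveContext): Unit = { hc = ctx }
      protected def context: SparkContext = sc
      protected def hiveContext: HiveContext = hc
      // Node logic goes here; when run by the JobServer, args(0) is the node's "params" JSON string.
      def main(args: Array[String]): Unit
    }

    // Hypothetical node implementation. ExecuteJobServer loads it with
    // Class.forName("xxx.main.jobs.SampleCountJob").getMethod("self").invoke(null),
    // so the object exposes `self`, which the Scala compiler also emits as a static forwarder.
    object SampleCountJob extends SparkJob {
      def self: SparkJob = this

      override def main(args: Array[String]): Unit = {
        // args(0) carries the params JSON, e.g. {"user":"zpc","expId":1001,"nodeId":7,...}
        val count = hiveContext.sql("SELECT COUNT(*) FROM some_db.some_table").collect().head.getLong(0)
        println(s"node finished, row count = $count")
      }
    }

    In the real project the jobServer flag presumably lets a node create its own contexts when it is submitted standalone (the original, pre-JobServer pattern); that fallback is omitted from this sketch, which only covers the JobServer-driven path.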
• Original post: https://www.cnblogs.com/drawwindows/p/5908313.html