1. 整体架构:
UI --> controller --> service --> mapper --> MySQL(后端,主要维护服务列表)
⬇
AgentService --> SQLite(虚机层面,主要维护当前虚机上运行的服务,同时向脚本下发 install、start、stop、uninstall、take-over、monitor_trace、installall、uninstallall 等命令)
⬇
脚本(包括中间件 MySQL、Redis、Kafka、ELK、Nacos、Sentinel、MongoDB、SkyWalking 等的维护,以及被接管的 Java 服务的维护)
2. 整个平台精华全在脚本,后端主要是向脚本下发命令,维护库的信息等。
3. 脚本简述
agentctl.sh 举例:
#!/bin/bash
#
# agentctl.sh - install, start, stop, uninstall and self-heal the
# sitesupport agent JAR and its bundled Prometheus node exporter.
#
# Usage: agentctl.sh {start|stop|restart|install|uninstall|check_status|self_healing}
#
# Helpers such as logInfo, logError, checkResult, judgeMem, checkDepend,
# checkSshpass and setLocalIp, and variables such as localIp, JDK_PKG_NAME
# and linux_password, are not defined in this file; they are expected to be
# provided by the sourced common.sh.
# NOTE(review): several call sites only make sense if logError terminates
# the script - confirm against common.sh.
#
# The script is also invoked as "sh agentctl.sh ..." (see the cron entry),
# so constructs that bash disables in POSIX mode (e.g. process
# substitution) are deliberately avoided.

# Directory holding the downloaded packages (JAR, scripts, tarballs).
readonly PACKAGE_FULL_WAY=/opt/download/packages
# File name of this control script.
readonly SH_HOME=agentctl.sh
# Directory the agent is installed into and run from.
readonly AGENT_INSTALL_HOME=/opt/agent
# File name of the agent JAR.
readonly JAR_NAME=sitesupport-agent-0.0.1-SNAPSHOT.jar
# File name of the node exporter release tarball.
readonly NODE_EXPORTER=node_exporter-1.1.2.linux-amd64.tar.gz
# systemd unit file written for the node exporter.
readonly NODE_EXPORTER_UNIT_FILE=/usr/lib/systemd/system/node_exporter.service

# Pull in the shared helpers from common.sh.
# shellcheck disable=SC1091
source "${PACKAGE_FULL_WAY}"/common.sh || exit

# Print the PIDs (one per line) of every process whose command line
# mentions the agent JAR. Returns non-zero when there is none.
function agentPids() {
  pgrep -f -- "${JAR_NAME}"
}

# Force-kill every running agent process.
# Returns 1 without calling kill when no agent process exists, so callers
# never run "kill -9" with an empty or multi-line argument.
function killAgent() {
  local pids=""
  pids=$(agentPids)
  if [[ -z "${pids}" ]]; then
    return 1
  fi
  # Word splitting is intentional: one PID per word.
  # shellcheck disable=SC2086
  kill -9 ${pids}
}

# Make sure root has an RSA key pair in /root/.ssh.
# Works on absolute paths so the caller's working directory is untouched,
# and never waits on an interactive prompt.
function createSshkey() {
  local ssh_dir=/root/.ssh
  local key_file="${ssh_dir}/id_rsa"

  if [[ -e "${key_file}" && -e "${key_file}.pub" ]]; then
    logInfo "ssh-key已存在!"
    return 0
  fi

  mkdir -p "${ssh_dir}" || exit

  if [[ -e "${key_file}" ]]; then
    # Private key present but public half missing: re-derive the public key
    # instead of letting ssh-keygen ask whether to overwrite the private key.
    if ! ssh-keygen -y -f "${key_file}" </dev/null >"${key_file}.pub"; then
      rm -f -- "${key_file}.pub"
      logError "ssh-key生成失败"
      return 1
    fi
  elif ! ssh-keygen -q -f "${key_file}" -N "" </dev/null; then
    logError "ssh-key生成失败"
    return 1
  fi
  logInfo "ssh-key生成成功"
}

# Replace any installed JDK with the bundled RPM and enable remote JMX
# access by creating jmxremote.password from its template.
# JDK_PKG_NAME is expected to come from the sourced helper scripts.
function checkJdk() {
  logInfo "start check jdk...."
  if java -version &>/dev/null; then
    logInfo "start remove old jdk..."
    yum remove jdk -y
    # shellcheck disable=SC1091
    source /etc/profile
  fi

  logInfo "start install new jdk..."
  if ! rpm -ivh "${PACKAGE_FULL_WAY}/${JDK_PKG_NAME}"; then
    logError "jdk1.8.0_291 未安装成功,请重新安装!"
  fi

  # Allow remote JMX access.
  local jmxremote_conf=/usr/java/jdk1.8.0_291-amd64/jre/lib/management
  cd "${jmxremote_conf}" || logError "${jmxremote_conf} 不存在!"
  cp jmxremote.password.template jmxremote.password
  chmod +w jmxremote.password
  echo "monitorRole QED" >>jmxremote.password
  echo "controlRole R&D" >>jmxremote.password
  # Restrict the password file to owner read-only.
  chmod 0400 jmxremote.password
  logInfo "the jdk is installed and the environment variables are configured"
}

# Make sure the cron daemon is running now and enabled at boot.
function checkCrond() {
  # Ask systemd directly rather than scraping a fixed line of
  # "systemctl status" output, whose layout is not stable.
  if ! systemctl is-active --quiet crond.service; then
    service crond start
  fi
  systemctl enable crond.service
}

# Unpack the node exporter into ${AGENT_INSTALL_HOME}/node_exporter,
# register it as a systemd service and start it.
# Returns early (status 0) when the tarball is not present.
function installNodeExporter() {
  local exporter_home="${AGENT_INSTALL_HOME}/node_exporter"

  logInfo "start install node exporter..."
  if [[ ! -e "${PACKAGE_FULL_WAY}/${NODE_EXPORTER}" ]]; then
    logInfo "node exporter不存在!"
    return
  fi

  # Extract straight into the install directory, dropping the archive's
  # single top-level directory (replaces the former ls|grep + mv dance,
  # which broke when the target directory already held files).
  mkdir -p "${exporter_home}"
  tar -zxf "${PACKAGE_FULL_WAY}/${NODE_EXPORTER}" -C "${exporter_home}" \
    --strip-components=1 >/dev/null 2>&1
  checkResult $? "tar node exporter package error"

  cd "${exporter_home}" || logError "${exporter_home} 不存在!"

  # (Re)write the unit file. \$MAINPID is escaped so that systemd, not this
  # shell, substitutes it; unescaped it expanded to an empty string here.
  cat <<EOF >"${NODE_EXPORTER_UNIT_FILE}"
[Unit]
Description=node_exporter
After=network-online.target remote-fs.target nss-lookup.target
Wants=network-online.target

[Service]
Type=simple
ExecStart=${exporter_home}/node_exporter
ExecReload=/bin/kill -s HUP \$MAINPID
ExecStop=/bin/kill -s TERM \$MAINPID

[Install]
WantedBy=multi-user.target
EOF
  systemctl daemon-reload
  systemctl enable node_exporter.service
  systemctl start node_exporter.service

  # Register this exporter with the Prometheus server.
  # TODO: prometheus_ip is never populated, so only the first branch runs.
  local prometheus_ip=""
  if [[ ${prometheus_ip} = "" ]] || [[ ${prometheus_ip} = "null" ]]; then
    echo "下次一定!"
    # Disabled: read the Prometheus address from an external config file.
    # logInfo "nacos配置获取失败,开始从外部配置文件获取配置..."
    # i=0
    # temp=""
    # while true
    # do
    #   i=`expr $i + 1`
    #   temp=`sed -n "/^${i} /p" ${AGENT_INSTALL_HOME}/nacos_config | cut -d ' ' -f 2`
    #   if [[ ${temp} = "" ]];then break;fi;
    #   if [[ ${temp} =~ ^prometheus ]];then
    #     sed -n "/^${i} /,/^}$/p" ${AGENT_INSTALL_HOME}/nacos_config | sed -n -e '/^{$/,/^}$/p' | jq -r ".install_ip" > ip.txt
    #   fi
    # done
    # prometheus_ip=`cat ip.txt` && rm -rf ip.txt
  else
    rm -rf temp.json
    # shellcheck disable=SC2154
    sshpass -p "${linux_password}" ssh -n -o StrictHostKeyChecking=no root@"${prometheus_ip}" "cd /opt/sitesupport/prometheus-standalone &>/dev/null || exit;sh prometheusctl.sh add_exporter -j node-${localIp}-exporter -h ${localIp} -p 9100"
    return 0
  fi
}

# Create the install directory, copy the agent files into it and register
# the once-a-minute self-healing cron job.
function installAgent() {
  # Refuse to install over an existing directory.
  if [[ -e ${AGENT_INSTALL_HOME} ]]; then
    logError "安装目录[${AGENT_INSTALL_HOME}]已存在,请检查!"
  fi
  mkdir "${AGENT_INSTALL_HOME}"

  checkSshpass
  createSshkey
  checkCrond

  cp -f "${PACKAGE_FULL_WAY}/${JAR_NAME}" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/${SH_HOME}" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/common.sh" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/constant.sh" "${AGENT_INSTALL_HOME}"
  cp "${PACKAGE_FULL_WAY}/agent.db" "${AGENT_INSTALL_HOME}"
  chmod 755 "${AGENT_INSTALL_HOME}/${SH_HOME}"

  # Add the cron job. The command is written without backticks: wrapped in
  # backticks, cron would execute the script's *output* as a second command.
  # The script relies on bash features, so it is run with bash explicitly.
  local cron_entry="*/1 * * * * root cd ${AGENT_INSTALL_HOME} && bash ${SH_HOME} self_healing"
  if ! grep -qF -- "${cron_entry}" /etc/crontab; then
    printf '%s\n' "${cron_entry}" >>/etc/crontab
  fi
  logInfo "config jar finish"
}

# Full installation: pre-checks, agent files, first start, node exporter.
function install() {
  judgeMem 1024000
  checkDepend
  installAgent
  start
  installNodeExporter
}

# Print the banner shown once the agent answers on port 8888.
function print() {
  echo -e "====================== sitesupport-agent 启动完成 ======================
= private: http://${localIp}:8888 =
========================================================================"
}

# Start the agent and wait up to 60 seconds (polling every 5 seconds) for
# it to serve HTTP. On timeout the agent is killed and logError is called.
# NOTE(review): the JVM is started with a JDWP debug listener on port
# 18888, which self_healing does not do - confirm this is intended outside
# of development.
function start() {
  local step=5
  local res=1
  local i

  # Start the node exporter on every path. This used to sit after the wait
  # loop, where the success path had already returned, so a "restart" left
  # the exporter stopped. Skipped while the unit is not installed yet
  # (install calls start before installNodeExporter).
  if [[ -e "${NODE_EXPORTER_UNIT_FILE}" ]]; then
    systemctl start node_exporter.service
  fi

  for ((i = 0; i < 60; i += step)); do
    serviceIsAlive
    res=$?
    if [[ ${res} -eq 1 ]]; then
      # No agent process yet: launch it in the background.
      nohup java -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=18888 -jar "${AGENT_INSTALL_HOME}/${JAR_NAME}" >"${AGENT_INSTALL_HOME}/nohup.out" 2>&1 &
      logInfo "${JAR_NAME}服务启动中..."
    elif [[ ${res} -eq 2 ]]; then
      # Process exists but is not serving yet.
      logInfo "${JAR_NAME}服务启动中..."
    else
      logInfo "${JAR_NAME}服务已正常启动!"
      print
      return
    fi
    sleep "${step}"
  done

  # Timed out: remove whatever is left of the agent, then report failure.
  killAgent
  logError "${JAR_NAME}服务启动失败!i will kill it!!"
}

# Stop the agent and the node exporter.
function stop() {
  # Kill the agent whenever a process exists, including one that is hung or
  # still starting (serviceIsAlive status 2), which was previously skipped.
  if killAgent; then
    logInfo "${JAR_NAME}停止成功"
  else
    logInfo "${JAR_NAME}没有启动"
  fi

  # Stop the exporter through systemd when it is registered there.
  if [[ -e "${NODE_EXPORTER_UNIT_FILE}" ]]; then
    systemctl stop node_exporter.service
  fi

  # Fallback: kill a node_exporter still listening on port 9100.
  local node_pid=""
  node_pid=$(netstat -tnlp | grep 9100 | grep node_exporter | awk 'NR==1{print $7}' | cut -d '/' -f 1)
  if [[ -n "${node_pid}" ]]; then
    kill -9 "${node_pid}"
  fi
}

# Report the agent's state through the return status:
#   0 - process exists, owns a listening TCP port and answers on :8888
#   1 - no agent process
#   2 - process exists but has no listening port or does not answer
#       (it may still be starting, or it may have failed)
function serviceIsAlive() {
  setLocalIp

  local pids=""
  pids=$(agentPids)
  if [[ -z "${pids}" ]]; then
    return 1
  fi

  local listeners=""
  listeners=$(netstat -tlnp)

  # Match the "PID/program" column exactly; a bare grep for the PID also
  # matched port numbers and longer PIDs containing the same digits.
  local pid=""
  local has_port=1
  # shellcheck disable=SC2086
  for pid in ${pids}; do
    if printf '%s\n' "${listeners}" | grep -q "[[:space:]]${pid}/"; then
      has_port=0
      break
    fi
  done
  if [[ ${has_port} -ne 0 ]]; then
    return 2
  fi

  # Bound the probe so a hung agent cannot block this script forever.
  if ! curl --max-time 10 "http://${localIp}:8888" &>/dev/null; then
    return 2
  fi
  return 0
}

# Self-healing pass, designed to be driven by the once-a-minute cron job:
# polls the agent every 5 seconds for about a minute, relaunching it when
# no process exists, and kills it if it is still not serving at the end.
# NOTE(review): one pass takes slightly longer than 60 seconds, so
# consecutive cron runs can overlap - consider adding a lock.
function self_healing() {
  local step=5
  local res=1
  local unhealthy=1
  local i

  for ((i = 0; i < 60; i += step)); do
    serviceIsAlive
    res=$?
    if [[ ${res} -eq 1 ]]; then
      logInfo "${JAR_NAME}服务开始启动!"
      nohup java -jar "${AGENT_INSTALL_HOME}/${JAR_NAME}" >"${AGENT_INSTALL_HOME}/nohup.out" 2>&1 &
    elif [[ ${res} -eq 2 ]]; then
      logInfo "${JAR_NAME}服务启动中..."
      unhealthy=1
    else
      logInfo "${JAR_NAME}服务已正常启动!"
      unhealthy=0
    fi
    sleep "${step}"
  done

  if [[ ${unhealthy} -eq 1 ]]; then
    # killAgent is a no-op when the process is already gone.
    killAgent
    logInfo "${JAR_NAME}服务启动失败!i will kill it!!"
  fi
}

# Stop everything and remove the install directory, the cron job and the
# node exporter's systemd unit.
function uninstall() {
  stop

  rm -rf -- "${AGENT_INSTALL_HOME:?}"
  # Try once more in case something was left behind.
  if [[ -e ${AGENT_INSTALL_HOME} ]]; then
    rm -rf -- "${AGENT_INSTALL_HOME:?}"
  fi

  # Remove the cron job. /etc/crontab is data, not a shell script, so it
  # must not be sourced afterwards (the former "source /etc/crontab" ran
  # its lines as shell commands).
  sed -i '/agentctl.sh/d' /etc/crontab

  # Remove the exporter unit: disable it first so no dangling enablement
  # symlink remains, then let systemd forget it.
  if [[ -e "${NODE_EXPORTER_UNIT_FILE}" ]]; then
    systemctl disable node_exporter.service &>/dev/null
  fi
  rm -rf -- "${NODE_EXPORTER_UNIT_FILE}"
  systemctl daemon-reload
}

# Expose serviceIsAlive's status (0, 1 or 2) as the command's exit status.
function check_status() {
  serviceIsAlive
}

case "${1:-}" in
  start)
    start
    ;;
  stop)
    stop
    ;;
  restart)
    stop
    start
    ;;
  install)
    install
    ;;
  uninstall)
    uninstall
    ;;
  check_status)
    check_status
    ;;
  self_healing)
    self_healing
    ;;
  *)
    logError "Usage: $0 {start|stop|restart|install|uninstall|check_status|self_healing} {..}"
    ;;
esac