• pacemaker+corosync 实现 PG HA


    环境 IP 规划

    node01 10.10.10.11
    node02 10.10.10.12
    vip-master 10.10.10.13
    

    建议还是使用 172.17.0.xx 组网, 实际测试在宿主机只能 ping 通 10.10.10.11, 不能 ping 通 12 与 13, 原因可能与自定义子网的网关配置有关(网关不应使用网络地址 10.10.10.0, 见下文)

    创建子网

     # Create a user-defined bridge network for the cluster nodes.
     # Fixed: the original used "--subnet 10.10.10.0/16 --gateway 10.10.10.0".
     # 10.10.10.0 is the *network address* and is not a usable gateway, and a
     # /16 containing 10.10.10.0 is really 10.10.0.0/16. Use a /24 matching
     # the IP plan (10.10.10.11/12/13) and a proper gateway host address.
     docker network create --driver bridge --subnet 10.10.10.0/24 --gateway 10.10.10.1 pg_network
    

    申请两台虚拟机

    # Start the two "virtual machines" as systemd-enabled containers.
    # Port mappings: 22 (ssh), 3389 (rdp), 5432 (PostgreSQL).
    # --privileged + /usr/sbin/init are required so systemd (and thus
    # pcsd/corosync/pacemaker) can run inside the container.
    docker run -d --name node01 -h node01 -p 60555:22  -p 60556:3389  -p 60557:5432  --net=pg_network --ip 10.10.10.11 --privileged=true  lxmpg14.1:1.0 /usr/sbin/init
     
    docker run -d --name node02 -h node02 -p 60558:22  -p 60559:3389  -p 60560:5432  --net=pg_network --ip 10.10.10.12 --privileged=true  lxmpg14.1:1.0 /usr/sbin/init
    

    修改 /etc/hosts [ALL]

    # Run on ALL nodes: name resolution for both nodes and the floating VIP.
    echo "10.10.10.11     node01" >> /etc/hosts
    echo "10.10.10.12     node02" >> /etc/hosts
    echo "10.10.10.13     vip-master" >> /etc/hosts
    

    永久关闭防火墙并禁止开机启动与Selinux

    # Run on ALL nodes: stop/disable firewalld so corosync/pcsd/PostgreSQL
    # traffic is not blocked between nodes.
    systemctl stop firewalld.service
    systemctl disable firewalld.service
    systemctl status firewalld.service
    # setenforce 0 disables SELinux for the current boot only; the sed edit
    # makes it permanent across reboots.
    setenforce 0 
    sed -i '/^SELINUX=/c\SELINUX=disabled' /etc/selinux/config
    

    node01

    # Generate an RSA key pair non-interactively (empty passphrase, default
    # path) and push the public key to node02 for passwordless root ssh.
    # The original bare "ssh-keygen -t rsa" stops to prompt for a file name
    # and passphrase, which breaks copy-paste/scripted setup.
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
    ssh-copy-id -i ~/.ssh/id_rsa.pub root@node02
    

    node02

    # Same as on node01: non-interactive key generation, then push the public
    # key to node01 for passwordless root ssh in the other direction.
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
    ssh-copy-id -i ~/.ssh/id_rsa.pub root@node01
    

    通过 node01 与 node02 搭建 PG14.1 主从流复制环境

    • 具体过程, 可以参考之前文章或者其他文章
    • 软件与数据日志路径等如下所示
    [postgres@node01 ~]$ tree -L 1 /home/postgres/
    /home/postgres/
    ├── pgarchive
    ├── pgbackup
    ├── pgdata
    ├── pglog
    ├── pgscript
    └── pgsql
    6 directories, 0 files
    
    • PG 配置
    # Append replication/archiving settings to postgresql.conf.
    # The quoted "EOF" delimiter suppresses shell expansion, so the %p/%f
    # placeholders in archive_command are written literally (required --
    # they are expanded later by PostgreSQL itself, not by the shell).
    cat >> $PGDATA/postgresql.conf << "EOF"
    listen_addresses = '*'
    port=5432
    logging_collector = on
    log_directory = '/home/postgres/pglog'
    log_filename = 'postgresql-%a.log'
    log_truncate_on_rotation = on
    archive_mode='on'
    archive_command='test ! -f /home/postgres/pgarchive/%f && cp %p /home/postgres/pgarchive/%f'
    max_wal_senders=10
    wal_sender_timeout=60s
    EOF
    
    • PG防火墙配置
    [postgres@node01 ~]$ cat pgdata/pg_hba.conf 
    # TYPE  DATABASE    USER    ADDRESS       METHOD
    local     all       all                    trust
    host      all       all   127.0.0.1/32     trust
    host      all       all    0.0.0.0/0        md5
    host   replication  all    0.0.0.0/0        md5
    [postgres@node01 ~]$ 
    
    • 创建流复制用户
    create user repuser replication password 'repuser';
    
    • 确保单机可以正常启动, 测试完成之后, 将 PG 关闭
    [root@node01 ~]# rm -f /home/postgres/pgdata/recovery.conf 
    [root@node01 ~]# su - postgres
    Last login: Sun Jun  5 13:38:20 UTC 2022 on pts/1
    [postgres@node01 ~]$ pg_ctl start
    waiting for server to start....2022-06-05 13:39:22.599 UTC [10145] LOG:  redirecting log output to logging collector process
    2022-06-05 13:39:22.599 UTC [10145] HINT:  Future log output will appear in directory "/home/postgres/pglog".
     done
    server started
    [postgres@node01 ~]$ pg_ctl stop
    waiting for server to shut down.... done
    
    
    • 删除残留文件(如果有的话)
    rm -f /home/postgres/pgdata/recovery.conf
    

    安装集群软件[all]

    # Run on ALL nodes: cluster stack (corosync messaging, pacemaker resource
    # manager, pcs CLI, OCF resource agents).
    yum -y install corosync pacemaker  pcs resource-agents
    

    设置 pcsd 开机自动启动(可选)[all]

    # Run on ALL nodes: pcsd must be running for pcs cluster auth/setup below.
    systemctl enable  pcsd
    systemctl start   pcsd 
    systemctl status  pcsd
    

    集群认证与创建 [node01]

    # Set the hacluster account password (must be done on EVERY node before
    # auth succeeds), then authenticate and create the cluster from node01.
    # NOTE(review): this is the pcs 0.9 (el7) syntax; pcs 0.10+ uses
    # "pcs host auth" / "pcs cluster setup <name> <nodes>" instead.
    echo "hacluster:hacluster" | chpasswd
    pcs cluster auth node01 node02 -u hacluster -p hacluster --force
    pcs cluster setup --name cluster_test01 node01 node02 --force
    

    创建集群配置脚本 [node01]

    [root@node01 ~]# cat  lxm_cluster_setup.sh
    
    # Stage all changes in a working CIB file and push them atomically at the
    # end, so the cluster never sees a half-configured state.
    pcs cluster cib pgsql_cfg
    
    # Two-node cluster: losing one node always loses quorum, so ignore it.
    # STONITH is disabled because no fence device exists in this docker lab
    # (NOTE(review): not safe for production -- split-brain is possible).
    pcs -f pgsql_cfg property set no-quorum-policy="ignore"
    pcs -f pgsql_cfg property set stonith-enabled="false"
    # Keep resources where they are after recovery; fail over after 1 failure.
    pcs -f pgsql_cfg resource defaults resource-stickiness="INFINITY"
    pcs -f pgsql_cfg resource defaults migration-threshold="1"
    
    # Floating VIP that always follows the PostgreSQL master.
    pcs -f pgsql_cfg resource create vip-master IPaddr2 \
       ip="10.10.10.13" \
       nic="eth0" \
       cidr_netmask="24" \
       op start   timeout="60s" interval="0s"  on-fail="restart" \
       op monitor timeout="60s" interval="10s" on-fail="restart" \
       op stop    timeout="60s" interval="0s"  on-fail="restart"
    
    # PostgreSQL managed by the ocf:heartbeat:pgsql agent in async
    # streaming-replication mode.
    # Fixed: pgctl path contained a stray double slash ("bin//pg_ctl").
    pcs -f pgsql_cfg resource create pgsql pgsql \
       pgctl="/home/postgres/pgsql/bin/pg_ctl" \
       psql="/home/postgres/pgsql/bin/psql" \
       pgdata="/home/postgres/pgdata" \
       config="/home/postgres/pgdata/postgresql.conf" \
       rep_mode="async" \
       node_list="node01 node02" \
       master_ip="10.10.10.13" \
       repuser="repuser" \
       primary_conninfo_opt="password=repuser keepalives_idle=60 keepalives_interval=5 keepalives_count=5" \
       restart_on_promote='true' \
       op start   timeout="60s" interval="0s"  on-fail="restart" \
       op monitor timeout="60s" interval="4s"  on-fail="restart" \
       op monitor timeout="60s" interval="3s"  on-fail="restart" role="Master" \
       op promote timeout="60s" interval="0s"  on-fail="restart" \
       op demote  timeout="60s" interval="0s"  on-fail="stop" \
       op stop timeout="60s" interval="0s"  on-fail="block" \
       op notify  timeout="60s" interval="0s"
    
    # Promotable (master/slave) clone of pgsql; exactly one master at a time.
    # NOTE(review): clone-max=5 exceeds the 2-node count; harmless, but
    # clone-max=2 would match the cluster size. Kept as-is to match the
    # "6 resource instances configured" status output shown elsewhere.
    pcs -f pgsql_cfg resource master msPostgresql pgsql \
       master-max=1 master-node-max=1 clone-max=5 clone-node-max=1 notify=true
    
    # Put the VIP in a group so constraints can target it as one unit.
    pcs -f pgsql_cfg resource group add master-group vip-master
    
    # VIP is colocated with the master, starts only after promote, and is
    # stopped on demote (asymmetric ordering via symmetrical=false).
    pcs -f pgsql_cfg constraint colocation add master-group with master msPostgresql INFINITY
    pcs -f pgsql_cfg constraint order promote msPostgresql then start master-group symmetrical=false score=INFINITY
    pcs -f pgsql_cfg constraint order demote  msPostgresql then stop  master-group symmetrical=false score=0
    
    
    # Apply the staged configuration to the live cluster.
    pcs cluster cib-push pgsql_cfg
    

    替换 PG 的 RA 脚本 [ALL]

    对于 PG12 及其以上版本, 必须使用非默认的 pgsql 代理脚本
    脚本获取路径:

    # Download the *raw* resource-agent script. The original URL
    # (github.com/.../tree/main/heartbeat/pgsql) returns the GitHub HTML page,
    # not the script, which would install a broken resource agent.
    wget https://raw.githubusercontent.com/ClusterLabs/resource-agents/main/heartbeat/pgsql
    # -b keeps a backup of the distro-shipped agent; the downloaded file is
    # not executable, so restore the mode the RA directory expects.
    cp -bf pgsql /usr/lib/ocf/resource.d/heartbeat/
    chmod 755 /usr/lib/ocf/resource.d/heartbeat/pgsql
    
    

    启动 corosync 与 pacemaker [node01]

    systemctl start corosync pacemaker
    经过测试, 此处使用 pcs cluster start并不能将 PG正常拉起, 原因不明
    

    查看集群状态

    [root@node01 ~]# crm_mon -A1
    Stack: corosync
    Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition WITHOUT quorum
    Last updated: Sun Jun  5 13:57:20 2022
    Last change: Sun Jun  5 13:56:39 2022 by root via crm_attribute on node01
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node01 ]
    OFFLINE: [ node02 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node01 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node01
    
    Node Attributes:
    * Node node01:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 00000000160000A0
        + pgsql-status                      : PRI       
    [root@node01 ~]# 
    

    node02 建立备库

    # Rebuild the node02 standby from a fresh base backup of node01 (current
    # master): plain format, progress/verbose, WAL via streaming (-Xs).
    rm -rf /home/postgres/pgdata
    /home/postgres/pgsql/bin/pg_basebackup -U repuser -h node01 -Fp -Pv -Xs -D /home/postgres/pgdata
    # Fixed: use the POSIX "user:group" separator; the historical
    # "user.group" dot form is deprecated in GNU coreutils chown.
    chown -R postgres:postgres /home/postgres/pgdata
    

    node02 启动集群

    systemctl start pcsd corosync pacemaker
    

    node01 上查看集群状态

    [root@node01 ~]# crm_mon -A1
    Stack: corosync
    Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
    Last updated: Sun Jun  5 14:00:54 2022
    Last change: Sun Jun  5 14:00:47 2022 by root via crm_attribute on node01
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node01 node02 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node01 ]
         Slaves: [ node02 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node01
    
    Node Attributes:
    * Node node01:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 00000000160000A0
        + pgsql-status                      : PRI       
    * Node node02:
        + master-pgsql                      : 100       
        + pgsql-data-status                 : STREAMING|ASYNC
        + pgsql-status                      : HS:async  
    [root@node01 ~]# 
    

    测试故障转移

    模拟数据库异常停止

    node01:
    su - postgres
    pg_ctl stop
    exit
    crm_mon -A1
    [root@node01 ~]# 
    [root@node01 ~]# crm_mon -A1
    Stack: corosync
    Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
    Last updated: Sun Jun  5 14:10:26 2022
    Last change: Sun Jun  5 14:09:48 2022 by root via crm_attribute on node02
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node01 node02 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node02 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node02
    
    Node Attributes:
    * Node node01:
        + master-pgsql                      : -INFINITY 
        + pgsql-data-status                 : DISCONNECT
        + pgsql-status                      : STOP      
    * Node node02:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 00000000190000A0
        + pgsql-status                      : PRI       
    
    Failed Resource Actions:
    * pgsql_monitor_3000 on node01 'not running' (7): call=18, status=complete, exitreason='',
        last-rc-change='Sun Jun  5 14:09:35 2022', queued=0ms, exec=0ms
    [root@node01 ~]# 
    

    可以看到, vip-master漂移到 node02 上, node02上的 PG 变为了 master 节点

    • 将 node01 PG修复, 重启 集群服务, 其实 node01上 PG 作为备库, 如果无法正常进入 备库模式, 需要手动重建备库
    # The pgsql RA leaves a PGSQL.lock file behind after an unclean stop and
    # refuses to start PostgreSQL while it exists, so remove it before
    # restarting the cluster stack.
    rm -f /var/lib/pgsql/tmp/PGSQL.lock
    systemctl restart pacemaker corosync pcsd
    
    [root@node01 ~]# crm_mon -A1
    Stack: corosync
    Current DC: node02 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
    Last updated: Sun Jun  5 14:13:41 2022
    Last change: Sun Jun  5 14:13:18 2022 by root via crm_attribute on node02
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node01 node02 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node02 ]
         Slaves: [ node01 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node02
    
    Node Attributes:
    * Node node01:
        + master-pgsql                      : 100       
        + pgsql-data-status                 : STREAMING|ASYNC
        + pgsql-status                      : HS:async  
    * Node node02:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 00000000190000A0
        + pgsql-status                      : PR
    

    可以看到 node01 上 PG 重新变为了备库

    模拟集群服务异常停止

    • node02 将 集群服务停止
    systemctl stop pacemaker corosync pcsd
    
    • node01 上查看集群状态
    [root@node01 ~]# crm_mon -A1
    Stack: corosync
    Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
    Last updated: Sun Jun  5 14:15:10 2022
    Last change: Sun Jun  5 14:15:00 2022 by root via crm_attribute on node01
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node01 ]
    OFFLINE: [ node02 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node01 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node01
    
    Node Attributes:
    * Node node01:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 000000001A0000A0
        + pgsql-status                      : PRI       
    [root@node01 ~]# 
    

    可以看到 node01 上 PG 变为了主库, vip也漂移到了主库

    • 修复 node02 , 重启 集群服务
     # Remove the stale PGSQL.lock left by the pgsql RA after the unclean
     # stop, otherwise the agent refuses to start PostgreSQL on this node.
     rm -f /var/lib/pgsql/tmp/PGSQL.lock 
     systemctl restart pcsd corosync pacemaker
    
    • node01 查看集群状态
    [root@node01 ~]# crm_mon -A1
    Stack: corosync
    Current DC: node01 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
    Last updated: Sun Jun  5 14:17:10 2022
    Last change: Sun Jun  5 14:17:01 2022 by root via crm_attribute on node01
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node01 node02 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node01 ]
         Slaves: [ node02 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node01
    
    Node Attributes:
    * Node node01:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 000000001A0000A0
        + pgsql-status                      : PRI       
    * Node node02:
        + master-pgsql                      : 100       
        + pgsql-data-status                 : STREAMING|ASYNC
        + pgsql-status                      : HS:async  
    

    node02 上 PG 进入备模式

    模拟主机异常宕机

    • 停止 node01
    在宿主机上, 将 node01 容器停止, 类似于 node01 宕机
    docker stop node01
    
    • 在 node02 上查看集群状态
    [root@node02 ~]# crm_mon -A1
    Stack: corosync
    Current DC: node02 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
    Last updated: Sun Jun  5 14:19:05 2022
    Last change: Sun Jun  5 14:18:44 2022 by root via crm_attribute on node02
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node02 ]
    OFFLINE: [ node01 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node02 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node02
    
    Node Attributes:
    * Node node02:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 000000001A000150
        + pgsql-status                      : PRI       
    [root@node02 ~]# 
    [root@node02 ~]# 
    

    可以看到 vip 漂移到了 node02 上, 并且 node02 上的 PG 现在变为了主库, 可见三种情况都能实现自动故障转移

    • 模拟 修复 node01
    宿主机: docker start node01
    node01:
    rm -f /var/lib/pgsql/tmp/PGSQL.lock 
    systemctl start pcsd corosync pacemaker
    [root@node01 /]# crm_mon -A1
    Stack: corosync
    Current DC: node02 (version 1.1.23-1.el7_9.1-9acf116022) - partition with quorum
    Last updated: Sun Jun  5 14:22:55 2022
    Last change: Sun Jun  5 14:22:46 2022 by root via crm_attribute on node02
    
    2 nodes configured
    6 resource instances configured
    
    Online: [ node01 node02 ]
    
    Active resources:
    
     Master/Slave Set: msPostgresql [pgsql]
         Masters: [ node02 ]
         Slaves: [ node01 ]
     Resource Group: master-group
         vip-master (ocf::heartbeat:IPaddr2):       Started node02
    
    Node Attributes:
    * Node node01:
        + master-pgsql                      : 100       
        + pgsql-data-status                 : STREAMING|ASYNC
        + pgsql-status                      : HS:async  
    * Node node02:
        + master-pgsql                      : 1000      
        + pgsql-data-status                 : LATEST    
        + pgsql-master-baseline             : 000000001A000150
        + pgsql-status                      : PR 
    
  • 相关阅读:
    JSP基础
    线程控制
    多线程简述
    Servlet生命周期
    同步代码块和同步方法有什么区别?
    java.sql.Date和java.util.Date的区别
    Java自学指南五、编码工具
    基本类型和包装类的区别?
    什么是包装类?为什么要有包装类?基本类型与包装类如何转换?
    什么是Java的垃圾回收机制?
  • 原文地址:https://www.cnblogs.com/yldf/p/16345382.html
Copyright © 2020-2023  润新知