• LVM disappearing after a Ceph restart: a record of the lost LVM metadata


    LVM disappearing after a Ceph restart

    After restarting the Ceph cluster, the LVM volumes on several machines vanished for no obvious reason.

    The cluster promptly blew up, so the first move was to set the nodown, noout, and norecover flags, as sketched below.
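
    A minimal sketch of setting (and later clearing) those flags:

    # keep the cluster from marking OSDs down/out or starting recovery while we investigate
    ceph osd set nodown
    ceph osd set noout
    ceph osd set norecover

    # once the OSDs are back, clear the flags again
    ceph osd unset nodown
    ceph osd unset noout
    ceph osd unset norecover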

    vgscan and pvscan found nothing, and the corresponding OSDs would not start.

    [root@ceph-62 ~]# lsblk
    NAME            MAJ:MIN RM   SIZE RO TYPE MOUNTPOINT
    sda               8:0    0   9.1T  0 disk 
    sdb               8:16   0   9.1T  0 disk 
    sdc               8:32   0   9.1T  0 disk 
    sdd               8:48   0   9.1T  0 disk 
    sde               8:64   0   9.1T  0 disk 
    sdf               8:80   0   9.1T  0 disk 
    sdg               8:96   0   9.1T  0 disk 
    sdh               8:112  0   9.1T  0 disk 
    sdi               8:128  0   9.1T  0 disk 
    sdj               8:144  0   9.1T  0 disk 
    sdk               8:160  0   9.1T  0 disk 
    sdl               8:176  0   9.1T  0 disk 
    sdm               8:192  0   9.1T  0 disk 
    sdn               8:208  0   9.1T  0 disk 
    sdo               8:224  0   9.1T  0 disk 
    sdp               8:240  0   9.1T  0 disk 
    sdq              65:0    0   9.1T  0 disk 
    sdr              65:16   0   9.1T  0 disk 
    sds              65:32   0   9.1T  0 disk 
    sdt              65:48   0 223.6G  0 disk 
    ├─sdt1           65:49   0   200M  0 part /boot/efi
    ├─sdt2           65:50   0     1G  0 part /boot
    └─sdt3           65:51   0 222.4G  0 part 
      ├─centos-root 253:0    0    50G  0 lvm  /
      ├─centos-swap 253:1    0     4G  0 lvm  [SWAP]
      └─centos-home 253:2    0 168.4G  0 lvm  /home
    sdu              65:64   0   9.1T  0 disk 
    
    

    One look and it's grim. A healthy node looks like this:

    [root@ceph-59 ~]# lsblk
    NAME                                                                                                 MAJ:MIN RM   SIZE RO TYPE MOUNTPOINT
    sda                                                                                                    8:0    0   9.1T  0 disk 
    └─ceph--ebc2b2fc--be8e--48af--9729--8149f9a3960e-osd--data--c7ac8edc--4187--46ca--9569--b35faaf1dd9c 253:13   0   9.1T  0 lvm  
    sdb                                                                                                    8:16   0   9.1T  0 disk 
    └─ceph--3e5b8aba--ec7e--4448--b8c5--ea3e3fe7b4a5-osd--data--11b74eff--5a3c--432e--8ec9--aa776c9d4c64 253:14   0   9.1T  0 lvm  
    sdc                                                                                                    8:32   0   9.1T  0 disk 
    └─ceph--8800da1a--2764--4370--b8d7--8a0b1332bfc4-osd--data--fd68fe83--b142--4b55--a3d4--aa7b728febec 253:15   0   9.1T  0 lvm  
    sdd                                                                                                    8:48   0   9.1T  0 disk 
    └─ceph--ebd8ee23--cada--4717--8933--3cc5cdbb9840-osd--data--603de199--c6e3--4a78--a2cf--1cda25bd3d02 253:9    0   9.1T  0 lvm  
    sde                                                                                                    8:64   0   9.1T  0 disk 
    └─ceph--ce85d728--29a1--4faa--92f4--5288801cf7c0-osd--data--4dfb9363--52bf--498b--9417--9624b99f0a95 253:10   0   9.1T  0 lvm  
    sdf                                                                                                    8:80   0   9.1T  0 disk 
    └─ceph--9dbd5617--aa78--45cb--b80a--e33944890518-osd--data--c1cd41dd--3042--4949--9551--c081b2fe418d 253:11   0   9.1T  0 lvm  
    sdg                                                                                                    8:96   0   9.1T  0 disk 
    └─ceph--3ce3bcda--0250--44cd--ac93--50d585ef5ad5-osd--data--101b4271--cf4c--4ec5--aa53--4627d4feb698 253:12   0   9.1T  0 lvm  
    sdh                                                                                                    8:112  0   9.1T  0 disk 
    └─ceph--86368e74--660f--4772--8844--bf9c29e4e730-osd--data--3bc3cc6c--5850--4c83--a67a--51b024addce6 253:19   0   9.1T  0 lvm  
    sdi                                                                                                    8:128  0   9.1T  0 disk 
    └─ceph--79778284--af9a--416a--814b--22bb1c732794-osd--data--fcd6b24d--e48e--4315--9bcd--d54fba335815 253:20   0   9.1T  0 lvm  
    sdj                                                                                                    8:144  0   9.1T  0 disk 
    └─ceph--bf7c58a5--a9e5--4b77--81c3--ee75634754f7-osd--data--8e10bfa5--ce46--46fe--ba2d--ba70a78eb17b 253:21   0   9.1T  0 lvm  
    
    
    • First, check /etc/lvm/backup/*

      [root@ceph-59 ~]# ls /etc/lvm/backup/
      centos                                     ceph-3e5b8aba-ec7e-4448-b8c5-ea3e3fe7b4a5  ceph-9dbd5617-aa78-45cb-b80a-e33944890518  ceph-ebd8ee23-cada-4717-8933-3cc5cdbb9840
      ceph-0033060c-5010-4bd5-9859-78ffe5ceff27  ceph-6e4dedf0-130c-4b76-a3be-d2520b69f522  ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a  ceph-f3220c11-eb66-45ba-9525-fb7017c45d4f
      ceph-28354762-18d9-4302-b11c-946832b7dceb  ceph-716552a4-af1b-4ceb-b034-a31195152391  ceph-bf7c58a5-a9e5-4b77-81c3-ee75634754f7  ceph-f95e3fc6-c7ee-48e8-bc70-ab40e995a868
      ceph-2be8cb74-f986-4893-b3f6-784e1b128e01  ceph-79778284-af9a-416a-814b-22bb1c732794  ceph-c7094bd4-d4a4-427c-add7-0eabd83ee1ba
      ceph-36ef94cd-7f9a-4ecc-9d12-4268289d60cd  ceph-86368e74-660f-4772-8844-bf9c29e4e730  ceph-ce85d728-29a1-4faa-92f4-5288801cf7c0
      ceph-3ce3bcda-0250-44cd-ac93-50d585ef5ad5  ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4  ceph-ebc2b2fc-be8e-48af-9729-8149f9a3960e
      
      

      **It's better to look at ls /etc/lvm/archive/ instead,**

      because it keeps more metadata versions per VG (see the vgcfgrestore --list example after the listing below).

      [root@ceph-59 ~]# ls /etc/lvm/archive/*
      /etc/lvm/archive/centos_00000-1556371822.vg                                     /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00002-498819197.vg
      /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00000-191613742.vg   /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00003-196229818.vg
      /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00001-1071232553.vg  /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00004-1508357692.vg
      /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00002-1833047220.vg  /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00005-482948107.vg
      /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00003-1002373647.vg  /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00006-695412169.vg
      /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00004-882636099.vg   /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00000-1186551457.vg
      /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00005-645028611.vg   /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00001-1556927714.vg
      /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00006-2143351603.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00002-1272062293.vg
      /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00000-1313439840.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00003-163927944.vg
      /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00001-1936313617.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00004-427518662.vg
      /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00002-1765261441.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00005-544615481.vg
      /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00003-1532647703.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00006-671781823.vg
      /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00004-1642162971.vg  /etc/lvm/archive/ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a_00000-2137077032.vg
      /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00005-66156891.vg    /etc/lvm/archive/ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a_00001-533416413.vg
      /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00006-151645212.vg   /etc/lvm/archive/ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a_00002-266310388.vg
      /etc/lvm/archive/ceph-2be8cb74-f986-4893-b3f6-784e1b128e01_00000-1665211260.vg 
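
      If you only want to see which archived versions exist for a single VG, vgcfgrestore --list prints each archive together with its timestamp and the command that created it (the VG name here is just one of the examples above):

      # list the archived/backup metadata versions for one VG
      vgcfgrestore --list ceph-0033060c-5010-4bd5-9859-78ffe5ceff27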
      
    • Open one of them and look at the contents

      [root@ceph-59 ~]# cat /etc/lvm/backup/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27 
      # Generated by LVM2 version 2.02.185(2)-RHEL7 (2019-05-13):
      
      contents = "Text Format Volume Group"
      version = 1
      
      description = "Created *after* executing 'vgchange -ay ceph-0033060c-5010-4bd5-9859-78ffe5ceff27'"
      
      creation_host = "ceph-59"       # Linux ceph-59 3.10.0-1062.el7.x86_64 #1 SMP Wed Aug 7 18:08:02 UTC 2019 x86_64
      creation_time = 1611333863     
      
      ceph-0033060c-5010-4bd5-9859-78ffe5ceff27 {
              id = "WIT8wF-kJdI-SS3E-5ue8-e3Ws-Awk3-AnkDkz"
              seqno = 6
              format = "lvm2"                 # informational
              status = ["RESIZEABLE", "READ", "WRITE"]
              flags = []
              extent_size = 8192              # 4 Megabytes
              max_lv = 0
              max_pv = 0
              metadata_copies = 0
      
              physical_volumes {
      
                      pv0 {
                              id = "EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe"
                              device = "/dev/sdn"     # Hint only
      
                              status = ["ALLOCATABLE"]
                              flags = []
                              dev_size = 19532873728  # 9.0957 Terabytes
                              pe_start = 2048
                              pe_count = 2384383      # 9.0957 Terabytes
                      }
              }
      
              logical_volumes {
      
                      osd-data-d9fa7e7d-57f6-437c-9b2b-c9652ec0697d {
                              id = "WtB7QP-qOxT-hV2b-3a2X-zPFS-09aP-HVKbc0"
                              status = ["READ", "WRITE", "VISIBLE"]
                              flags = []
                              tags = ["ceph.osdspec_affinity=", "ceph.vdo=0", "ceph.osd_id=208", "ceph.osd_fsid=c71e53d3-97e8-4f97-a356-1a4bb1187d45", "ceph.cluster_name=ceph", "ceph.cluster_fsid=9f4c8519-821e-434a-9e85-cdef908b808c", "ceph.encrypted=0", "ceph.cephx_lockbox_secret=", "ceph.type=block", "ceph.crush_device_class=None", "ceph.block_device=/dev/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27/osd-data-d9fa7e7d-57f6-437c-9b2b-c9652ec0697d", "ceph.block_uuid=WtB7QP-qOxT-hV2b-3a2X-zPFS-09aP-HVKbc0"]
                              creation_time = 1601402387      # 2020-09-30 01:59:47 +0800
                              creation_host = "ceph-59"
                              segment_count = 1
      
                              segment1 {
                                      start_extent = 0
                                      extent_count = 2384383  # 9.0957 Terabytes
      
                                      type = "striped"
                                      stripe_count = 1        # linear
      
                                      stripes = [
                                              "pv0", 0
                                      ]
                              }
                      }
              }
      
      }
      
    • Pick out the parts we'll use in a moment

                      pv0 {
                              id = "EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe"
                              device = "/dev/sdn"     # Hint only
      

      device is the device name (a hint only); id is the PV UUID.

                              tags = ["ceph.osdspec_affinity=", "ceph.vdo=0", "ceph.osd_id=208", "ceph.osd_fsid=c71e53d3-97e8-4f97-a356-1a4bb1187d45", "ceph.cluster_name=ceph", "ceph.cluster_fsid=9f4c8519-821e-434a-9e85-cdef908b808c", "ceph.encrypted=0", "ceph.cephx_lockbox_secret=", "ceph.type=block", "ceph.crush_device_class=None", "ceph.block_device=/dev/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27/osd-data-d9fa7e7d-57f6-437c-9b2b-c9652ec0697d", "ceph.block_uuid=WtB7QP-qOxT-hV2b-3a2X-zPFS-09aP-HVKbc0"]
      
      

      ceph.osd_id is the OSD id and ceph.osd_fsid is the OSD fsid.
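
    Putting those pieces together, the manual recovery of one VG looks roughly like this (a sketch using the example values from the backup above; this is essentially what the script below automates):

    # wipe the first few KB of the device so pvcreate will accept it
    dd if=/dev/zero of=/dev/sdn bs=1k count=6
    # recreate the PV with its original UUID, taking the layout from the backup file
    pvcreate --force --uuid EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe \
        --restorefile /etc/lvm/backup/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27 /dev/sdn
    # restore the VG metadata and activate the VG
    vgcfgrestore ceph-0033060c-5010-4bd5-9859-78ffe5ceff27
    vgchange -ay ceph-0033060c-5010-4bd5-9859-78ffe5ceff27
    # mount the OSD directory via ceph-volume, then start the OSD (id/fsid come from the tags)
    systemctl start ceph-volume@lvm-208-c71e53d3-97e8-4f97-a356-1a4bb1187d45
    systemctl start ceph-osd@208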

    Below is a quick-and-dirty script I threw together. It skips any VG that still exists, restores the missing VGs from the backup metadata, then mounts them, starts the OSD services, and rejoins the OSDs to the cluster.

    import os
    import re
    import time
    
    # pv0 section: capture the PV UUID and the device hint from the backup file
    rea=r'id = "([\w-]+)"\s*device = "([\w/]*)"'
    # LV tags: capture ceph.osd_id and ceph.osd_fsid
    reb=r'ceph\.osd_id=(\d+)", "ceph\.osd_fsid=([\w-]+)"'
    
    
    
    
        # with open(r'C:\Users\makeit\Desktop\ceph-0033060c-5010-4bd5-9859-78ffe5ceff27','r') as f:
    
    
    def getAllLvmBackFile():
        backs=os.listdir('/etc/lvm/backup/')
        flist=[]
        for i in backs:
            if  'ceph' in i:
                flist.append(os.path.join('/etc/lvm/backup/',i))
        return flist
    
    def getTargetLvms():
        flist=getAllLvmBackFile()
        cmd_get_old_vgs='''vgs|grep ceph|awk '{print $1}' >/tmp/oldlvms'''
        os.system(cmd_get_old_vgs)
        # VGs that are still present on this host; their backup files will be skipped
        goodlvms=[]
        with open('/tmp/oldlvms','r') as f:
            goodlvms=f.readlines()
        for l in goodlvms:
            l=l.strip()
            for i in flist:
                # print(i,l,i.find(l))
                if i.find(l)!=-1:
                    print('remove ',i)
                    flist.remove(i)
                    break
    
        return flist
    
    
    
    
    def fixOneOSD(filename):
        content=''
        with open(filename,'r') as f:
            content=f.read()
    
        r=re.search(rea,content)
        uuid=''
        dev=''
        if r:
            uuid=r.group(1).replace('\r','').replace('\n','')
            dev=r.group(2)
            print(uuid,dev)
    
        # recover the LVM: zero the first few KB so pvcreate --force accepts the device,
        # then recreate the PV with its old UUID and restore/activate the VG metadata
    
        cmd_clear_part='dd if=/dev/zero of={} bs=1k count=6'.format(dev)
        os.system(cmd_clear_part)
        print('create pv ')
        cmd_create_new_pv='pvcreate --force --uuid {}  --restorefile {} {}'.format(uuid,filename,dev)
        cmd_restore_vg='vgcfgrestore {}'.format(os.path.split(filename)[1])
        cmd_active_vg='vgchange -ay {}'.format(os.path.split(filename)[1])
        os.system(cmd_create_new_pv)
        print('cmd_restore_vg ')
        os.system(cmd_restore_vg)
        print('cmd_active_vg')
        os.system(cmd_active_vg)
    
    
        # start osd 
        r=re.search(reb,content)
        osdid=''
        osdfsid=''
        if r:
            osdid=r.group(1)
            osdfsid=r.group(2)
            print(osdid,osdfsid)
        
        # mount the OSD directory via ceph-volume, then start and enable the OSD daemon
        cmd_start_mount='systemctl start ceph-volume@lvm-{osdid}-{osdfsid};systemctl enable ceph-volume@lvm-{osdid}-{osdfsid}'.format(osdid=osdid,osdfsid=osdfsid)
        cmd_start_osd='systemctl start ceph-osd@{osdid};systemctl enable ceph-osd@{osdid}'.format(osdid=osdid)
        print('cmd_start_mount')
        os.system(cmd_start_mount)
        print('cmd_start_osd')
        os.system(cmd_start_osd)
        
    def main():
        # batch mode: fix every VG whose backup file has no matching live VG
        # badlvms=getTargetLvms()
        # for i in badlvms:
        #     fixOneOSD(i)
        #     time.sleep(3)

        # manual mode: fix the named VGs one at a time
        fixOneOSD(os.path.join('/etc/lvm/backup/','ceph-54ced685-fec0-4725-8df3-e78ad313d223'))
        fixOneOSD(os.path.join('/etc/lvm/backup/','ceph-891ebdb2-3ba9-47c4-b2d4-788c4b0c1a2c'))
    
    if __name__ == "__main__":
        main()
    

    ***One issue found in testing: on some machines the device names can shift across a reboot, e.g. what was /dev/sda may come back as /dev/sdb.***

    The script above cannot handle that case. For now I infer the correct device name manually from the information under /var/lib/ceph/osd/ceph-*/.

    If the device name does not match, the service will not start and reports errors such as a keyring mismatch; running ceph-osd -i $osdid --flush-journal reports the same error.

    The crudest approach is to try the candidates one by one: when the guess is right, the service comes up and the OSD can rejoin the cluster.
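
    One way to sanity-check a guess without fully starting the OSD (an extra check on my part, not something the script does) is to read the bluestore label on the restored LV and compare it with the tags from the backup:

    # after pvcreate / vgcfgrestore / vgchange -ay on the guessed device,
    # the bluestore label inside the LV shows which OSD it really belongs to
    ceph-bluestore-tool show-label \
        --dev /dev/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27/osd-data-d9fa7e7d-57f6-437c-9b2b-c9652ec0697d
    # "whoami" should equal ceph.osd_id and "osd_uuid" should equal ceph.osd_fsid;
    # if they don't, the metadata was restored onto the wrong physical disk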

    Even after rebooting the machines repeatedly, as long as the data on the disks is intact, basically everything can be recovered. Ceph can take quite a beating.

    Once you have determined the correct device, just change /dev/sdn in the backup file to the right device name:

        pv0 {
                            id = "EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe"
                            device = "/dev/sdn"     # Hint only
    

    Then adjust the script accordingly,

     fixOneOSD(os.path.join('/etc/lvm/backup/','ceph-891ebdb2-3ba9-47c4-b2d4-788c4b0c1a2c'))
    

    and rerun it. Normally the machine should be rebooted first, otherwise the run can hit problems.

  • Original article: https://www.cnblogs.com/lovesKey/p/14316449.html