• OpenStack_Swift Source Code Analysis: ObjectReplicator (Part 2)


    1. Detailed analysis of the Replicator's run code

    The previous article covered how the Replicator is started in detail; this one walks through the implementation of its replication pass, starting with the replicate method:

    def replicate(self, override_devices=None, override_partitions=None):
            """Run a replication pass"""
            self.start = time.time()
            self.suffix_count = 0
            self.suffix_sync = 0
            self.suffix_hash = 0
            self.replication_count = 0
            self.last_replication_count = -1
            self.partition_times = []
    
            if override_devices is None:
                override_devices = []
            if override_partitions is None:
                override_partitions = []
            # heartbeat: periodic stats logger; the interval comes from the config, defaulting to 300 seconds
            stats = eventlet.spawn(self.heartbeat)
            # detect_lockups: watch for a stalled (locked-up) replication pass
            lockup_detector = eventlet.spawn(self.detect_lockups)
            eventlet.sleep()  # Give spawns a cycle
    
            try:
                # pool of replication workers, sized by the concurrency setting
                self.run_pool = GreenPool(size=self.concurrency)
                # Returns a sorted list of jobs (dictionaries) that specify the
                # partitions, nodes, etc to be synced.
                jobs = self.collect_jobs()
                for job in jobs:
                    # skip devices not in the override list
                    if override_devices and job['device'] not in override_devices:
                        continue
                    # skip partitions not in the override list
                    if override_partitions and \
                            job['partition'] not in override_partitions:
                        continue
                    # this job passed the override filters; verify its device is mounted
                    dev_path = join(self.devices_dir, job['device'])
                    if self.mount_check and not ismount(dev_path):
                        self.logger.warn(_('%s is not mounted'), job['device'])
                        continue
                    # check_ring() returns False once the ring file has changed
                    if not self.check_ring():
                        self.logger.info(_("Ring change detected. Aborting "
                                           "current replication pass."))
                        return
                    # handoff partition: its data no longer belongs on this node
                    if job['delete']:
                        self.run_pool.spawn(self.update_deleted, job)
                    else:
                        # normal case: sync the partition to its peer replicas
                        self.run_pool.spawn(self.update, job)
                with Timeout(self.lockup_timeout):
                    self.run_pool.waitall()
            except (Exception, Timeout):
                self.logger.exception(_("Exception in top-level replication loop"))
                self.kill_coros()
            finally:
                stats.kill()
                lockup_detector.kill()
                self.stats_line()

    The replicate method begins with setup for the pass; the most important step is gathering the jobs to run via collect_jobs, whose implementation follows:

    def collect_jobs(self):
            """
            Returns a sorted list of jobs (dictionaries) that specify the
            partitions, nodes, etc to be synced.
            """
            jobs = []
            ips = whataremyips()
            # replication_ip and replication_port are filled in when the ring is loaded
            #self.object_ring = Ring(self.swift_dir, ring_name='object')
            for local_dev in [dev for dev in self.object_ring.devs
                              if dev and dev['replication_ip'] in ips and
                              dev['replication_port'] == self.port]:
                dev_path = join(self.devices_dir, local_dev['device'])
                obj_path = join(dev_path, 'objects')
                tmp_path = join(dev_path, 'tmp')
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn(_('%s is not mounted'), local_dev['device'])
                    continue
                # remove any file under tmp_path that was last modified before mtime,
                # e.g. stale files under /srv/1/node/sdb1/tmp
                unlink_older_than(tmp_path, time.time() - self.reclaim_age)
                if not os.path.exists(obj_path):
                    try:
                        mkdirs(obj_path)
                    except Exception:
                        self.logger.exception('ERROR creating %s' % obj_path)
                    continue
                #root@kinglion-Lenovo-Product:/srv/1/node/sdb1/objects# ls
                #13069  133971  4799  58208  94238
                for partition in os.listdir(obj_path):
                    try:
                        job_path = join(obj_path, partition)
                        # if the partition path is a plain file, remove it
                        if isfile(job_path):
                            # Clean up any (probably zero-byte) files where a
                            # partition should be.
                            self.logger.warning('Removing partition directory '
                                                'which was a file: %s', job_path)
                            os.remove(job_path)
                            continue
                        # look up the nodes responsible for this partition
                        part_nodes = \
                            self.object_ring.get_part_nodes(int(partition))
                        # nodes: the partition's other replica nodes, excluding this device
                        nodes = [node for node in part_nodes
                                 if node['id'] != local_dev['id']]
                        # every partition under objects/ is visited, so len(jobs) is at
                        # most the number of partitions whose replica assignments
                        # (_replica2part2dev) include this device's id
                        jobs.append(
                            dict(path=job_path,
                                 device=local_dev['device'],
                                 nodes=nodes,
                                  # len(nodes) > len(part_nodes) - 1 means this device is
                                  # no longer a primary for the partition (e.g. it was
                                  # removed or the ring was rebalanced)
                                 delete=len(nodes) > len(part_nodes) - 1,
                                 partition=partition))
                    except (ValueError, OSError):
                        continue
            # randomize the job order
            random.shuffle(jobs)
            if self.handoffs_first:
                # Move the handoff parts to the front of the list
                # move handoff (delete=True) jobs to the front of the list
                jobs.sort(key=lambda job: not job['delete'])
            self.job_count = len(jobs)
            return jobs
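
    Before walking through this loop, here is a minimal sketch of the on-disk layout it traverses. The helper name build_object_path and the simplified partition mapping are illustrative assumptions only; real Swift resolves the partition through the ring's _replica2part2dev table and mixes a cluster-wide hash path suffix from swift.conf into the md5:

    # Hypothetical sketch of the object path layout:
    #   <devices>/<device>/objects/<partition>/<suffix>/<name_hash>/<timestamp>.data
    import hashlib
    from os.path import join

    HASH_PATH_SUFFIX = b'changeme'  # assumption: swift_hash_path_suffix value

    def build_object_path(devices_dir, device, part_power,
                          account, container, obj, timestamp):
        path = ('/%s/%s/%s' % (account, container, obj)).encode('utf-8')
        name_hash = hashlib.md5(path + HASH_PATH_SUFFIX).hexdigest()
        # simplified stand-in for the ring: top part_power bits of the hash
        partition = str(int(name_hash[:8], 16) >> (32 - part_power))
        suffix = name_hash[-3:]  # last three hex chars name the suffix dir
        return join(devices_dir, device, 'objects', partition, suffix,
                    name_hash, '%s.data' % timestamp)

    print(build_object_path('/srv/1/node', 'sdb1', 18,
                            'AUTH_test', 'c1', 'o1', '1408067402.51234'))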

    In the second-level for loop, os.listdir(obj_path) lists every partition directory under objects. When an object is stored, a directory named after the partition it maps to is created under objects; inside it, a directory named after the last three characters of the object's hash (the suffix); inside that, a directory named after the full object hash; and the object itself is written there as a file named by its upload timestamp with a .data extension. With consistent hashing and virtual nodes, each device is assigned several partitions, so if a device is assigned n partitions, the number of subdirectories under obj_path is <= n, because the objects actually stored do not necessarily map to every partition assigned to this device.

    For each entry, the loop first checks whether the path is a plain file and deletes it if so; otherwise it takes the entry name as a partition number, asks the ring for that partition's replica nodes, keeps those whose device id differs from the local device's id in nodes, and appends a dict with nodes, path, and the rest to jobs. Finally the jobs list is shuffled, handoff jobs are moved to the front, and jobs is returned.

    Back in replicate, consider first the case where job['delete'] is False, which runs the update method:

    def update(self, job):
            """
            High-level method that replicates a single partition.
    
            :param job: a dict containing info about the partition to be replicated
            """
            self.replication_count += 1
            self.logger.increment('partition.update.count.%s' % (job['device'],))
            begin = time.time()
            try:
                # get_hashes reads and refreshes the hashes.pkl for this partition;
                # job['path'] is join(obj_path, partition), local_hash is the dict
                # unpickled from hashes.pkl, and hashed is the number of suffix
                # dirs that were rehashed
                hashed, local_hash = tpool_reraise(
                    get_hashes, job['path'],
                    do_listdir=(self.replication_count % 10) == 0,
                    reclaim_age=self.reclaim_age)
                self.suffix_hash += hashed
                self.logger.update_stats('suffix.hashes', hashed)
                attempts_left = len(job['nodes'])
                # job['nodes'] excludes the local node; get_more_nodes(partition)
                # yields every node outside the partition's primaries, i.e.
                # handoff candidates
                nodes = itertools.chain(
                    job['nodes'],
                    self.object_ring.get_more_nodes(int(job['partition'])))
                # attempts_left is 2 when the replica count is 3
                while attempts_left > 0:
                    # If this throws StopIterator it will be caught way below
                    node = next(nodes)
                    attempts_left -= 1
                    try:
                        with Timeout(self.http_timeout):
                            # the REPLICATE verb maps to the object server's REPLICATE method
                            resp = http_connect(
                                node['replication_ip'], node['replication_port'],
                                node['device'], job['partition'], 'REPLICATE',
                                '', headers=self.headers).getresponse()
                            if resp.status == HTTP_INSUFFICIENT_STORAGE:
                                self.logger.error(_('%(ip)s/%(device)s responded'
                                                    ' as unmounted'), node)
                                attempts_left += 1
                                continue
                            if resp.status != HTTP_OK:
                                self.logger.error(_("Invalid response %(resp)s "
                                                    "from %(ip)s"),
                                                  {'resp': resp.status,
                                                   'ip': node['replication_ip']})
                                continue
                            # remote_hash is unpickled from the REPLICATE response body
                            remote_hash = pickle.loads(resp.read())
                            del resp
                        # suffixes whose local hash differs from the remote one
                        suffixes = [suffix for suffix in local_hash if
                                    local_hash[suffix] !=
                                    remote_hash.get(suffix, -1)]
                        # nothing differs here, so move on to the next node
                        if not suffixes:
                            continue
    
                        # recompute the hashes for just the differing suffixes
                        hashed, recalc_hash = tpool_reraise(
                            get_hashes,
                            job['path'], recalculate=suffixes,
                            reclaim_age=self.reclaim_age)
                        self.logger.update_stats('suffix.hashes', hashed)
                        local_hash = recalc_hash
                        # e.g. if local_hash has suffixes 123 321 122 and remote_hash
                        # has 123 321 124, then 122 is the changed one (distinct
                        # object hashes can share the same 3-char suffix)
                        suffixes = [suffix for suffix in local_hash if
                                    local_hash[suffix] !=
                                    remote_hash.get(suffix, -1)]
                        # push-based sync: the differing local suffix dirs are
                        # pushed to the node
                        self.sync(node, job, suffixes)
                        with Timeout(self.http_timeout):
                            conn = http_connect(
                                node['replication_ip'], node['replication_port'],
                                node['device'], job['partition'], 'REPLICATE',
                                '/' + '-'.join(suffixes),
                                headers=self.headers)
                            conn.getresponse().read()
                        self.suffix_sync += len(suffixes)
                        self.logger.update_stats('suffix.syncs', len(suffixes))
                    except (Exception, Timeout):
                        self.logger.exception(_("Error syncing with node: %s") %
                                              node)
                # suffix count, used later by the stats line
                self.suffix_count += len(local_hash)
            except (Exception, Timeout):
                self.logger.exception(_("Error syncing partition"))
            finally:
                self.partition_times.append(time.time() - begin)
                self.logger.timing_since('partition.update.timing', begin)

    update first obtains, for this partition on the local device, the per-suffix hashes from hashes.pkl, e.g. {'a83': '0db7b416c9808517a1bb2157af20b09b'}: the key is the last three characters of the object hash (the suffix directory name), and the value is an md5 computed over the names of all the .data files in every object-hash directory under that suffix directory; roughly, a digest of all the file names.
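
    As a quick way to see this structure, the cached hashes can be unpickled directly; the path below is illustrative:

    # Minimal sketch: inspect a partition's cached suffix hashes.
    import pickle

    with open('/srv/1/node/sdb1/objects/94238/hashes.pkl', 'rb') as fp:
        local_hash = pickle.load(fp)

    # e.g. {'310': '0db7b416c9808517a1bb2157af20b09b'} -- one md5 per
    # suffix dir, computed over the file names beneath it
    print(local_hash)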

                hashed, local_hash = tpool_reraise(
                    get_hashes, job['path'],
                    do_listdir=(self.replication_count % 10) == 0,
                    reclaim_age=self.reclaim_age)

    The fragment above invokes get_hashes, passing the remaining arguments through to it:

    def get_hashes(partition_dir, recalculate=None, do_listdir=False,
                   reclaim_age=ONE_WEEK):
        """
        Get a list of hashes for the suffix dir.  do_listdir causes it to
        mistrust the hash cache for suffix existence at the (unexpectedly
        high) cost of a listdir.  reclaim_age is just passed on to
        hash_suffix.

        :param partition_dir: absolute path of partition to get hashes for
        :param recalculate: list of suffixes which should be recalculated
            when got, e.g. recalculate=['a83']; a suffix dir such as 310
            sits next to hashes.pkl under e.g. /srv/1/node/sdb1/objects/94238
        :param do_listdir: force existence check for all hashes in the
            partition
        :param reclaim_age: age at which to remove tombstones

        :returns: tuple of (number of suffix dirs hashed, dictionary of hashes)
        """

    Since recalculate is not passed here, only do_listdir=True forces the hashes over the file names under each suffix directory to be recomputed. The file names are timestamps: a changed timestamp means the object was updated, so the local copy must be compared with the remote one to see whether they hold the same version, and if not, the local version is pushed to the remote server.
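
    For reference, a direct call would look roughly like this; the import location is an assumption (get_hashes lived in swift.obj.diskfile in releases of this era):

    # Hedged usage sketch of get_hashes, matching the signature quoted above.
    from swift.obj.diskfile import get_hashes  # assumption: module location

    # Recompute only suffix 'a83'; all other suffixes come from hashes.pkl.
    hashed, hashes = get_hashes('/srv/1/node/sdb1/objects/94238',
                                recalculate=['a83'], do_listdir=False)
    print(hashed)  # number of suffix dirs that were (re)hashed by this call
    print(hashes)  # e.g. {'a83': '...', '310': '...'}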

    attempts_left = len(job['nodes'])
    # job['nodes'] excludes the local node; get_more_nodes(partition) yields
    # every node outside the partition's primaries (handoff candidates)
    nodes = itertools.chain(
        job['nodes'],
        self.object_ring.get_more_nodes(int(job['partition'])))

    In the fragment above, attempts_left is the number of replica nodes for this job's partition excluding the local one. nodes is then built as a chain: get_more_nodes yields every node outside the partition's primary set, so the iterator covers all candidate nodes except the local one, primaries first.
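
    The iteration pattern can be sketched on its own; the node dicts below are fabricated:

    # Toy model of the replicator's node iteration: primaries first, then
    # handoff candidates from get_more_nodes().
    import itertools

    primaries = [{'id': 2, 'ip': '10.0.0.2'}, {'id': 3, 'ip': '10.0.0.3'}]

    def get_more_nodes():  # stand-in for object_ring.get_more_nodes(partition)
        for i in (4, 5, 6):
            yield {'id': i, 'ip': '10.0.0.%d' % i}

    nodes = itertools.chain(primaries, get_more_nodes())
    attempts_left = len(primaries)  # 2 when the replica count is 3
    while attempts_left > 0:
        node = next(nodes)
        attempts_left -= 1
        print('trying', node['ip'])
        # on an HTTP 507 (unmounted) the real loop does attempts_left += 1,
        # which pulls one extra handoff node off the chain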

    Next comes the while loop, which runs attempts_left times:

    resp = http_connect(
        node['replication_ip'], node['replication_port'],
        node['device'], job['partition'], 'REPLICATE',
        '', headers=self.headers).getresponse()

    A request is issued to the node yielded by the iterator; since the partition's replica nodes come first in the chain, they are tried first. If the request succeeds, the response body is read to obtain remote_hash, the remote device's hash dict for the same partition.

    suffixes = [suffix for suffix in local_hash if
                local_hash[suffix] !=
                remote_hash.get(suffix, -1)]
    # nothing differs, so move on to the next node
    if not suffixes:
        continue

    This compares, between the two devices' hashes.pkl for the same partition, the keys whose values differ. If suffixes is empty, the remote copy holds the same versions, and the loop continues to the next replica. If it is not empty, there is work to do: the local hashes.pkl is read again first, because another replica may have pushed new updates to this server while the request was in flight; the refreshed local hashes are compared once more to obtain the final list of differing suffixes.
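
    As a toy illustration of the comparison (all values fabricated):

    # Any suffix whose local md5 differs from the remote one, or which the
    # remote lacks entirely, must be pushed.
    local_hash = {'123': 'aaa', '321': 'bbb', '122': 'ccc'}
    remote_hash = {'123': 'aaa', '321': 'bbb', '124': 'ddd'}

    suffixes = [suffix for suffix in local_hash
                if local_hash[suffix] != remote_hash.get(suffix, -1)]
    print(suffixes)  # ['122'] -- only this suffix dir needs to be pushed

    With the differing suffixes in hand, the sync runs: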

    self.sync(node, job, suffixes)  # push the changed suffixes

    For pushing the changes the author currently uses rsync rather than ssync, but a hook for an ssync implementation is already in place; once ssync is stable it will replace rsync. (Stay tuned.)

    def sync(self, node, job, suffixes):  # Just exists for doc anchor point
            """
            Synchronize local suffix directories from a partition with a remote
            node.
    
            :param node: the "dev" entry for the remote node to sync with
            :param job: information about the partition being synced
            :param suffixes: a list of suffixes which need to be pushed
    
            :returns: boolean indicating success or failure
            """
            # self.sync_method = getattr(self, conf.get('sync_method') or 'rsync')
            # if no sync_method is configured, fall back to the class's own rsync
            return self.sync_method(node, job, suffixes)
    sync_method is resolved as shown below; when nothing is configured, the rsync method is used:

      self.sync_method = getattr(self, conf.get('sync_method') or 'rsync')

    def rsync(self, node, job, suffixes):
            """
            Uses rsync to implement the sync method. This was the first
            sync method in Swift.
            """
            if not os.path.exists(job['path']):
                return False
            args = [
                'rsync',
                '--recursive',
                '--whole-file',
                '--human-readable',
                '--xattrs',
                '--itemize-changes',
                '--ignore-existing',
                '--timeout=%s' % self.rsync_io_timeout,
                '--contimeout=%s' % self.rsync_io_timeout,
                '--bwlimit=%s' % self.rsync_bwlimit,
            ]
            node_ip = rsync_ip(node['replication_ip'])
            # the rsync module string embeds the peer's ip (and its port in vm test mode)
            if self.vm_test_mode:
                rsync_module = '%s::object%s' % (node_ip, node['replication_port'])
            else:
                rsync_module = '%s::object' % node_ip
            had_any = False
            for suffix in suffixes:
                spath = join(job['path'], suffix)
                if os.path.exists(spath):
                    args.append(spath)
                    had_any = True
            if not had_any:
                return False
            args.append(join(rsync_module, node['device'],
                        'objects', job['partition']))
            # args now carries everything for the push: the source suffix dirs
            # plus the peer's device and partition
            return self._rsync(args) == 0
    The rsync method collects its arguments into args and then runs _rsync.
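
    For instance, a push of changed suffixes 310 and a83 for partition 94238 to peer device sdc1 at 10.0.0.2 would assemble an argv roughly like the following (all values fabricated, assuming default timeout and bwlimit settings):

    # Illustrative argv assembled by rsync(); every value here is fabricated.
    args = [
        'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs',
        '--itemize-changes', '--ignore-existing',
        '--timeout=30', '--contimeout=30', '--bwlimit=0',
        '/srv/1/node/sdb1/objects/94238/310',   # local suffix dirs (sources)
        '/srv/1/node/sdb1/objects/94238/a83',
        '10.0.0.2::object/sdc1/objects/94238',  # rsync module on the peer (dest)
    ]

    The _rsync implementation: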

        def _rsync(self, args):
            """
            Execute the rsync binary to replicate a partition.
    
            :returns: return code of rsync process. 0 is successful
            """
            start_time = time.time()
            ret_val = None
            try:
                with Timeout(self.rsync_timeout):
                    # the actual push: run the rsync binary (push model)
                    proc = subprocess.Popen(args,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT)
                    results = proc.stdout.read()
                    ret_val = proc.wait()
            except Timeout:
                self.logger.error(_("Killing long-running rsync: %s"), str(args))
                proc.kill()
                return 1  # failure response code
            total_time = time.time() - start_time
            for result in results.split('\n'):
                if result == '':
                    continue
                if result.startswith('cd+'):
                    continue
                if not ret_val:
                    self.logger.info(result)
                else:
                    self.logger.error(result)
            if ret_val:
                error_line = _('Bad rsync return code: %(ret)d <- %(args)s') % \
                    {'args': str(args), 'ret': ret_val}
                if self.rsync_error_log_line_length:
                    error_line = error_line[:self.rsync_error_log_line_length]
                self.logger.error(error_line)
            elif results:
                self.logger.info(
                    _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"),
                    {'src': args[-2], 'dst': args[-1], 'time': total_time})
            else:
                self.logger.debug(
                    _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"),
                    {'src': args[-2], 'dst': args[-1], 'time': total_time})
            return ret_val
    The following fragment is what performs the actual push:

    # the actual push: run the rsync binary (push model)
    proc = subprocess.Popen(args,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    If job['delete'] is True, the likely cause is that devices were added or removed and the ring was rebalanced, so the partition's replica list no longer contains this server's device id. For example, if partition 45678 was backed by device ids [1, 2, 3] before the rebalance and the local device id is 1, after the rebalance the partition may map to [4, 2, 3], making job['delete'] True, as the sketch below shows.
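
    A toy sketch of how collect_jobs derives the flag in that scenario (ids fabricated):

    # After the rebalance the local device (id 1) is no longer a primary
    # for the partition, so nothing is filtered out of part_nodes and the
    # inequality holds.
    local_dev_id = 1
    part_nodes = [{'id': 4}, {'id': 2}, {'id': 3}]  # primaries after rebalance

    nodes = [n for n in part_nodes if n['id'] != local_dev_id]
    delete = len(nodes) > len(part_nodes) - 1
    print(delete)  # True -> update_deleted() pushes the data out, then removes it

    The implementation of update_deleted follows: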

        def update_deleted(self, job):
            """
            High-level method that replicates a single partition that doesn't
            belong on this node.
    
            :param job: a dict containing info about the partition to be replicated
            """
            # list the suffix dirs under the partition path
            def tpool_get_suffixes(path):
                return [suff for suff in os.listdir(path)
                        if len(suff) == 3 and isdir(join(path, suff))]
            self.replication_count += 1
            self.logger.increment('partition.delete.count.%s' % (job['device'],))
            begin = time.time()
            try:
                responses = []
                suffixes = tpool.execute(tpool_get_suffixes, job['path'])
                if suffixes:
                    for node in job['nodes']:
                        success = self.sync(node, job, suffixes)  # push the suffixes
                        if success:
                            with Timeout(self.http_timeout):
                                conn = http_connect(
                                    node['replication_ip'],
                                    node['replication_port'],
                                    node['device'], job['partition'], 'REPLICATE',
                                    '/' + '-'.join(suffixes), headers=self.headers)
                                conn.getresponse().read()
                        responses.append(success)
                if self.handoff_delete:
                    # delete handoff if we have had handoff_delete successes
                    delete_handoff = len([resp for resp in responses if resp]) >= \
                        self.handoff_delete
                else:
                    # delete handoff if all syncs were successful
                    delete_handoff = len(responses) == len(job['nodes']) and \
                        all(responses)
                # remove the local partition dir once there was nothing to push
                # or enough peers acknowledged the sync
                if not suffixes or delete_handoff:
                    self.logger.info(_("Removing partition: %s"), job['path'])
                    tpool.execute(shutil.rmtree, job['path'], ignore_errors=True)
            except (Exception, Timeout):
                self.logger.exception(_("Error syncing handoff partition"))
            finally:
                self.partition_times.append(time.time() - begin)
                self.logger.timing_since('partition.delete.timing', begin)
    


    That concludes the walkthrough of the replicate operation. If anything here is misread, corrections are welcome. Thanks!

  • Original article: https://www.cnblogs.com/mengfanrong/p/3950948.html