• InnoDB 中的锁实现


    InnoDB 中的锁实现

    原帖:InnoDB 锁系统及死锁检测实现分析

    InnoDB 中,所有事务加的行锁通过一个全局的 hash 表 lock_sys 维护:

    /** Global pointer to the lock system (see struct lock_sys_t). It is
    initialised to NULL here; where it gets allocated is not shown in this
    excerpt. */
    lock_sys_t *lock_sys = NULL;
    
    /** The lock system struct.

    Holds the hash tables of record, predicate and page locks together with
    the bookkeeping for threads that are suspended in a lock wait. The two
    mutexes (mutex and wait_mutex) are each preceded by a cache-line sized
    padding array. */
    struct lock_sys_t {
      char pad1[INNOBASE_CACHE_LINE_SIZE];
      /*!< padding to prevent other
      memory update hotspots from
      residing on the same memory
      cache line */
      LockMutex mutex;              /*!< Mutex protecting the
                                    locks */
      hash_table_t *rec_hash;       /*!< hash table of the record
                                    locks */
      hash_table_t *prdt_hash;      /*!< hash table of the predicate
                                    lock */
      hash_table_t *prdt_page_hash; /*!< hash table of the page
                                    lock */
    
      char pad2[INNOBASE_CACHE_LINE_SIZE]; /*!< Padding (one cache line
                                           between the fields above and
                                           wait_mutex) */
      LockMutex wait_mutex;                /*!< Mutex protecting the
                                           next two fields
                                           (waiting_threads and
                                           last_slot) */
      srv_slot_t *waiting_threads;         /*!< Array of user threads
                                           suspended while waiting for
                                           locks within InnoDB, protected
                                           by the lock_sys->wait_mutex */
      srv_slot_t *last_slot;               /*!< highest slot ever used
                                           in the waiting_threads array,
                                           protected by
                                           lock_sys->wait_mutex */
      int n_waiting;                       /*!< Number of slots in use.
                                           Protected by lock_sys->mutex.
                                           NOTE(review): the slots
                                           themselves are documented as
                                           protected by wait_mutex, not
                                           mutex - confirm which mutex
                                           really guards this counter */
      ibool rollback_complete;
      /*!< TRUE if rollback of all
      recovered transactions is
      complete. Protected by
      lock_sys->mutex */
    
      ulint n_lock_max_wait_time; /*!< Max wait time */
    
      os_event_t timeout_event; /*!< Set to the event that is
                                created in the lock wait monitor
                                thread. A value of 0 means the
                                thread is not active */
    
      /** Marker value before trx_t::age. */
      uint64_t mark_age_updated;
    
    #ifdef UNIV_DEBUG
      /** Lock timestamp counter (debug builds only) */
      uint64_t m_seq;
    #endif /* UNIV_DEBUG */
    };
    
    

    在进一步介绍行锁实现之前,先介绍一下InnoDB中事务与锁相关的数据结构

    /** Per-transaction descriptor.

    Groups everything this excerpt shows about one transaction: its id and
    serialization number, concurrency-control state, isolation level, lock
    information (trx_lock_t lock), MySQL-side handles, error state, undo-log
    bookkeeping, AUTO-INC and FTS data, and a few helper predicates that are
    derived from the isolation level. */
    struct trx_t {
      /** Isolation levels; these enumerators are what
      is_read_uncommitted() and skip_gap_locks() compare
      trx_t::isolation_level against. */
      enum isolation_level_t {
    
        /** dirty read: non-locking SELECTs are performed so that we
        do not look at a possible earlier version of a record; thus
        they are not 'consistent' reads under this isolation level;
        otherwise like level 2 */
        READ_UNCOMMITTED,
    
        /** somewhat Oracle-like isolation, except that in range UPDATE
        and DELETE we must block phantom rows with next-key locks;
        SELECT ... FOR UPDATE and ...  LOCK IN SHARE MODE only lock
        the index records, NOT the gaps before them, and thus allow
        free inserting; each consistent read reads its own snapshot */
        READ_COMMITTED,
    
        /** this is the default; all consistent reads in the same trx
        read the same snapshot; full next-key locking used in locking
        reads to block insertions into gaps */
        REPEATABLE_READ,
    
        /** all plain SELECTs are converted to LOCK IN SHARE MODE
        reads */
        SERIALIZABLE
      };
    
      TrxMutex mutex; /*!< Mutex protecting the fields
                      state and lock (except some fields
                      of lock, which are protected by
                      lock_sys->mutex) */
    
      bool owns_mutex; /*!< Flag recording that the mutex is
                       owned during lock acquire and/or
                       release.
    
                       This is used to avoid taking the
                       trx_t::mutex recursively.
                       NOTE(review): the original wording
                       was "set to the transaction that owns
                       the mutex", which does not fit a bool;
                       confirm the exact meaning. */
    
      /* Note: in_depth was split from in_innodb for fixing a RO
      performance issue. Acquiring the trx_t::mutex for each row
      costs ~3% in performance. It is not required for correctness.
      Therefore we increment/decrement in_depth without holding any
      mutex. The assumption is that the Server will only ever call
      the handler from one thread. This is not true for kill_connection.
      Therefore, in innobase_kill_connection, we don't increment this
      counter via TrxInInnoDB. */
    
      ib_uint32_t in_depth; /*!< Track nested TrxInInnoDB
                            count */
    
      ib_uint32_t in_innodb; /*!< if the thread is executing
                             in the InnoDB context count > 0. */
    
      bool abort; /*!< if this flag is set then
                  this transaction must abort when
                  it can */
    
      trx_id_t id; /*!< transaction id */
    
      trx_id_t no; /*!< transaction serialization number:
                   max trx id shortly before the
                   transaction is moved to
                   COMMITTED_IN_MEMORY state.
                   Protected by trx_sys_t::mutex
                   when trx->in_rw_trx_list. Initially
                   set to TRX_ID_MAX. */
    
      /** State of the trx from the point of view of concurrency control
      and the valid state transitions.
    
      Possible states:
    
      TRX_STATE_NOT_STARTED
      TRX_STATE_FORCED_ROLLBACK
      TRX_STATE_ACTIVE
      TRX_STATE_PREPARED
      TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
    
      Valid state transitions are:
    
      Regular transactions:
      * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
    
      Auto-commit non-locking read-only:
      * NOT_STARTED -> ACTIVE -> NOT_STARTED
    
      XA (2PC):
      * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
    
      Recovered XA:
      * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
    
      XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
      * NOT_STARTED -> PREPARED -> (freed)
    
      Disconnected XA can become recovered:
      * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
      Disconnected means from mysql, e.g. due to the mysql client
      disconnection.
    
      Latching and various transaction lists membership rules:
    
      XA (2PC) transactions are always treated as non-autocommit.
    
      Transitions to ACTIVE or NOT_STARTED occur when
      !in_rw_trx_list (no trx_sys->mutex needed).
    
      Autocommit non-locking read-only transactions move between states
      without holding any mutex. They are !in_rw_trx_list.
    
      All transactions, unless they are determined to be ac-nl-ro,
      explicitly tagged as read-only or read-write, will first be put
      on the read-only transaction list. Only when a !read-only transaction
      in the read-only list tries to acquire an X or IX lock on a table
      do we remove it from the read-only list and put it on the read-write
      list. During this switch we assign it a rollback segment.
    
      When a transaction is NOT_STARTED, it can be in_mysql_trx_list if
      it is a user transaction. It cannot be in rw_trx_list.
    
      ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list.
      The transition ACTIVE->PREPARED is protected by trx_sys->mutex.
    
      ACTIVE->COMMITTED is possible when the transaction is in
      rw_trx_list.
    
      Transitions to COMMITTED are protected by both lock_sys->mutex
      and trx->mutex.
    
      NOTE: Some of these state change constraints are an overkill,
      currently only required for a consistent view for printing stats.
      This unnecessarily adds a huge cost for the general case. */
    
      trx_state_t state;
    
      /* If set, this transaction should stop inheriting (GAP)locks.
      Generally set to true during transaction prepare for RC or lower
      isolation, if requested. Needed for replication replay where
      we don't want to get blocked on GAP locks taken for protecting
      concurrent unique insert or replace operation. */
      bool skip_lock_inheritance;
    
      ReadView *read_view; /*!< consistent read view used in the
                           transaction, or NULL if not yet set */
    
      UT_LIST_NODE_T(trx_t)
      trx_list; /*!< list of transactions;
                protected by trx_sys->mutex. */
      UT_LIST_NODE_T(trx_t)
      no_list; /*!< Required during view creation
               to check for the view limit for
               transactions that are committing */
    
      trx_lock_t lock;   /*!< Information about the transaction
                         locks and state. Protected by
                         trx->mutex or lock_sys->mutex
                         or both */
      bool is_recovered; /*!< 0=normal transaction,
                         1=recovered, must be rolled back,
                         protected by trx_sys->mutex when
                         trx->in_rw_trx_list holds */
    
      hit_list_t hit_list; /*!< List of transactions to kill,
                           when a high priority transaction
                           is blocked on a lock wait. */
    
      os_thread_id_t killed_by; /*!< The thread ID that wants to
                                kill this transaction asynchronously.
                                This is required because we recursively
                                enter the handlerton methods and need
                                to distinguish between the kill thread
                                and the transaction thread.
    
                                Note: We need to be careful w.r.t the
                                Thread Pool. The thread doing the kill
                                should not leave InnoDB between the
                                mark and the actual async kill because
                                the running thread can change. */
    
      /* These fields are not protected by any mutex. */
      const char *op_info;   /*!< English text describing the
                             current operation, or an empty
                             string */
      ulint isolation_level; /*!< Current isolation level. Compared
                             against the isolation_level_t
                             enumerators (READ_UNCOMMITTED ...
                             SERIALIZABLE) by the helper methods
                             at the end of this struct. */
      bool check_foreigns;   /*!< normally TRUE, but if the user
                             wants to suppress foreign key checks,
                             (in table imports, for example) we
                             set this FALSE */
      /*------------------------------*/
      /* MySQL has a transaction coordinator to coordinate two phase
      commit between multiple storage engines and the binary log. When
      an engine participates in a transaction, it's responsible for
      registering itself using the trans_register_ha() API. */
      bool is_registered; /*!< This flag is set to true after the
                          transaction has been registered with
                          the coordinator using the XA API, and
                          is set to false after commit or
                          rollback. */
      /*------------------------------*/
      bool check_unique_secondary;
      /*!< normally TRUE, but if the user
      wants to speed up inserts by
      suppressing unique key checks
      for secondary indexes when we decide
      if we can use the insert buffer for
      them, we set this FALSE */
      bool flush_log_later;      /*!< In 2PC, we hold the
                                 prepare_commit mutex across
                                 both phases. In that case, we
                                 defer flush of the logs to disk
                                 until after we release the
                                 mutex. */
      bool must_flush_log_later; /*!< this flag is set to TRUE in
                            trx_commit() if flush_log_later was
                            TRUE, and there were modifications by
                            the transaction; in that case we must
                            flush the log in
                            trx_commit_complete_for_mysql() */
      ulint duplicates;          /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
      bool has_search_latch;
      /*!< TRUE if this trx has latched the
      search system latch in S-mode.
      This now can only be true in
      row_search_mvcc; the btr search latch
      must have been released before exiting,
      and this flag would be set to false */
      trx_dict_op_t dict_operation; /**< @see enum trx_dict_op_t */
    
      bool ddl_operation;  /*!< True if this trx involves dd table
                            change */
      bool ddl_must_flush; /*!< True if this trx involves dd table
                           change, and must flush */
      bool in_truncate;    /*!< This trx is doing truncation */
    
      /* Fields protected by the srv_conc_mutex. */
      bool declared_to_be_inside_innodb;
      /*!< this is TRUE if we have declared
      this transaction in
      srv_conc_enter_innodb to be inside the
      InnoDB engine */
      ib_uint32_t n_tickets_to_enter_innodb;
      /*!< this can be > 0 only when
      declared_to_... is TRUE; when we come
      to srv_conc_innodb_enter, if the value
      here is > 0, we decrement this by 1 */
      ib_uint32_t dict_operation_lock_mode;
      /*!< 0, RW_S_LATCH, or RW_X_LATCH:
      the latch mode trx currently holds
      on dict_operation_lock. Protected
      by dict_operation_lock. */
    
      time_t start_time; /*!< time the state last time became
                         TRX_STATE_ACTIVE */
    
      /** Weight/Age of the transaction in the record lock wait queue. */
      int32_t age;
    
      /** For tracking if Weight/age has been updated. */
      uint64_t age_updated;
    
      lsn_t commit_lsn; /*!< lsn at the time of the commit */
    
      /*------------------------------*/
      THD *mysql_thd; /*!< MySQL thread handle corresponding
                      to this trx, or NULL */
    
      const char *mysql_log_file_name;
      /*!< if MySQL binlog is used, this field
      contains a pointer to the latest file
      name; this is NULL if binlog is not
      used */
      int64_t mysql_log_offset;
      /*!< if MySQL binlog is used, this
      field contains the end offset of the
      binlog entry */
      /*------------------------------*/
      ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables
                                  used in the processing of the current
                                  SQL statement in MySQL */
      ib_uint32_t mysql_n_tables_locked;
      /*!< how many tables the current SQL
      statement uses, except those
      in consistent read */
      /*------------------------------*/
    #ifdef UNIV_DEBUG
      /** Debug-only list membership flag. (The group markers below
      originally introduced "two mutually exclusive fields"; only
      in_rw_trx_list is present in this version of the struct.) */
      /* @{ */
    
      bool in_rw_trx_list; /*!< true if in trx_sys->rw_trx_list */
                           /* @} */
    #endif                 /* UNIV_DEBUG */
      UT_LIST_NODE_T(trx_t)
      mysql_trx_list; /*!< list of transactions created for
                      MySQL; protected by trx_sys->mutex */
    #ifdef UNIV_DEBUG
      bool in_mysql_trx_list;
      /*!< true if in
      trx_sys->mysql_trx_list */
    #endif /* UNIV_DEBUG */
      /*------------------------------*/
      dberr_t error_state;             /*!< 0 if no error, otherwise error
                                       number; NOTE That ONLY the thread
                                       doing the transaction is allowed to
                                       set this field: this is NOT protected
                                       by any mutex */
      const dict_index_t *error_index; /*!< if the error number indicates a
                                       duplicate key error, a pointer to
                                       the problematic index is stored here */
      ulint error_key_num;             /*!< if the index creation fails due to
                                       a duplicate key error, a mysql key
                                       number of that index is stored here */
      sess_t *sess;                    /*!< session of the trx, NULL if none */
      que_t *graph;                    /*!< query currently run in the session,
                                       or NULL if none; NOTE that the query
                                       belongs to the session, and it can
                                       survive over a transaction commit, if
                                       it is a stored procedure with a COMMIT
                                       WORK statement, for instance */
      /*------------------------------*/
      UT_LIST_BASE_NODE_T(trx_named_savept_t)
      trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
                      oldest first */
      /*------------------------------*/
      UndoMutex undo_mutex; /*!< mutex protecting the fields in this
                            section (down to undo_no_arr), EXCEPT
                            last_sql_stat_start, which can be
                            accessed only when we know that there
                            cannot be any activity in the undo
                            logs! */
      undo_no_t undo_no;    /*!< next undo log record number to
                            assign; since the undo log is
                            private for a transaction, this
                            is a simple ascending sequence
                            with no gaps; thus it represents
                            the number of modified/inserted
                            rows in a transaction */
      space_id_t undo_rseg_space;
      /*!< space id where last undo record
      was written */
      trx_savept_t last_sql_stat_start;
      /*!< undo_no when the last sql statement
      was started: in case of an error, trx
      is rolled back down to this undo
      number; see note at undo_mutex! */
      trx_rsegs_t rsegs;    /*!< rollback segments for undo logging */
      undo_no_t roll_limit; /*!< least undo number to undo during
                            a partial rollback; 0 otherwise */
    #ifdef UNIV_DEBUG
      bool in_rollback;   /*!< true when the transaction is
                          executing a partial or full rollback */
    #endif                /* UNIV_DEBUG */
      ulint pages_undone; /*!< number of undo log pages undone
                          since the last undo log truncation */
      /*------------------------------*/
      ulint n_autoinc_rows;       /*!< no. of AUTO-INC rows required for
                                  an SQL statement. This is useful for
                                  multi-row INSERTs */
      ib_vector_t *autoinc_locks; /*!< AUTOINC locks held by this
                                  transaction. Note that these are
                                  also in the lock list trx_locks. This
                                  vector needs to be freed explicitly
                                  when the trx instance is destroyed.
                                  Protected by lock_sys->mutex. */
      /*------------------------------*/
      bool read_only;        /*!< true if transaction is flagged
                             as a READ-ONLY transaction.
                             if auto_commit && will_lock == 0
                             then it will be handled as a
                             AC-NL-RO-SELECT (Auto Commit Non-Locking
                             Read Only Select). A read only
                             transaction will not be assigned an
                             UNDO log. */
      bool auto_commit;      /*!< true if it is an autocommit */
      ib_uint32_t will_lock; /*!< Will acquire some locks. Increment
                             each time we determine that a lock will
                             be acquired by the MySQL layer. */
    #ifndef UNIV_HOTBACKUP
      /*------------------------------*/
      fts_trx_t *fts_trx;       /*!< FTS information, or NULL if
                                transaction hasn't modified tables
                                with FTS indexes (yet). */
      doc_id_t fts_next_doc_id; /*!< The document id used for updates */
      /*------------------------------*/
      ib_uint32_t flush_tables; /*!< if "covering" the "FLUSH TABLES",
                                count of tables being flushed. */
    
      /*------------------------------*/
      bool internal; /*!< true if it is a system/internal
                     transaction background task. Such
                     transactions are always treated as
                     read-write. */
                     /*------------------------------*/
    #ifdef UNIV_DEBUG
      ulint start_line;       /*!< Track where it was started from */
      const char *start_file; /*!< Filename where it was started */
    #endif                    /* UNIV_DEBUG */
    
      lint n_ref; /*!< Count of references, protected
                  by trx_t::mutex. We can't release the
                  locks nor commit the transaction until
                  this reference is 0.  We can change
                  the state to COMMITTED_IN_MEMORY to
                  signify that it is no longer
                  "active". */
    
      /** Version of this instance. It is incremented each time the
      instance is re-used in trx_start_low(). It is used to track
      whether a transaction has been restarted since it was tagged
      for asynchronous rollback. */
      ulint version;
    
      XID *xid;                    /*!< X/Open XA transaction
                                   identification to identify a
                                   transaction branch */
      trx_mod_tables_t mod_tables; /*!< List of tables that were modified
                                   by this transaction */
    #endif                         /* !UNIV_HOTBACKUP */
                                   /*------------------------------*/
      bool api_trx;                /*!< trx started by InnoDB API */
      bool api_auto_commit;        /*!< automatic commit */
      bool read_write;             /*!< if read and write operation */
    
      /*------------------------------*/
      char *detailed_error;          /*!< detailed error message for last
                                     error, or empty. */
      FlushObserver *flush_observer; /*!< flush observer */
    
    #ifdef UNIV_DEBUG
      bool is_dd_trx; /*!< True if the transaction is used for
                      doing Non-locking Read-only Read
                      Committed on DD tables */
    #endif            /* UNIV_DEBUG */
      ulint magic_n;
    
      /** @return true if isolation_level equals READ_UNCOMMITTED */
      bool is_read_uncommitted() const {
        return (isolation_level == READ_UNCOMMITTED);
      }
    
      /** Tell whether gap locks are skipped at the current isolation level.
      @return true for READ_UNCOMMITTED and READ_COMMITTED, false for
      REPEATABLE_READ and SERIALIZABLE */
      bool skip_gap_locks() const {
        switch (isolation_level) {
          case READ_UNCOMMITTED:
          case READ_COMMITTED:
            return (true);
          case REPEATABLE_READ:
          case SERIALIZABLE:
            return (false);
        }
        /* Only reached when isolation_level holds none of the four
        enumerators: hit ut_ad(0), then fall back to false. */
        ut_ad(0);
        return (false);
      }
    
      /** @return the same value as skip_gap_locks() */
      bool allow_semi_consistent() const { return (skip_gap_locks()); }
    };
    
    

    上面太长不看,简化如下:

    /* Abridged view of trx_t: only the lock member is shown, and each "..."
    stands for the members omitted from the full definition. */
    struct trx_t {
    	...
    	trx_lock_t lock;   /*!< Information about the transaction
                         locks and state. Protected by
                         trx->mutex or lock_sys->mutex
                         or both */
    	...
    }
    

    在 8.0 中,将 lock 单独从 trx_t 中解耦出来。观察 trx_lock_t 的定义。

    /** Latching protocol for trx_lock_t::que_state.  trx_lock_t::que_state
     captures the state of the query thread during the execution of a query.
     This is different from a transaction state. The query state of a transaction
     can be updated asynchronously by other threads.  The other threads can be
     system threads, like the timeout monitor thread or user threads executing
     other queries. Another thing to be mindful of is that there is a delay between
     when a query thread is put into LOCK_WAIT state and before it actually starts
     waiting.  Between these two events it is possible that the query thread is
     granted the lock it was waiting for, which implies that the state can be
     changed asynchronously.
    
     All these operations take place within the context of locking. Therefore state
     changes within the locking code must acquire both the lock mutex and the
     trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
     trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient
     to only acquire the trx->mutex.
     To query the state either of the mutexes is sufficient within the locking
     code and no mutex is required when the query thread is no longer waiting. */
    
    /** The locks and state of an active transaction. Protected by
    lock_sys->mutex, trx->mutex or both.

    Besides the list of locks requested by the transaction (trx_locks) and
    the lock it is currently waiting for (wait_lock), this struct carries the
    query-thread state, deadlock-detection bookkeeping and the pools of
    pre-allocated lock objects. */
    struct trx_lock_t {
      ulint n_active_thrs; /*!< number of active query threads */
    
      trx_que_t que_state; /*!< valid when trx->state
                           == TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
                           TRX_QUE_LOCK_WAIT, ... */
    
      lock_t *wait_lock;         /*!< if trx execution state is
                                 TRX_QUE_LOCK_WAIT, this points to
                                 the lock request, otherwise this is
                                 NULL; set to non-NULL when holding
                                 both trx->mutex and lock_sys->mutex;
                                 set to NULL when holding
                                 lock_sys->mutex; readers should
                                 hold lock_sys->mutex, except when
                                 they are holding trx->mutex and
                                 wait_lock==NULL */
      ib_uint64_t deadlock_mark; /*!< A mark field that is initialized
                                 to and checked against lock_mark_counter
                                 by lock_deadlock_recursive(). */
      bool was_chosen_as_deadlock_victim;
      /*!< when the transaction decides to
      wait for a lock, it sets this to false;
      if another transaction chooses this
      transaction as a victim in deadlock
      resolution, it sets this to true.
      Protected by trx->mutex. */
      time_t wait_started; /*!< lock wait started at this time,
                           protected only by lock_sys->mutex */
    
      que_thr_t *wait_thr; /*!< query thread belonging to this
                           trx that is in QUE_THR_LOCK_WAIT
                           state. For threads suspended in a
                           lock wait, this is protected by
                           lock_sys->mutex. Otherwise, this may
                           only be modified by the thread that is
                           serving the running transaction. */
    
      lock_pool_t rec_pool; /*!< Pre-allocated record locks */
    
      lock_pool_t table_pool; /*!< Pre-allocated table locks */
    
      ulint rec_cached; /*!< Next free rec lock in pool (rec_pool) */
    
      ulint table_cached; /*!< Next free table lock in pool
                          (table_pool) */
    
      mem_heap_t *lock_heap; /*!< memory heap for trx_locks;
                             protected by lock_sys->mutex */
    
      trx_lock_list_t trx_locks; /*!< locks requested by the transaction;
                                 insertions are protected by trx->mutex
                                 and lock_sys->mutex; removals are
                                 protected by lock_sys->mutex */
    
      lock_pool_t table_locks; /*!< All table locks requested by this
                               transaction, including AUTOINC locks */
    
      ulint n_rec_locks; /*!< number of rec locks in this trx */
    #ifdef UNIV_DEBUG
      /** When a transaction is forced to rollback due to a deadlock
      check or by another high priority transaction this is true. Used
      by debug checks in lock0lock.cc */
      bool in_rollback;
    #endif /* UNIV_DEBUG */
    
      /** The transaction called ha_innobase::start_stmt() to
      lock a table. Most likely a temporary table. */
      bool start_stmt;
    };
    
    /** Base node type of a list whose elements are lock_t objects. */
    typedef UT_LIST_BASE_NODE_T(lock_t) trx_lock_list_t;
    
    /* The trx_lock_t::trx_locks member, repeated here next to its type. */
    trx_lock_list_t trx_locks; /*!< locks requested by the transaction;
                                 insertions are protected by trx->mutex
                                 and lock_sys->mutex; removals are
                                 protected by lock_sys->mutex */
    

    每个事务都会维护执行过程中加的所有锁( trx_locks,双向链表),以及发生锁等待时所等待的锁( wait_lock )。

    lock_sys 中保存的对象为 lock_t ,它可以描述一个行锁或者表锁,如果是行锁,会被 lock_sys 维护,若是表锁,会被对应的 dict_table_t 维护

    /** Lock struct; protected by lock_sys->mutex */
    struct lock_t {
      /** transaction owning the lock */
      trx_t *trx;
    
      /** list of the locks of the transaction */
      UT_LIST_NODE_T(lock_t) trx_locks;
    
      /** Index for a record lock */
      dict_index_t *index;
    
      /** Hash chain node for a record lock. The link node in a singly
      linked list, used by the hash table. */
      lock_t *hash;
    
      union {
        /** Table lock */
        lock_table_t tab_lock;
    
        /** Record lock */
        lock_rec_t rec_lock;
      };
    
    #ifdef HAVE_PSI_THREAD_INTERFACE
    #ifdef HAVE_PSI_DATA_LOCK_INTERFACE
      /** Performance schema thread that created the lock. */
      ulonglong m_psi_internal_thread_id;
    
      /** Performance schema event that created the lock. */
      ulonglong m_psi_event_id;
    #endif /* HAVE_PSI_DATA_LOCK_INTERFACE */
    #endif /* HAVE_PSI_THREAD_INTERFACE */
    
      /** The lock type and mode bit flags.
      LOCK_GAP or LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION, wait flag, ORed */
      uint32_t type_mode;
    
    #if defined(UNIV_DEBUG)
      /** Timestamp when it was created. */
      uint64_t m_seq;
    #endif /* UNIV_DEBUG */
    
      /** Remove GAP lock from a next Key Lock */
      void remove_gap_lock() {
        ut_ad(!is_gap());
        ut_ad(!is_insert_intention());
        ut_ad(is_record_lock());
    
        type_mode |= LOCK_REC_NOT_GAP;
      }
    
      /** Determine if the lock object is a record lock.
      @return true if record lock, false otherwise. */
      bool is_record_lock() const { return (type() == LOCK_REC); }
    
      /** Determine if it is predicate lock.
      @return true if predicate lock, false otherwise. */
      bool is_predicate() const {
        return (type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
      }
    
      /** @return true if the lock wait flag is set */
      bool is_waiting() const { return (type_mode & LOCK_WAIT); }
    
      /** @return true if the gap lock bit is set */
      bool is_gap() const { return (type_mode & LOCK_GAP); }
    
      /** @return true if the not gap lock bit is set */
      bool is_record_not_gap() const { return (type_mode & LOCK_REC_NOT_GAP); }
    
      /** @return true if the insert intention bit is set */
      bool is_insert_intention() const {
        return (type_mode & LOCK_INSERT_INTENTION);
      }
    
      /** @return the lock mode */
      uint32_t type() const { return (type_mode & LOCK_TYPE_MASK); }
    
      /** @return the precise lock mode */
      lock_mode mode() const {
        return (static_cast<lock_mode>(type_mode & LOCK_MODE_MASK));
      }
    
      /** Get lock hash table
      @return lock hash table */
      hash_table_t *hash_table() const { return (lock_hash_get(type_mode)); }
    
      /** @return the record lock tablespace ID */
      space_id_t space_id() const {
        ut_ad(is_record_lock());
    
        return (rec_lock.space);
      }
    
      /** @return the record lock page number */
      page_no_t page_no() const {
        ut_ad(is_record_lock());
    
        return (rec_lock.page_no);
      }
    
      /** @return the query thread state (trx->lock.que_state) of the
      transaction that owns this lock */
      trx_que_t trx_que_state() const {
        return trx->lock.que_state;
      }
    
      /** Print this lock object into the given output stream. Only the
      declaration is shown here; the definition lives elsewhere.
      @param[in,out] out the output stream to write to
      @return the given output stream, so calls can be chained. */
      std::ostream &print(std::ostream &out) const;
    
      /** Convert the member 'type_mode' into a human readable string. Only
      the declaration is shown here; the definition lives elsewhere.
      @return human readable string describing type_mode */
      std::string type_mode_string() const;
    
      /* @return the string/text representation of the record type. */
      const char *type_string() const {
        switch (type_mode & LOCK_TYPE_MASK) {
          case LOCK_REC:
            return ("LOCK_REC");
          case LOCK_TABLE:
            return ("LOCK_TABLE");
          default:
            ut_error;
        }
      }
    };
    

    不论行锁还是表锁,都记录了所属的事务,锁创建后会通过 trx_locks(next和prev指针)添加到所属事务已获得锁链表中( trx_t.lock.trx_locks ),如果创建的锁为行锁,会使用(space_no , page_no )经过hash函数计算后添加到 lock_sys 中。如果创建的锁为表锁,将其加入到对应dict_table_t的表锁链表尾部(lock_table_create)。

    行锁与表锁

    InnoDB 中一个事务中的行锁是按照 page 和锁类型组织,相同的锁类型,锁一行和锁这行所在 page 中所有行的存储代价是一样的。lock_rec_t 中的成员 n_bits 后面紧跟着一个 bitmap,bitmap 占用的存储空间为:(1 + (n_bits-1)/8) bytes,即:lock_rec_t 占用的实际存储空间为:sizeof(lock_rec_t) + 1 + (n_bits-1)/8。bitmap 中的每一位标识对应的行是否加锁。

    /** Record lock for a page. Identifies the page by (space, page_no) and
    stores the size of the lock bitmap; the bitmap itself is not a member of
    this struct but is placed in memory immediately after it. */
    struct lock_rec_t {
      space_id_t space;  /*!< tablespace id of the page */
      page_no_t page_no; /*!< page number of the page */
      uint32_t n_bits;   /*!< number of bits in the lock
                         bitmap; NOTE: the lock bitmap is
                         placed immediately after the
                         lock struct */
    
      /** Print the record lock into the given output stream.
      @param[in,out] out the output stream to write to
      @return the given output stream. */
      std::ostream &print(std::ostream &out) const;
    };
    

    lock_table_t 表锁,保存了对应的 table 和 table上表锁链接 locks(prev和next指针)

    /** A table lock. Holds the locked table and the list node that links
    this lock with the other locks on the same table. */
    struct lock_table_t {
      dict_table_t *table; /*!< database table in dictionary
                           cache */
      UT_LIST_NODE_T(lock_t)
      locks; /*!< list node: list of locks on the same
             table */
      /** Print the table lock into the given output stream.
      @param[in,out] out the output stream to write to
      @return the given output stream. */
      std::ostream &print(std::ostream &out) const;
    };
    

    lock_t 表示一个锁,表锁或者行锁,其中的 trx_locks 成员会被链接到 trx_t 中的已加锁链表 trx_t.lock.trx_locks 中,而 lock_table_t 中的 locks 成员是需要链接到对应 table 后面,作用不同但容易混淆;lock_t中的 hash 和 index 成员放到 lock_rec_t 结构体中更加合适,可能是基于 lock_t 是行锁的可能性更大的前提考虑的

    Wait-For-Graph在InnoDB中的实现

    InnoDB 在实现 Wait-For-Graph 时基于性能方面的考虑,定义了两个变量:

    LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK (默认200):深度优先遍历的层数超过此值,即认为发生死锁,回滚当前事务

    LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK(默认1000000):lock_deadlock_recursive递归调用的次数超过此值,即认为发生死锁,回滚当前事务

    笔者测试的版本中,死锁检测单独为一个类 DeadlockChecker

    /** Deadlock detector for lock waits. A search walks the lock queues
    depth-first, but iteratively: instead of recursing it saves its position
    on a bounded, statically allocated stack (s_states) via push()/pop().
    The search is abandoned as "too deep" when either the stack depth or the
    number of visited nodes exceeds its configured limit. */
    class DeadlockChecker {
     public:
      /** Checks if a joining lock request results in a deadlock. If
      a deadlock is found this function will resolve the deadlock
      by choosing a victim transaction and rolling it back. It
      will attempt to resolve all deadlocks. The returned transaction
      will be the joining transaction, or NULL (0) if some other
      transaction was chosen as a victim and rolled back or no
      deadlock was found.
    
      @param lock lock the transaction is requesting
      @param trx transaction requesting the lock
    
      @return transaction chosen as victim or NULL (0) */
      static const trx_t *check_and_resolve(const lock_t *lock, trx_t *trx);
    
     private:
      /** Do a shallow copy. Default destructor OK.
      The counters (m_cost, m_n_elems) and the m_too_deep flag are
      value-initialized, i.e. start at zero / false.
      @param trx the start transaction (start node)
      @param wait_lock lock that a transaction wants
      @param mark_start visited node counter */
      DeadlockChecker(const trx_t *trx, const lock_t *wait_lock,
                      uint64_t mark_start)
          : m_cost(),
            m_start(trx),
            m_too_deep(),
            m_wait_lock(wait_lock),
            m_mark_start(mark_start),
            m_n_elems() {}
    
      /** Check if the search is too deep.
      @return true if the stack depth exceeds
      LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK or the number of steps exceeds
      LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK */
      bool is_too_deep() const {
        return (m_n_elems > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK ||
                m_cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK);
      }
    
      /** Save current state (the lock being examined, its heap number and
      the current m_wait_lock) on the search stack.
      @param lock lock to push on the stack.
      @param heap_no the heap number to push on the stack; must be
      ULINT_UNDEFINED exactly when lock is a table lock.
      @return false if stack is full. */
      bool push(const lock_t *lock, ulint heap_no) {
        ut_ad((lock_get_type_low(lock) & LOCK_REC) ||
              (lock_get_type_low(lock) & LOCK_TABLE));
    
        ut_ad(((lock_get_type_low(lock) & LOCK_TABLE) != 0) ==
              (heap_no == ULINT_UNDEFINED));
    
        /* Ensure that the stack is bounded. */
        if (m_n_elems >= UT_ARR_SIZE(s_states)) {
          return (false);
        }
    
        state_t &state = s_states[m_n_elems++];
    
        state.m_lock = lock;
        state.m_wait_lock = m_wait_lock;
        state.m_heap_no = heap_no;
    
        return (true);
      }
    
      /** Restore state: pops the top stack entry and also restores
      m_wait_lock to the value saved by the matching push().
      @param[out] lock current lock
      @param[out] heap_no current heap_no */
      void pop(const lock_t *&lock, ulint &heap_no) {
        ut_a(m_n_elems > 0);
    
        const state_t &state = s_states[--m_n_elems];
    
        lock = state.m_lock;
        heap_no = state.m_heap_no;
        m_wait_lock = state.m_wait_lock;
      }
    
      /** Check whether the node has been visited, i.e. whether the owning
      transaction's deadlock_mark is newer than the mark taken at the start
      of this check.
      @param lock lock to check
      @return true if the node has been visited */
      bool is_visited(const lock_t *lock) const {
        return (lock->trx->lock.deadlock_mark > m_mark_start);
      }
    
      /** Get the next lock in the queue that is owned by a transaction
      whose sub-tree has not already been searched.
      @param lock Lock in queue
      @param heap_no heap_no if lock is a record lock else ULINT_UNDEFINED
      @return next lock or NULL if at end of queue */
      const lock_t *get_next_lock(const lock_t *lock, ulint heap_no) const;
    
      /** Get the first lock to search. The search starts from the current
      wait_lock. What we are really interested in is an edge from the
      current wait_lock's owning transaction to another transaction that has
      a lock ahead in the queue. We skip locks where the owning transaction's
      sub-tree has already been searched.
    
      For record locks, we first position the iterator on first lock on
      the page and then reposition on the actual heap_no. This is required
      due to the way the record lock hash is implemented.
    
      @param[out] heap_no if rec lock, else ULINT_UNDEFINED.
    
      @return first lock or NULL */
      const lock_t *get_first_lock(ulint *heap_no) const;
    
      /** Notify that a deadlock has been detected and print the conflicting
      transaction info.
      @param lock lock causing deadlock */
      void notify(const lock_t *lock) const;
    
      /** Select the victim transaction that should be rolled back.
      @return victim transaction */
      const trx_t *select_victim() const;
    
      /** Rollback transaction selected as the victim. */
      void trx_rollback();
    
      /** Looks iteratively for a deadlock. Note: the joining transaction
      may have been granted its lock by the deadlock checks.
    
      @return NULL (0) if no deadlock else the victim transaction.*/
      const trx_t *search();
    
    #ifdef UNIV_DEBUG
      /** Determines if a situation in which the lock takes part in a deadlock
      cycle is expected (as in: handled correctly) or not (say because it is on a DD
      table, for which there is no reason to expect a deadlock and we don't handle
      deadlocks correctly). The purpose of the function is to use it in an assertion
      failing as soon as the deadlock is identified, to give developer a chance to
      investigate the root cause of the situation (without such assertion, the code
      might continue to run and either fail at later stage when the data useful for
      debugging is no longer on stack, or not fail at all, which is risky).
      @param[in] lock lock found in a deadlock cycle
      @return true if we expect that this lock can take part in a deadlock cycle */
      static bool is_allowed_to_be_on_cycle(const lock_t *lock);
    #endif /* UNIV_DEBUG */
    
      /** Print transaction data to the deadlock file and possibly to stderr.
      @param trx transaction
      @param max_query_len max query length to print */
      static void print(const trx_t *trx, ulint max_query_len);
    
      /** rewind(3) the file used for storing the latest detected deadlock
      and print a heading message to stderr if printing of all deadlocks to
      stderr is enabled. */
      static void start_print();
    
      /** Print lock data to the deadlock file and possibly to stderr.
      @param lock record or table type lock */
      static void print(const lock_t *lock);
    
      /** Print a message to the deadlock file and possibly to stderr.
      @param msg message to print */
      static void print(const char *msg);
    
      /** Print info about transaction that was rolled back.
      @param trx transaction rolled back
      @param lock lock trx wants */
      static void rollback_print(const trx_t *trx, const lock_t *lock);
    
     private:
      /** DFS state information, used during deadlock checking. One entry is
      saved per level of the search by push() and restored by pop(). */
      struct state_t {
        const lock_t *m_lock;      /*!< Current lock */
        const lock_t *m_wait_lock; /*!< Waiting for lock */
        ulint m_heap_no;           /*!< heap number if rec lock */
      };
    
      /** Used in deadlock tracking. Protected by lock_sys->mutex. */
      static uint64_t s_lock_mark_counter;
    
      /** Calculation steps thus far. It is the count of the nodes visited. */
      ulint m_cost;
    
      /** Joining transaction that is requesting a lock in an
      incompatible mode */
      const trx_t *m_start;
    
      /** true if search was too deep and was aborted */
      bool m_too_deep;
    
      /** Lock that trx wants */
      const lock_t *m_wait_lock;
    
      /**  Value of lock_mark_count at the start of the deadlock check. */
      uint64_t m_mark_start;
    
      /** Number of states pushed onto the stack */
      size_t m_n_elems;
    
      /** This is to avoid malloc/free calls. */
      static state_t s_states[MAX_STACK_SIZE];
    };
    
    /** Looks iteratively for a deadlock. Note: the joining transaction may
    have been granted its lock by the deadlock checks.
    
    Starting from the locks returned by get_first_lock() for m_wait_lock, each
    lock that m_wait_lock has to wait for is examined. If its owner is m_start,
    a cycle has been found. If its owner is itself in TRX_QUE_LOCK_WAIT, the
    current position is pushed on the stack and the search continues with that
    owner's wait_lock. When a queue is exhausted, the previous position is
    popped and the search resumes with the next lock.
    
    @return NULL (0) if no deadlock was found; the result of select_victim()
    if a cycle was found; m_start (with m_too_deep set) if the search was
    aborted because it went too deep or the stack was full. */
    const trx_t *DeadlockChecker::search() {
      ut_ad(lock_mutex_own());
      ut_ad(!trx_mutex_own(m_start));
    
      ut_ad(m_start != NULL);
      ut_ad(m_wait_lock != NULL);
      check_trx_state(m_wait_lock->trx);
      ut_ad(m_mark_start <= s_lock_mark_counter);
    
      /* Look at the locks ahead of wait_lock in the lock queue. */
      ulint heap_no;
      const lock_t *lock = get_first_lock(&heap_no);
    
      for (;;) {
        /* We should never visit the same sub-tree more than once. */
        ut_ad(lock == NULL || !is_visited(lock));
    
        /* The current queue is exhausted: unwind the stack until a level
        with a remaining lock is found or the stack is empty. */
        while (m_n_elems > 0 && lock == NULL) {
          /* Restore previous search state. */
    
          pop(lock, heap_no);
    
          lock = get_next_lock(lock, heap_no);
        }
    
        if (lock == NULL) {
          /* Stack is empty and nothing is left to examine. */
          break;
        } else if (lock == m_wait_lock) {
          /* Reached the waiting lock itself.
          We can mark this subtree as searched */
          ut_ad(lock->trx->lock.deadlock_mark <= m_mark_start);
    
          lock->trx->lock.deadlock_mark = ++s_lock_mark_counter;
    
          /* We are not prepared for an overflow. This 64-bit
          counter should never wrap around. At 10^9 increments
          per second, it would take 10^3 years of uptime. */
    
          ut_ad(s_lock_mark_counter > 0);
    
          /* Backtrack */
          lock = NULL;
    
        } else if (!lock_has_to_wait(m_wait_lock, lock)) {
          /* No conflict, next lock */
          lock = get_next_lock(lock, heap_no);
    
        } else if (lock->trx == m_start) {
          /* Found a cycle: a conflicting lock is owned by the transaction
          the search started from. */
    
          notify(lock);
    
          /* We don't expect deadlocks with most DD tables and all SDI tables.
          If we find, we crash early to find the transactions causing deadlock */
          ut_ad(is_allowed_to_be_on_cycle(lock));
          ut_ad(is_allowed_to_be_on_cycle(m_wait_lock));
    
          return (select_victim());
    
        } else if (is_too_deep()) {
          /* Search too deep to continue: give up and report the joining
          transaction. */
          m_too_deep = true;
          return (m_start);
    
        } else if (lock->trx_que_state() == TRX_QUE_LOCK_WAIT) {
          /* Another trx ahead has requested a lock in an
          incompatible mode, and is itself waiting for a lock. */
    
          ++m_cost;
    
          /* Remember where we are so the search can resume here after the
          descent; a full stack is treated like a too-deep search. */
          if (!push(lock, heap_no)) {
            m_too_deep = true;
            return (m_start);
          }
    
          /* Descend: continue the search from the lock that the owner of
          the conflicting lock is waiting for. */
          m_wait_lock = lock->trx->lock.wait_lock;
    
          lock = get_first_lock(&heap_no);
    
          /* Skip a first lock whose owner has already been marked as
          searched during this check. */
          if (is_visited(lock)) {
            lock = get_next_lock(lock, heap_no);
          }
    
        } else {
          /* The conflicting lock's owner is not waiting itself, so no edge
          to follow from it; move on to the next lock. */
          lock = get_next_lock(lock, heap_no);
        }
      }
    
      ut_a(lock == NULL && m_n_elems == 0);
    
      /* No deadlock found. */
      return (0);
    }
    

    InnoDB通常在哪些情况下可能发生死锁

    回顾一下cs课本上关于发生死锁的几个条件:

    1. 资源互斥;
    2. 请求保持;
    3. 不剥夺;
    4. 循环等待

    对于DB而言,导致死锁意味着发生了循环等待,在InnoDB中由于行锁的引入,比较容易发生死锁,下面总结一些发生死锁的情况(不全):

    1. 同一索引上,两个session相反的顺序加锁多行记录
    2. Primary key和Secondary index,通过primary key找到记录,更新Secondary index字段与通过Secondary index更新记录
    3. UPDATE/DELETE通过不同的二级索引更新多条记录,可能造成在Primary key上不同的加锁顺序
  • 相关阅读:
    KubeCon 2020 演讲集锦|《阿里巴巴云原生技术与实践 13 讲》开放下载
    突围数字化转型,让特步同比增长24.8%的全渠道中台
    阿里云飞天大数据产品价值解读——《一站式高质量搜索开放搜索》
    高德AR驾车导航解决方案
    我在阿里写代码学会的六件事
    送外卖也要“黑科技”?阿里移动感知技术应用揭秘
    阿里云机器学习怎么玩?这本新手入门指南揭秘了!
    用户自定义类型03 零基础入门学习Delphi33
    用户自定义类型03 零基础入门学习Delphi33
    用户自定义类型01 零基础入门学习Delphi31
  • 原文地址:https://www.cnblogs.com/Forgenvueory/p/12023954.html
Copyright © 2020-2023  润新知