数据库内核月报－ 2019/02 - MySQL · 源码分析 · 8.0 Functional index的实现过程 - 《数据库内核月报》

MySQL从8.0.13开始支持functional index。Functional index类似于ORACLE的Function-Based Indexes。该索引可以根据将索引定义的表达式的值按照索引顺序存到索引里，进而减少表达式的计算，加速查询。

下面我们看一下如何创建一个functional index：

CREATE TABLE t1 (col1 INT, col2 INT, INDEX func_index ((ABS(col1))));
CREATE INDEX idx1 ON t1 ((col1 + col2));
CREATE INDEX idx2 ON t1 ((col1 + col2), (col1 - col2), col1);
ALTER TABLE t1 ADD INDEX ((col1 * 40) DESC);

接下来我们继续看一下functional index的效果：

mysql> CREATE TABLE t1 (col1 INT, col2 INT);
Query OK, 0 rows affected (0.13 sec)
mysql> SELECT * FROM t1 WHERE col1+col2 > 10;
Empty set (0.01 sec)
mysql> EXPLAIN SELECT * FROM t1 WHERE col1+col2 > 10;
+----+-------------+-------+------------+------+---------------+------+---------+------+------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key  | key_len | ref  | rows | filtered | Extra       |
+----+-------------+-------+------------+------+---------------+------+---------+------+------+----------+-------------+
|  1 | SIMPLE      | t1    | NULL       | ALL  | NULL          | NULL | NULL    | NULL |    1 |   100.00 | Using where |
+----+-------------+-------+------------+------+---------------+------+---------+------+------+----------+-------------+
1 row in set, 1 warning (0.01 sec)
mysql> CREATE INDEX idx1 ON t1 ((col1 + col2));
Query OK, 0 rows affected (0.14 sec)
Records: 0  Duplicates: 0  Warnings: 0
mysql> EXPLAIN SELECT * FROM t1 WHERE col1+col2 > 10;
+----+-------------+-------+------------+-------+---------------+------+---------+------+------+----------+-------------+
| id | select_type | table | partitions | type  | possible_keys | key  | key_len | ref  | rows | filtered | Extra       |
+----+-------------+-------+------------+-------+---------------+------+---------+------+------+----------+-------------+
|  1 | SIMPLE      | t1    | NULL       | range | idx1          | idx1 | 9       | NULL |    1 |   100.00 | Using where |
+----+-------------+-------+------------+-------+---------------+------+---------+------+------+----------+-------------+
1 row in set, 1 warning (0.00 sec)

从上面的例子中我们可以看到查询中使用了functional索引 idx1来加速查询。

MySQL的functinal index是利用generated column来辅助实现的，后面的章节中我们会详细的进行分析。所以对于创建functional index的一些限制可以参考：创建generated column 以及增加generated column。

下面我们从源码来看一下MySQL functional index的实现过程。

上面的流程图是MySQL创建functional index的一个基本流程。我们重点看一下add_functional_index_to_create_list这个函数的处理过程。

/**
  Prepares a functional index by adding a hidden indexed generated column for the key part.
  A functional index is implemented as a hidden generated column over the
  expression specified in the index, and the hidden generated column is then indexed. This function adds a hidden generated column to the Create_list, and updates the key specification to point to this new column. The generated column is given a name that is a hash of the key name and the key part number.
*/
static bool add_functional_index_to_create_list(THD *thd,
                        Key_spec *key_spec,
                        Alter_info *alter_info,
                        Key_part_spec *kp,
                        uint key_part_number,
                        HA_CREATE_INFO *create_info) {
  // A functional index cannot be a primary key
  /* 这里限制了functional index 不能作为主键，因为它是个generated column */
  if (key_spec->type == KEYTYPE_PRIMARY) {
    my_error(ER_FUNCTIONAL_INDEX_PRIMARY_KEY, MYF(0));
    return true;
  }
  // If the key isn't given a name explicitly by the user, we must auto-generate
  // a name here. "Normal" indexes will be given a name in prepare_key(), but
  // that is too late for functional indexes since we want the hidden generated
  // column name to be based on the index name.
  // 生成一个默认的索引名称
  if (key_spec->name.str == nullptr) {
    std::string key_name;
    int count = 2;
    key_name.assign("functional_index");
    while (key_name_exists(alter_info->key_list, key_name, nullptr)) {
      key_name.assign("functional_index_");
      key_name.append(std::to_string(count++));
    }
    key_spec->name.length = key_name.size();
    key_spec->name.str = strmake_root(thd->stmt_arena->mem_root,
                                      key_name.c_str(), key_name.size());
  } else {    if (key_name_exists(alter_info->key_list,
                        {key_spec->name.str, key_spec->name.length},
                        key_spec)) {
      my_error(ER_DUP_KEYNAME, MYF(0), key_spec->name.str);
      return true;
    }
  }
  // First we need to resolve the expression in the functional index so that we
  // know the correct collation, data type, length etc...
  ulong saved_privilege = thd->want_privilege;
  thd->want_privilege = SELECT_ACL;
  {
    // Create a scope guard so that we are guaranteed that the privileges are
    // set back to the original value.
    auto handler_guard = create_scope_guard(
        [thd, saved_privilege]() { thd->want_privilege = saved_privilege; });
    Functional_index_error_handler error_handler(
        {key_spec->name.str, key_spec->name.length}, thd);
    Item *expr = kp->get_expression();
    if (expr->type() == Item::FIELD_ITEM) {
      my_error(ER_FUNCTIONAL_INDEX_ON_FIELD, MYF(0));
      return true;
    }
    // 这里验证表达式的合法性，是否违反generated column的约束条件
    if (pre_validate_value_generator_expr(kp->get_expression(),
                                          key_spec->name.str, true)) {
      return true;
    }
    Replace_field_processor_arg replace_field_argument(
        thd, &alter_info->create_list, create_info, key_spec->name.str);
    if (expr->walk(&Item::replace_field_processor, Item::WALK_PREFIX,
                   reinterpret_cast<uchar *>(&replace_field_argument))) {
      return true;
    }
    if (kp->resolve_expression(thd)) return true;
  }
  // 默认隐式列生成一个名字
  const char *field_name = make_functional_index_column_name(
      {key_spec->name.str, key_spec->name.length}, key_part_number,
      thd->stmt_arena->mem_root);
  Item *item = kp->get_expression();
  // Ensure that we aren't trying to index a field
  DBUG_ASSERT(item->type() != Item::FIELD_ITEM);  TABLE tmp_table;
  TABLE_SHARE share;
  tmp_table.s = &share;
  init_tmp_table_share(thd, &share, "", 0, "", "", nullptr);
  tmp_table.s->db_create_options = 0;
  tmp_table.s->db_low_byte_first = false;
  tmp_table.set_not_started();
  // 生成generated column的创建信息
  Create_field *cr = generate_create_field(thd, item, &tmp_table);
  if (cr == nullptr) {
    return true; /* purecov: deadcode */
  }
  if (is_blob(cr->sql_type)) {
    my_error(ER_FUNCTIONAL_INDEX_ON_LOB, MYF(0));
    return true;
  }
  cr->field_name = field_name;
  cr->field = nullptr;
  cr->hidden = dd::Column::enum_hidden_type::HT_HIDDEN_SQL;
  cr->stored_in_db = false;
  Value_generator *gcol_info = new (thd->mem_root) Value_generator();
  gcol_info->expr_item = kp->get_expression();
  // 生成一个virtual generated column
  gcol_info->set_field_stored(false);
  gcol_info->set_field_type(cr->sql_type);  cr->gcol_info = gcol_info;
  alter_info->create_list.push_back(cr);
  alter_info->flags |= Alter_info::ALTER_ADD_COLUMN;
  // 这里将KEY的索引列设置为隐式generated column
  kp->set_name_and_prefix_length(field_name, 0);
  return false;
}

函数的注释里面说的非常详细，functional index的创建过程依赖于generated column来做辅助。创建functional index的时候都要隐式的创建一个generated column，然后在该generated column上创建对应的索引。

上面我们看到了源码中是如何创建一个functional index。那么接下来我们继续看一下MySQL是如何为查询寻找合适的functional index的。

就拿上面的例子看一下调用堆栈：

EXPLAIN SELECT * FROM t1 WHERE col1+col2 > 10;
#0  substitute_gc (thd=0x2aab94000be0, select_lex=0x2aab94270298, where_cond=0x2aab94271ec8, group_list=0x0, order=0x0) 
#1  0x0000000003049283 in JOIN::optimize (this=0x2aab94272750) 
#2  0x0000000003165c32 in SELECT_LEX::optimize (this=0x2aab94270298, thd=0x2aab94000be0) 
#3  0x000000000316221c in Sql_cmd_dml::execute_inner (this=0x2aab94272078, thd=0x2aab94000be0) 
#4  0x00000000031614d3 in Sql_cmd_dml::execute (this=0x2aab94272078, thd=0x2aab94000be0) 
#5  0x00000000030a7396 in mysql_execute_command (thd=0x2aab94000be0, first_level=true) 
#6  0x00000000030ac74b in mysql_parse (thd=0x2aab94000be0, parser_state=0x2aab8c2462d0, force_primary_storage_engine=false)
#7  0x0000000003095b0d in dispatch_command (thd=0x2aab94000be0, com_data=0x2aab8c246c40, command=COM_QUERY) 
#8  0x0000000003091d7d in do_command (thd=0x2aab94000be0) 
#9  0x00000000033d145b in handle_connection (arg=0xcb9cee0) 
#10 0x00000000066cd007 in pfs_spawn_thread (arg=0xca3bde0) 
#11 0x00002aaaaacd4aa1 in start_thread () from /lib64/libpthread.so.0
#12 0x00002aaaabfb993d in clone () from /lib64/libc.so.6

上面的堆栈可以看到优化器调用了substitute_gc这个函数，这个函数就可以将WHERE，GROUP_BY 以及ORDER BY中的相关表达式替换为隐式的generated column，进而可以让优化器来选择functional index。我们再来研究一下substitute_gc这个函数的源码。

bool substitute_gc(THD *thd, SELECT_LEX *select_lex, Item *where_cond,
                   ORDER *group_list, ORDER *order) {
  List<Field> indexed_gc;
  Opt_trace_context *const trace = &thd->opt_trace;
  Opt_trace_object trace_wrapper(trace);
  Opt_trace_object subst_gc(trace, "substitute_generated_columns");
  // Collect all GCs that are a part of a key
  // 这里要遍历所有的表来收集所有可以被替换的generated columns。后面的代码中会分析哪些表达式可以被替换
  for (TABLE_LIST *tl = select_lex->leaf_tables; tl; tl = tl->next_leaf) {
    if (tl->table->s->keys == 0) continue;
    for (uint i = 0; i < tl->table->s->fields; i++) {
      Field *fld = tl->table->field[i];
      // 这里判断只有在索引中的列并且generated column可以用来替换表达式才会作为候选的列。
      if (fld->is_gcol() &&
          !(fld->part_of_key.is_clear_all() &&
            fld->part_of_prefixkey.is_clear_all()) &&
          fld->gcol_info->expr_item->can_be_substituted_for_gc()) {
        // Don't check allowed keys here as conditions/group/order use
        // different keymaps for that.
        indexed_gc.push_back(fld);
      }
    }
  }  // No GC in the tables used in the query
  if (indexed_gc.elements == 0) return false;
  if (where_cond) {
    // Item_func::compile will dereference this pointer, provide valid value.
    uchar i, *dummy = &i;
    /**
      这里会利用generated column来替换where_cond里面对应的表达式。
      Item::gc_subst_analyzer 该虚函数定义了每一种Item是否需要进行generated column的替换过程
      Item::gc_subst_transformer 该函数定义了每一种可替换的Item如何利用generated column进行替换
    */
    where_cond->compile(&Item::gc_subst_analyzer, &dummy,
                        &Item::gc_subst_transformer, 
                        (uchar *)&indexed_gc);
    subst_gc.add("resulting_condition", where_cond);
  }
  if (!(group_list || order)) return false;
  // Filter out GCs that do not have index usable for GROUP/ORDER
  Field *gc;
  List_iterator<Field> li(indexed_gc);
  while ((gc = li++)) {
    Key_map tkm = gc->part_of_key;
    // 这里判断generated column相关的索引是否与group-by 或者 order-by的列有交集，如果没有相关性，就忽略。
    tkm.intersect(group_list ? gc->table->keys_in_use_for_group_by
                             : gc->table->keys_in_use_for_order_by);
    if (tkm.is_clear_all()) li.remove();
  }
  if (!indexed_gc.elements) return false;
  // Index could be used for ORDER only if there is no GROUP
  ORDER *list = group_list ? group_list : order;
  bool changed = false;
  for (ORDER *ord = list; ord; ord = ord->next) {  
    li.rewind();
    // 这里判断group-by或者order-by的列是否是表达式或者函数来进行generated column替换。
    if (!(*ord->item)->can_be_substituted_for_gc()) continue;
    while ((gc = li++)) {
      Item_func *tmp = pointer_cast<Item_func *>(*ord->item);
      Item_field *field;
      // 这里会根据表达式与generated column->gcol_info->expr_item进行比较来获取匹配的generated column
      if ((field = get_gc_for_expr(&tmp, gc, gc->result_type()))) {
        changed = true;
        /* Add new field to field list. */
        ord->item = select_lex->add_hidden_item(field);
        break;
      }
    }
  }
  if (changed && trace->is_started()) {
    String str;
    SELECT_LEX::print_order(
        &str, list,
        enum_query_type(QT_TO_SYSTEM_CHARSET | QT_SHOW_SELECT_NUMBER |
                        QT_NO_DEFAULT_DB));
    subst_gc.add_utf8(group_list ? "resulting_GROUP_BY" : "resulting_ORDER_BY",
                      str.ptr(), str.length());
  }
  return changed;
}

综上所述，本篇文章主要从源码层面对MySQL 8.0 实现的Functional index进行了一下简要的分析。Functional index主要依赖于generated column，利用内部隐式的创建一个generated column来辅助创建functional index。代码层面也比较容易理解，希望该篇文章能够帮助广大读者了解MySQL functional index的实现原理。