From 4cff562f3f89d4df03e09233d835d0451bc37cc4 Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Fri, 21 Nov 2025 13:17:19 +0200 Subject: [PATCH] MDEV-38164: Fix the estimates reported by TABLE::key_storage_length() They were based on the maximum possible key tuple length, which can be much larger than the real data size. The return value is used by handler::keyread_time(), which is used to estimate the cost of range access. This could cause range access not to be picked, even if it uses the clustered PK and reads about 8% of the table. The fix is to add KEY::stat_storage_length (next to KEY::rec_per_key) and have the storage engine fill it in handler::info(HA_STATUS_CONST). Currently, only InnoDB fills this based on its internal statistics: index->stat_index_size and ib_table->stat_n_rows. Also changed: - In handler::calculate_costs(), use ha_keyread_clustered_time() when computing clustered PK read cost, not ha_keyread_time(). The fix is OFF by default and enabled by setting FIX_INDEX_LOOKUP_COST flag in @@new_mode. --- mysql-test/main/mysqld--help.result | 2 +- mysql-test/main/optimizer_costs_innodb.result | 42 ++++++++++++++++++ mysql-test/main/optimizer_costs_innodb.test | 44 +++++++++++++++++++ mysql-test/suite/sys_vars/r/new_mode.result | 2 +- .../sys_vars/r/sysvars_server_embedded.result | 2 +- .../r/sysvars_server_notembedded.result | 2 +- sql/multi_range_read.cc | 22 +++++++--- sql/sql_class.h | 3 +- sql/sql_select.cc | 3 ++ sql/structs.h | 6 +++ sql/sys_vars.cc | 5 ++- sql/table.cc | 15 +++++++ sql/table.h | 15 ++++++- storage/innobase/handler/ha_innodb.cc | 22 ++++++++++ 14 files changed, 170 insertions(+), 15 deletions(-) create mode 100644 mysql-test/main/optimizer_costs_innodb.result create mode 100644 mysql-test/main/optimizer_costs_innodb.test diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result index f9ed13e1e17..7d8e9c08e28 100644 --- a/mysql-test/main/mysqld--help.result +++ b/mysql-test/main/mysqld--help.result @@ -744,7 +744,7 @@ The following specify which files/extra groups are read (specified before remain connection before aborting the write --new-mode=name Used to introduce new behavior to existing MariaDB versions. Any combination of: - FIX_INDEX_STATS_FOR_ALL_NULLS + FIX_INDEX_STATS_FOR_ALL_NULLS, FIX_INDEX_LOOKUP_COST Use 'ALL' to set all combinations. --note-verbosity=name Verbosity level for note-warnings given to the user. See diff --git a/mysql-test/main/optimizer_costs_innodb.result b/mysql-test/main/optimizer_costs_innodb.result new file mode 100644 index 00000000000..0e79f9996b1 --- /dev/null +++ b/mysql-test/main/optimizer_costs_innodb.result @@ -0,0 +1,42 @@ +# +# MDEV-38164: Poor cost calculations for index access cause bad query plans for big VARCHARs in 11.x +# +create table t1 ( +pk1 int not null, +pk2 int not null, +filler1 varchar(500), +filler2 varchar(500), +filler3 varchar(500), +filler4 varchar(500), +filler5 varchar(500), +filler6 varchar(500), +filler7 varchar(500), +filler8 varchar(500), +filler9 varchar(500), +fillerA varchar(500), +fillerB varchar(500), +fillerC varchar(500), +fillerD varchar(500), +fillerE varchar(500), +fillerF varchar(500), +primary key(pk1, pk2) +) collate utf8mb4_general_ci engine=innodb; +insert into t1 (pk1, pk2) select +mod(seq, 8), seq +from +seq_1_to_10000; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +# Before the fix: this will use full table scan: +explain select * from t1 where pk1=1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL PRIMARY NULL NULL # 10000 Using where +set @save_new_mode=@@new_mode; +set new_mode=concat(@@new_mode,'FIX_INDEX_LOOKUP_COST'); +explain select * from t1 where pk1=1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ref PRIMARY PRIMARY 4 # 1250 +set new_mode=@save_new_mode; +drop table t1; diff --git a/mysql-test/main/optimizer_costs_innodb.test b/mysql-test/main/optimizer_costs_innodb.test new file mode 100644 index 00000000000..330f1157f65 --- /dev/null +++ b/mysql-test/main/optimizer_costs_innodb.test @@ -0,0 +1,44 @@ +--source include/have_innodb.inc +--source include/have_sequence.inc + +--echo # +--echo # MDEV-38164: Poor cost calculations for index access cause bad query plans for big VARCHARs in 11.x +--echo # +create table t1 ( + pk1 int not null, + pk2 int not null, + filler1 varchar(500), + filler2 varchar(500), + filler3 varchar(500), + filler4 varchar(500), + filler5 varchar(500), + filler6 varchar(500), + filler7 varchar(500), + filler8 varchar(500), + filler9 varchar(500), + fillerA varchar(500), + fillerB varchar(500), + fillerC varchar(500), + fillerD varchar(500), + fillerE varchar(500), + fillerF varchar(500), + primary key(pk1, pk2) +) collate utf8mb4_general_ci engine=innodb; + +insert into t1 (pk1, pk2) select + mod(seq, 8), seq +from + seq_1_to_10000; +analyze table t1; + +--echo # Before the fix: this will use full table scan: +--replace_column 8 # +explain select * from t1 where pk1=1; + +set @save_new_mode=@@new_mode; +set new_mode=concat(@@new_mode,'FIX_INDEX_LOOKUP_COST'); +--replace_column 8 # +explain select * from t1 where pk1=1; + +set new_mode=@save_new_mode; +drop table t1; diff --git a/mysql-test/suite/sys_vars/r/new_mode.result b/mysql-test/suite/sys_vars/r/new_mode.result index d078537a1e1..eeec6544e53 100644 --- a/mysql-test/suite/sys_vars/r/new_mode.result +++ b/mysql-test/suite/sys_vars/r/new_mode.result @@ -25,7 +25,7 @@ ERROR 42000: Variable 'new_mode' can't be set to the value of 'TEST_WARNING3' SET @@session.new_mode = "ALL"; select @@session.new_mode; @@session.new_mode -FIX_INDEX_STATS_FOR_ALL_NULLS +FIX_INDEX_STATS_FOR_ALL_NULLS,FIX_INDEX_LOOKUP_COST SET @@global.new_mode = NULL; ERROR 42000: Variable 'new_mode' can't be set to the value of 'NULL' SET @@global.new_mode = ''; diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result index 34c01434f40..cf27b7294b4 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result @@ -2319,7 +2319,7 @@ VARIABLE_COMMENT Used to introduce new behavior to existing MariaDB versions NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST FIX_INDEX_STATS_FOR_ALL_NULLS +ENUM_VALUE_LIST FIX_INDEX_STATS_FOR_ALL_NULLS,FIX_INDEX_LOOKUP_COST READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME NOTE_VERBOSITY diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result index c898835003e..3a3ea304d71 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result @@ -2529,7 +2529,7 @@ VARIABLE_COMMENT Used to introduce new behavior to existing MariaDB versions NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST FIX_INDEX_STATS_FOR_ALL_NULLS +ENUM_VALUE_LIST FIX_INDEX_STATS_FOR_ALL_NULLS,FIX_INDEX_LOOKUP_COST READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME NOTE_VERBOSITY diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc index 71143965162..d0f85e572a1 100644 --- a/sql/multi_range_read.cc +++ b/sql/multi_range_read.cc @@ -75,9 +75,19 @@ void handler::calculate_costs(Cost_estimate *cost, uint keyno, { /* Clustered index */ io_blocks= unassigned_single_point_ranges; - cost->index_cost= ha_keyread_time(keyno, n_ranges, - total_rows + multi_row_ranges, - io_blocks); + if (TEST_NEW_MODE_FLAG(table->in_use, NEW_MODE_FIX_INDEX_LOOKUP_COST)) + { + cost->index_cost= + ha_keyread_clustered_time(keyno, n_ranges, + total_rows + multi_row_ranges, + io_blocks); + } + else + { + cost->index_cost= ha_keyread_time(keyno, n_ranges, + total_rows + multi_row_ranges, + io_blocks); + } cost->copy_cost= rows2double(total_rows) * ROW_COPY_COST; } /* Adjust io cost to data size */ @@ -178,11 +188,9 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, */ ulonglong unassigned_single_point_ranges= 0; - uint len= table->key_info[keyno].key_length + table->file->ref_length; - if (table->file->is_clustering_key(keyno)) - len= table->s->stored_rec_length; + size_t len= table->key_storage_length(keyno); /* Assume block is 75 % full */ - uint avg_block_records= ((uint) (stats.block_size*3/4))/len + 1; + size_t avg_block_records= ((uint) (stats.block_size*3/4))/len + 1; uint limit= thd->variables.eq_range_index_dive_limit; bool use_statistics_for_eq_range= eq_ranges_exceeds_limit(seq, seq_init_param, diff --git a/sql/sql_class.h b/sql/sql_class.h index a255f952885..066064d2ef5 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -220,7 +220,8 @@ void old_mode_deprecated_warnings(THD *thd, ulonglong v); */ #define NEW_MODE_FIX_INDEX_STATS_FOR_ALL_NULLS (1ULL << 0) -#define NEW_MODE_MAX 1 +#define NEW_MODE_FIX_INDEX_LOOKUP_COST (1ULL << 1) +#define NEW_MODE_MAX 2 /* Definitions above that have transitioned from new behaviour to default */ diff --git a/sql/sql_select.cc b/sql/sql_select.cc index c97241d2b75..8e693ea1734 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -13486,6 +13486,7 @@ static bool create_hj_key_for_table(JOIN *join, JOIN_TAB *join_tab, keyinfo->flags= HA_GENERATED_KEY; keyinfo->is_statistics_from_stat_tables= FALSE; keyinfo->all_nulls_key_parts= 0; + keyinfo->stat_storage_length= 0; keyinfo->name.str= "$hj"; keyinfo->name.length= 3; keyinfo->rec_per_key= (ulong*) thd->calloc(sizeof(ulong)*key_parts); @@ -22482,6 +22483,7 @@ bool Create_tmp_table::finalize(THD *thd, keyinfo->algorithm= HA_KEY_ALG_UNDEF; keyinfo->is_statistics_from_stat_tables= FALSE; keyinfo->all_nulls_key_parts= 0; + keyinfo->stat_storage_length= 0; keyinfo->name= group_key; keyinfo->comment.str= 0; ORDER *cur_group= m_group; @@ -22604,6 +22606,7 @@ bool Create_tmp_table::finalize(THD *thd, keyinfo->algorithm= HA_KEY_ALG_UNDEF; keyinfo->is_statistics_from_stat_tables= FALSE; keyinfo->all_nulls_key_parts= 0; + keyinfo->stat_storage_length= 0; keyinfo->read_stats= NULL; keyinfo->collected_stats= NULL; diff --git a/sql/structs.h b/sql/structs.h index d81666e256e..cad3763b89b 100644 --- a/sql/structs.h +++ b/sql/structs.h @@ -154,6 +154,12 @@ typedef struct st_key { */ ulong *rec_per_key; + /* + Average space index tuple takes on disk, according to the engine's + statistics. 0 if statistics is not available. + */ + size_t stat_storage_length; + /* This structure is used for statistical data on the index that has been read from the statistical table index_stat diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 3d565b01000..96621793567 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -4056,6 +4056,7 @@ static Sys_var_set Sys_old_behavior( static const char *new_mode_all_names[]= { "FIX_INDEX_STATS_FOR_ALL_NULLS", + "FIX_INDEX_LOOKUP_COST", "TEST_WARNING1", // Default from here, See NEW_MODE_MAX "TEST_WARNING2", 0 @@ -4063,8 +4064,8 @@ static const char *new_mode_all_names[]= static int new_mode_hidden_names[] = { - 1, // TEST_WARNING1 - 2, // TEST_WARNING2 + 2, // TEST_WARNING1 + 3, // TEST_WARNING2 -1 // End of list }; diff --git a/sql/table.cc b/sql/table.cc index ff311a87572..3c0a6108bcd 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -8643,6 +8643,7 @@ bool TABLE::add_tmp_key(uint key, uint key_parts, keyinfo->read_stats= NULL; keyinfo->collected_stats= NULL; keyinfo->all_nulls_key_parts= 0; + keyinfo->stat_storage_length= 0; for (i= 0; i < key_parts; i++) { @@ -10932,3 +10933,17 @@ void TABLE::mark_table_for_reopen() DBUG_ASSERT(thd); thd->locked_tables_list.mark_table_for_reopen(this); } + +/* + @brief + Estimate how much space is required to store one index tuple +*/ +size_t TABLE::key_storage_length(uint index) +{ + /* Use storage engine's statistics if it is available */ + if (TEST_NEW_MODE_FLAG(in_use, NEW_MODE_FIX_INDEX_LOOKUP_COST) && + key_info[index].stat_storage_length) + return key_info[index].stat_storage_length; + + return key_storage_length_from_ddl(index); +} diff --git a/sql/table.h b/sql/table.h index 84749b5a1f8..81278b1dcce 100644 --- a/sql/table.h +++ b/sql/table.h @@ -1756,7 +1756,20 @@ public: uint actual_n_key_parts(KEY *keyinfo); ulong actual_key_flags(KEY *keyinfo); int update_virtual_field(Field *vf, bool ignore_warnings); - inline size_t key_storage_length(uint index) + + size_t key_storage_length(uint index); + /* + @brief + Estimate how index tuple takes in storage, based solely on table's DDL + + @detail + This is a conservative number that assumes the value is stored in + KeyTupleFormat (or table->record format for clustered PK), without + endspace compression, etc. + On the other hand, it doesn't account that the storage engine may need + to store transactionIds, etc. + */ + inline size_t key_storage_length_from_ddl(uint index) { if (is_clustering_key(index)) return s->stored_rec_length; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 5a875595479..6f11349d878 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -15003,6 +15003,28 @@ stats_fetch: } KEY* key = &table->key_info[i]; + /* + Provide statistics about how many bytes an index record takes on disk, + on average. + */ + if (ib_table->stat_n_rows && !(key->flags & (HA_FULLTEXT | HA_SPATIAL))) { + /* + Start with total space used by the index divided by number of rows + */ + key->stat_storage_length = (size_t) ((index->stat_index_size * srv_page_size) / + ib_table->stat_n_rows); + + /* + The above can be too large + A) in case of tables with very few rows + B) in case of indexes with partially full pages. + So, clip the above estimate by a conservative estimate we've used + before: + */ + size_t conservative_estimate = table->key_storage_length_from_ddl(i); + if (key->stat_storage_length > conservative_estimate) + key->stat_storage_length = conservative_estimate; + } for (j = 0; j < key->ext_key_parts; j++) {