Skip to content

Commit

Permalink
A new Galera feature that allows retrying of applying of writesets at
Browse files Browse the repository at this point in the history
slave nodes (codership/mysql-wsrep-bugs/MariaDB#1619). Currently, replication
applying stops at the first non-ignored failure that occurs while applying
an event, and the node performs an emergency abort (or starts inconsistency
voting). Some failures, however, can be concurrency-related, and
applying may succeed if the operation is retried at a later time.

This feature introduces a new dynamic global option variable
"wsrep_applier_retry_count" that controls the retry-applying feature:
a zero value disables retrying and a positive value sets the maximum
number of retry attempts. The default value for this option is zero,
which means that this feature is disabled by default.
  • Loading branch information
plampio committed Jan 16, 2025
1 parent 7fcaab7 commit dfedb1a
Show file tree
Hide file tree
Showing 13 changed files with 346 additions and 31 deletions.
1 change: 1 addition & 0 deletions mysql-test/suite/galera/r/galera_defaults.result
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ AND VARIABLE_NAME NOT IN (
ORDER BY VARIABLE_NAME;
VARIABLE_NAME VARIABLE_VALUE
WSREP_ALLOWLIST
WSREP_APPLIER_RETRY_COUNT 0
WSREP_AUTO_INCREMENT_CONTROL ON
WSREP_CERTIFICATION_RULES strict
WSREP_CERTIFY_NONPK ON
Expand Down
73 changes: 73 additions & 0 deletions mysql-test/suite/galera/r/galera_retry_applying.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
connection node_2;
connection node_1;
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
CREATE TABLE t3 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
START TRANSACTION;
INSERT INTO t3 (f1, f2) VALUES (1, 'a');
INSERT INTO t3 (f1, f2) VALUES (2, 'b');
INSERT INTO t3 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
COMMIT;
connection node_2;
SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = "d,apply_event_fail_once:o,/dev/null";
CALL mtr.add_suppression("Event .* Write_rows.* apply failed");
connection node_1;
START TRANSACTION;
UPDATE t3 SET f2 = 'ax' WHERE f1 = 1;
UPDATE t3 SET f2 = 'bx' WHERE f1 = 2;
INSERT INTO t1 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
UPDATE t3 SET f2 = 'cx' WHERE f1 = 3;
UPDATE t3 SET f2 = 'dx' WHERE f1 = 4;
DELETE FROM t3 WHERE f1 = 5;
COMMIT;
connection node_2;
connection node_1;
SELECT COUNT(*) AS expect_3 FROM t1;
expect_3
3
SELECT COUNT(*) AS expect_4 FROM t3;
expect_4
4
connection node_2;
SELECT COUNT(*) AS expect_3 FROM t1;
expect_3
3
SELECT COUNT(*) AS expect_4 FROM t3;
expect_4
4
connection node_1;
DROP TABLE t1;
DROP TABLE t3;
connection node_2;
Shutting down server ...
SET wsrep_on=OFF;
Restarting server ...
connection node_1;
SET wsrep_sync_wait=0;
CREATE TABLE t2 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
connection node_2;
CALL mtr.add_suppression("Event .* Update_rows.* apply failed");
CALL mtr.add_suppression("Inconsistency detected");
CALL mtr.add_suppression("Failed to apply write set:.*");
SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = '';
SET GLOBAL debug_dbug = "d,apply_event_fail_always:o,/dev/null";
connection node_1;
START TRANSACTION;
INSERT INTO t2 (f1, f2) VALUES (1, 'a'), (2, 'b');
COMMIT;
connection node_2;
Shutting down server ...
SET wsrep_on=OFF;
Restarting server ...
connection node_1;
SET wsrep_sync_wait=0;
connection node_2;
SELECT COUNT(*) AS expect_2 FROM t2;
expect_2
2
SET GLOBAL debug_dbug = DEFAULT;
connection node_1;
SET GLOBAL wsrep_applier_retry_count = 0;
SET DEBUG_SYNC = 'RESET';
DROP TABLE t2;
135 changes: 135 additions & 0 deletions mysql-test/suite/galera/t/galera_retry_applying.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#
# Test retrying applying of a transaction
#

--source include/galera_cluster.inc
# Fault injection below is driven through debug_dbug, so the test must be
# skipped on servers built without DBUG support; DEBUG_SYNC is reset during
# cleanup, so the debug sync facility is required as well.
--source include/have_debug.inc
--source include/have_debug_sync.inc

#
# Case 1: Retrying succeeds after one retry event, no error is raised.
#
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));
CREATE TABLE t3 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));

START TRANSACTION;
INSERT INTO t3 (f1, f2) VALUES (1, 'a');
INSERT INTO t3 (f1, f2) VALUES (2, 'b');
INSERT INTO t3 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
COMMIT;

# wait till the insert transaction has been replicated and committed in node_2
--connection node_2
--let $wait_condition = SELECT COUNT(*) > 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3';
--source include/wait_condition.inc
--let $wait_condition = SELECT COUNT(*) > 0 FROM t3;
--source include/wait_condition.inc

SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = "d,apply_event_fail_once:o,/dev/null";
CALL mtr.add_suppression("Event .* Write_rows.* apply failed");

--connection node_1
START TRANSACTION;
UPDATE t3 SET f2 = 'ax' WHERE f1 = 1;
UPDATE t3 SET f2 = 'bx' WHERE f1 = 2;
INSERT INTO t1 (f1, f2) VALUES (3, 'c'), (4, 'd'), (5, 'e');
UPDATE t3 SET f2 = 'cx' WHERE f1 = 3;
UPDATE t3 SET f2 = 'dx' WHERE f1 = 4;
DELETE FROM t3 WHERE f1 = 5;
COMMIT;

# wait till the transaction has been replicated and committed in node_2
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 4 FROM t3;
--source include/wait_condition.inc

--connection node_1
SELECT COUNT(*) AS expect_3 FROM t1;
SELECT COUNT(*) AS expect_4 FROM t3;

--connection node_2
SELECT COUNT(*) AS expect_3 FROM t1;
SELECT COUNT(*) AS expect_4 FROM t3;

#
# Cleanup after Case 1.
#

--connection node_1
DROP TABLE t1;
DROP TABLE t3;

# shutdown node 2 and restart it
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't3';
--source include/wait_condition.inc
--echo Shutting down server ...
SET wsrep_on=OFF;
--source include/shutdown_mysqld.inc
--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat
--echo Restarting server ...
--source include/start_mysqld.inc

# wait till node 2 is back in the cluster
--connection node_1
SET wsrep_sync_wait=0;
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc


#
# Case 2: Slave retries applying of a transaction multiple times. All
# retry attempts fail, and the applying will fail with the expected
# error.
#

CREATE TABLE t2 (f1 INTEGER PRIMARY KEY DEFAULT 0, f2 char(12));

--connection node_2
CALL mtr.add_suppression("Event .* Update_rows.* apply failed");
CALL mtr.add_suppression("Inconsistency detected");
CALL mtr.add_suppression("Failed to apply write set:.*");

SET GLOBAL wsrep_applier_retry_count = 2;
SET GLOBAL debug_dbug = '';
SET GLOBAL debug_dbug = "d,apply_event_fail_always:o,/dev/null";

--connection node_1
START TRANSACTION;
INSERT INTO t2 (f1, f2) VALUES (1, 'a'), (2, 'b');
COMMIT;

# node 2 should crash now, wait for the crash
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc

# restart node 2
--connection node_2
--echo Shutting down server ...
SET wsrep_on=OFF;
--source include/shutdown_mysqld.inc
--source include/wait_until_disconnected.inc
--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat
--echo Restarting server ...
--source include/start_mysqld.inc

# wait till node 2 is back in the cluster
--connection node_1
SET wsrep_sync_wait=0;
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc

--connection node_2
--let $wait_condition = SELECT COUNT(*) = 2 FROM t2;
--source include/wait_condition.inc
SELECT COUNT(*) AS expect_2 FROM t2;
SET GLOBAL debug_dbug = DEFAULT;

#
# Cleanup
#

--connection node_1
SET GLOBAL wsrep_applier_retry_count = 0;
SET DEBUG_SYNC = 'RESET';
DROP TABLE t2;
28 changes: 27 additions & 1 deletion sql/log_event_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4925,7 +4925,33 @@ int Rows_log_event::do_apply_event(rpl_group_info *rgi)
/* remove trigger's tables */
goto err;
}

#ifdef WITH_WSREP
/*
  Test-only fault injection for the applier retry logic.

  Each hook below runs only when its DBUG keyword is enabled (the test
  suite does this with SET GLOBAL debug_dbug) and WSREP(thd) is true.
  A hook reports HA_ERR_LOCK_WAIT_TIMEOUT for this rows event at
  INFORMATION_LEVEL, raises the error, flags thd->is_slave_error and
  jumps to the err label.

  "apply_event_fail_once" additionally issues
  DBUG_SET("-d,apply_event_fail_once") before bailing out, which is
  presumably what limits the failure to a single event.
  "apply_event_fail_always" leaves its keyword untouched, so it keeps
  failing for as long as the keyword stays enabled.
*/
DBUG_EXECUTE_IF("apply_event_fail_once", {
if (WSREP(thd)) {
RPL_TABLE_LIST *ptr= static_cast<RPL_TABLE_LIST*>(rgi->tables_to_lock);
error= HA_ERR_LOCK_WAIT_TIMEOUT;
slave_rows_error_report(
INFORMATION_LEVEL, error, rgi, thd, ptr->table,
get_type_str(), RPL_LOG_NAME, log_pos);
my_error(error, MYF(0));
thd->is_slave_error= 1;
DBUG_SET("-d,apply_event_fail_once");
goto err;
}
};);
/* Same injected failure as above, but the keyword is never cleared here. */
DBUG_EXECUTE_IF("apply_event_fail_always", {
if (WSREP(thd)) {
RPL_TABLE_LIST *ptr= static_cast<RPL_TABLE_LIST*>(rgi->tables_to_lock);
error= HA_ERR_LOCK_WAIT_TIMEOUT;
slave_rows_error_report(
INFORMATION_LEVEL, error, rgi, thd, ptr->table,
get_type_str(), RPL_LOG_NAME, log_pos);
my_error(error, MYF(0));
thd->is_slave_error= 1;
goto err;
}
};);
#endif /* WITH_WSREP */
/*
When the open and locking succeeded, we check all tables to
ensure that they still have the correct type.
Expand Down
6 changes: 6 additions & 0 deletions sql/sys_vars.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6493,6 +6493,12 @@ static Sys_var_charptr Sys_wsrep_allowlist(
READ_ONLY GLOBAL_VAR(wsrep_allowlist), CMD_LINE(REQUIRED_ARG),
DEFAULT(""));

/*
  wsrep_applier_retry_count: global-only system variable backed by the
  wsrep_applier_retry_count variable. It holds the maximum number of
  applier retry attempts; the accepted range is 0..UINT_MAX and the
  default is 0. It is declared without a mutex guard and is not written
  to the binary log.
*/
static Sys_var_uint Sys_wsrep_applier_retry_count (
"wsrep_applier_retry_count", "Maximum number of applier retry attempts",
GLOBAL_VAR(wsrep_applier_retry_count), CMD_LINE(OPT_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(0), BLOCK_SIZE(1),
NO_MUTEX_GUARD, NOT_IN_BINLOG);

#endif /* WITH_WSREP */

static bool fix_host_cache_size(sys_var *, THD *, enum_var_type)
Expand Down
11 changes: 11 additions & 0 deletions sql/transaction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -783,3 +783,14 @@ bool trans_release_savepoint(THD *thd, LEX_CSTRING name)

DBUG_RETURN(MY_TEST(res));
}

#ifdef WITH_WSREP
/**
  Tell whether the current transaction has a savepoint with the given name.

  @param thd   Thread handle passed on to find_savepoint().
  @param name  Savepoint name to look up.

  @return true when the slot returned by find_savepoint() holds a
          non-NULL savepoint, false otherwise.
*/
bool trans_savepoint_exists(THD *thd, LEX_CSTRING name)
{
  SAVEPOINT *const match= *find_savepoint(thd, name);

  return match != NULL;
}
#endif /* WITH_WSREP */

3 changes: 3 additions & 0 deletions sql/transaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ bool trans_rollback_stmt(THD *thd);
bool trans_savepoint(THD *thd, LEX_CSTRING name);
bool trans_rollback_to_savepoint(THD *thd, LEX_CSTRING name);
bool trans_release_savepoint(THD *thd, LEX_CSTRING name);
#ifdef WITH_WSREP
bool trans_savepoint_exists(THD *thd, LEX_CSTRING name);
#endif /* WITH_WSREP */

void trans_reset_one_shot_chistics(THD *thd);

Expand Down
Loading

0 comments on commit dfedb1a

Please sign in to comment.