MySQL · 捉蟲動態 · MySQL字符集相關變數介紹及binlog中字符集相關缺陷分析
MySQL字符集相關變數介紹及binlog中字符集相關缺陷分析
MySQL支援多種字符集(character set)提供使用者儲存資料,同時允許用不同排序規則(collation)做比較。
本文基於MySQL5.7介紹了字符集相關變數的使用,通過例子描述了這些變數具體意義。分析了MySQL binlog中字符集相關處理的缺陷,這些缺陷會導致複製中斷或者主備不一致。最後給出了修復這些缺陷的方法。
MySQL字符集相關基礎知識介紹
character_set_system
character_set_system為元資料的字符集,即所有的元資料都使用同一個字符集。試想如果元資料採用不同字符集,INFORMATION_SCHEMA中的相關資訊在不同行之間就很難展示。同時該字符集要能夠支援多種語言,方便不同語言人群使用自己的語言命名database、table、column。MySQL選擇UTF-8作為元資料編碼,用原始碼固定。
sql/mysqld.cc
int mysqld_main(int argc, char **argv)
{
...
system_charset_info= &my_charset_utf8_general_ci;
}
> select @@global.character_set_system;
+-------------------------------+
| @@global.character_set_system |
+-------------------------------+
| utf8 |
+-------------------------------+
MySQL會將identifier轉換為system_charset_info(utf8)。
sql/sql_lex.cc
static int lex_one_token(YYSTYPE *yylval, THD *thd)
{
case MY_LEX_IDENT:
...
lip->body_utf8_append_literal
...
}
void Lex_input_stream::body_utf8_append_literal(THD *thd,
const LEX_STRING *txt,
const CHARSET_INFO *txt_cs,
const char *end_ptr)
{
...
if (!my_charset_same(txt_cs, &my_charset_utf8_general_ci))
{
thd->convert_string(&utf_txt,
&my_charset_utf8_general_ci,
txt->str, txt->length,
txt_cs);
}
else
{
utf_txt.str= txt->str;
utf_txt.length= txt->length;
}
...
}
sql/sql_yacc.yy
IDENT_sys:
IDENT { $$= $1; }
| IDENT_QUOTED
{
THD *thd= YYTHD;
if (thd->charset_is_system_charset)
{
...
}
else
{
if (thd->convert_string(&$$, system_charset_info,
$1.str, $1.length, thd->charset()))
MYSQL_YYABORT;
}
}
;
character_set_server/collation_server
當create database沒有指定charset/collation就會用character_set_server/collation_server,這兩個變數可以動態設定,有session/global級別。
在原始碼中character_set_server/collation_server實際對應一個變數,因為一個collation對應著一個charset,所以原始碼中只記錄CHARSET_INFO結構的collation_server即可。當修改character_set_server,會選擇對應charset的預設collation。對於其他同時有charset和collation的變數,原始碼記錄也都是記錄collation。
static Sys_var_struct Sys_character_set_server(
"character_set_server", "The default character set",
SESSION_VAR(collation_server), NO_CMD_LINE,
offsetof(CHARSET_INFO, csname), DEFAULT(&default_charset_info),
NO_MUTEX_GUARD, IN_BINLOG, ON_CHECK(check_charset_not_null));
static Sys_var_struct Sys_collation_server(
"collation_server", "The server default collation",
SESSION_VAR(collation_server), NO_CMD_LINE,
offsetof(CHARSET_INFO, name), DEFAULT(&default_charset_info),
NO_MUTEX_GUARD, IN_BINLOG, ON_CHECK(check_collation_not_null));
通過下面case可以看到通過設定session中不同的character_set_server使建立database的預設charset和collation不同。
> set character_set_server='utf8';
> create database cs_test1;
> select * from SCHEMATA where SCHEMA_NAME='cs_test1';
+--------------+-------------+----------------------------+------------------------+----------+
| CATALOG_NAME | SCHEMA_NAME | DEFAULT_CHARACTER_SET_NAME | DEFAULT_COLLATION_NAME | SQL_PATH |
+--------------+-------------+----------------------------+------------------------+----------+
| def | cs_test1 | utf8 | utf8_general_ci | NULL |
+--------------+-------------+----------------------------+------------------------+----------+
> set character_set_server='latin1';
> create database cs_test2;
> select * from SCHEMATA where SCHEMA_NAME='cs_test2';
+--------------+-------------+----------------------------+------------------------+----------+
| CATALOG_NAME | SCHEMA_NAME | DEFAULT_CHARACTER_SET_NAME | DEFAULT_COLLATION_NAME | SQL_PATH |
+--------------+-------------+----------------------------+------------------------+----------+
| def | cs_test2 | latin1 | latin1_swedish_ci | NULL |
+--------------+-------------+----------------------------+------------------------+----------+
character_set_database/collation_database
該變數的session級別值表示當前database的charset/collation,在後續版本中該變數可能改為只讀,不建議修改該值。其global級別變數之後也會被移除。
> use cs_test1;
> select @@character_set_database;
+--------------------------+
| @@character_set_database |
+--------------------------+
| utf8 |
+--------------------------+
> use cs_test2;
> select @@character_set_database;
+--------------------------+
| @@character_set_database |
+--------------------------+
| latin1 |
+--------------------------+
character_set_client
客戶端傳送到server的字串使用的字符集,server會按照該變數值來解析客戶端發來的語句。如果指定值和語句實際編碼字符集不符就會解析出錯,報語法錯誤或者得到非預期結果,例如下面的兩個case。
case1:實際使用utf8編碼且包含中文字元,但設定character_set_client為latin1。
> set character_set_client='latin1';
> create table 字符集(c1 varchar(10));
ERROR 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '—符集(c1 varchar(10))' at line 1
> set character_set_client='utf8';
> create table 字符集(c1 varchar(10));
Query OK, 0 rows affected (0.14 sec)
case2:實際使用utf8編碼且包含中文字元,但設定character_set_client為gbk。
> create database cs_test;
> use cs_test;
> set character_set_client='gbk';
> create table 收費(c1 varchar(10));
> show tables;
+-------------------+
| Tables_in_cs_test |
+-------------------+
| 鏀惰垂 |
+-------------------+
> set character_set_client='utf8';
> create table 收費(c1 varchar(10));
> show tables;
+-------------------+
| Tables_in_cs_test |
+-------------------+
| 收費 |
| 鏀惰垂 |
+-------------------+
2 rows in set (0.00 sec)
character_set_connection/collation_connection
沒有指定字符集的常量字串所使用的字符集,例如下面三個case。
case1中collation_connection設定為utf8_general_ci時,比較會忽略大小寫,因此'a' = 'A'結果為1;如果設定為utf8_bin則不忽略大小寫,'a' = 'A'的結果就是0。
case2中當設定character_set_connection為'latin1'的時候,'你好' = '我好'返回結果為1,如果設定為'utf8',返回結果就是0。設定為'latin1'返回結果為1的原因是utf8編碼的中文字元無法轉換為latin1字元,這裡MySQL就把'你好'和'我好'都轉換成了'??'。
case3中character_set_connection的不同導致create table語句中column的實際default value不同。
case1:設定collation_connection是否忽略大小寫導致結果不一致。
> set collation_connection=utf8_general_ci;
> select 'a' = 'A';
+-----------+
| 'a' = 'A' |
+-----------+
| 1 |
+-----------+
> set collation_connection=utf8_bin;
> select 'a' = 'A';
+-----------+
| 'a' = 'A' |
+-----------+
| 0 |
+-----------+
case2:設定character_set_connection不同導致結果不一致。
> set character_set_connection='latin1';
Query OK, 0 rows affected (0.00 sec)
> select '你好' = '我好';
+---------------------+
| '你好' = '我好' |
+---------------------+
| 1 |
+---------------------+
1 row in set, 2 warnings (0.00 sec)
> set character_set_connection='utf8';
Query OK, 0 rows affected (0.00 sec)
> select '你好' = '我好';
+---------------------+
| '你好' = '我好' |
+---------------------+
| 0 |
+---------------------+
> set character_set_connection='latin1';
> select '你好';
+----+
| ?? |
+----+
| ?? |
+----+
case3:設定character_set_connection導致實際default value不同。
> set character_set_connection='utf8';
> create table cs_t(c1 varchar(10) default '你好')charset=utf8;
> insert into cs_t values();
> select * from cs_t;
+--------+
| c1 |
+--------+
| 你好 |
+--------+
> set character_set_connection='latin1';
> create table cs_t1(c1 varchar(10) default '你好')charset=utf8;
> insert into cs_t1 values();
> select * from cs_t1;
+------+
| c1 |
+------+
| ?? |
+------+
character_set_results
查詢結果和錯誤資訊的字符集,server會把返回給客戶端的結果轉換為對應字符集。例如下面case,當設定character_set_results為’latin1’的時候,會導致返回的中文變成’?’。
> set character_set_results='utf8';
> select '你好';
+--------+
| 你好 |
+--------+
| 你好 |
+--------+
> set character_set_results='latin1';
> select '你好';
+----+
| ?? |
+----+
| ?? |
+----+
> create table cs_test(c1 varchar(10)) charset=utf8;
> insert into cs_test values('你好'),('我好');
> select * from cs_test;
+------+
| c1 |
+------+
| ?? |
| ?? |
+------+
> set character_set_results='utf8';
> select * from cs_test;
+--------+
| c1 |
+--------+
| 你好 |
| 我好 |
+--------+
binlog 中字符集相關缺陷
binlog當前字符集相關實現
對於很多DDL語句,binlog都是直接記錄客戶端發來的字串,對於這些語句只要記錄語句執行時候的環境變數就可以在備庫正確執行。binlog中Query_log_event記錄了character_set_client、collation_connection和collation_server,程式碼如下。記錄這三個變數的原因讀者可以參考前面各個變數的介紹case。
int THD::binlog_query(THD::enum_binlog_query_type qtype, const char *query_arg,
size_t query_len, bool is_trans, bool direct,
bool suppress_use, int errcode)
{
...
case THD::STMT_QUERY_TYPE:
/*
The MYSQL_BIN_LOG::write() function will set the STMT_END_F flag and
flush the pending rows event if necessary.
*/
{
Query_log_event qinfo(this, query_arg, query_len, is_trans, direct,
suppress_use, errcode);
/*
Binlog table maps will be irrelevant after a Query_log_event
(they are just removed on the slave side) so after the query
log event is written to the binary log, we pretend that no
table maps were written.
*/
int error= mysql_bin_log.write_event(&qinfo);
binlog_table_maps= 0;
DBUG_RETURN(error);
}
...
}
Query_log_event::Query_log_event(THD* thd_arg, const char* query_arg,
size_t query_length, bool using_trans,
bool immediate, bool suppress_use,
int errcode, bool ignore_cmd_internals)
{
...
int2store(charset, thd_arg->variables.character_set_client->number);
int2store(charset+2, thd_arg->variables.collation_connection->number);
int2store(charset+4, thd_arg->variables.collation_server->number);
...
}
例如前面建立表cs_t1的case我們可以看到binlog如下。
> set character_set_connection='latin1';
> create table cs_t1(c1 varchar(10) default '你好')charset=utf8;
SET TIMESTAMP=1516089074/*!*/;
/*!\C utf8 *//*!*/;
SET @@session.character_set_client=33,@@session.collation_connection=8,@@session.collation_server=8/*!*/;
create table cs_t1(c1 varchar(10) default '你好')charset=utf8
binlog字符集相關缺陷
對於Query_log_event如果記錄的query僅僅是客戶端的輸入,上面記錄字符集變數的方法沒有問題。但如果query是server內部生成或者拼接成的,上面直接從thread中獲取變數值的方法就可能存在問題。
例如下面的testcase,這裡為便於觀察和理解case沒有使用mysql-test方式,後面有mysql-test。這裡主庫執行成功,成功建立了表t和檢視’收費明細表’,但備庫在建立檢視的時候卻報語法錯誤。
用gbk編碼寫如下sql文字
cs_test.sql
use test;
set @@session.character_set_client=gbk;
set @@session.collation_connection=gbk_chinese_ci;
create table t(c1 int);
create view `收費明細表` as select * from t;
在主庫執行
> source path/cs_test.sql;
> set character_set_results='gbk';
> use test;
> show tables;
+----------------+
| Tables_in_test |
+----------------+
| 收費明細表 |
| t |
+----------------+
備庫
> show slave status\G
...
Last_SQL_Errno: 1064
Last_SQL_Error: Error 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '`閺鎯板瀭閺勫海綺忕悰鈺? AS select * from t' at line 1' on query. Default database: 'test'. Query: 'CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `鏀惰垂鏄庣粏琛╜ AS select * from t'
...
缺陷分析,MySQL記錄create view的binlog程式碼如下。由前面基礎知識可以知道對於db、table這些元資料MySQL會先轉換為system_charset_info(utf8)。因此在下面程式碼中append_identifier新增的db name、table name均為utf8編碼的’收費明細表’,但是views->source.str又是client端原始的gbk編碼方式,binlog_query記錄的是thd中的character_set_client。即binlog中的query可能是由system_charset_info和character_set_client兩種編碼方式組成的字串。
sql/sql_view.cc
bool mysql_create_view(THD *thd, TABLE_LIST *views,
enum_view_create_mode mode)
{
...
if (views->db && views->db[0] &&
(thd->db().str == NULL || strcmp(views->db, thd->db().str)))
{
append_identifier(thd, &buff, views->db,
views->db_length);
buff.append('.');
}
append_identifier(thd, &buff, views->table_name,
views->table_name_length);
if (lex->view_list.elements)
{
List_iterator_fast<LEX_STRING> names(lex->view_list);
LEX_STRING *name;
int i;
for (i= 0; (name= names++); i++)
{
buff.append(i ? ", " : "(");
append_identifier(thd, &buff, name->str, name->length);
}
buff.append(')');
}
buff.append(STRING_WITH_LEN(" AS "));
buff.append(views->source.str, views->source.length);
int errcode= query_error_code(thd, TRUE);
thd->add_to_binlog_accessed_dbs(views->db);
if (thd->binlog_query(THD::STMT_QUERY_TYPE,
buff.ptr(), buff.length(), FALSE, FALSE, FALSE, errcode))
res= TRUE;
...
}
在MySQL原始碼中搜索binlog_query還可以找到多處類似的bug,見後面的testcase。
--disable_warnings
--source include/master-slave.inc
--enable_warnings
# case1:建立gbk編碼中文名檢視
create table t(c1 int);
SET @@session.character_set_client=gbk;
set @@session.collation_connection=gbk_chinese_ci;
set @@session.collation_server=utf8_general_ci;
create view `收費明細` as select * from t;
drop view `收費明細`;
show tables;
--sync_slave_with_master
connection slave;
show tables;
connection master;
drop table t;
# case2:建立gbk編碼中文名檢視,且view body中包含中文
connection master;
SET @@session.character_set_client=gbk;
create table 檢視(c1 int);
create view 檢視資訊 as select * from 檢視;
drop view 檢視資訊;
# case3: drop table 語句會是generated by server.
drop table 檢視;
--sync_slave_with_master
# case4:記憶體表,重啟後再次訪問時會生成delete from tableName語句.
connection master;
SET @@session.character_set_client=utf8;
set @@session.collation_connection=utf8_general_ci;
set @@session.collation_server=utf8_general_ci;
create table `收費明細表`(c1 int) engine=memory;
create view tv as select * from `收費明細表`;
--connection slave
-- source include/stop_slave.inc
--let $rpl_server_number= 1
--source include/rpl_restart_server.inc
# access memory table after restarting server cause binlog 'delete from tableName'
connection master;
SET @@session.character_set_client=gbk;
set @@session.collation_connection=gbk_chinese_ci;
set @@session.collation_server=utf8_general_ci;
select * from tv;
--connection slave
-- source include/start_slave.inc
connection master;
--sync_slave_with_master
connection slave;
# case5:中文名的procedure
# procedure with chinese when charset not utf.
connection master;
delimiter $$;
create procedure 收費明細()
begin
select 'hello world';
end $$
delimiter ;$$
drop procedure `收費明細`;
connection master;
SET @@session.character_set_client=utf8;
set @@session.collation_connection=utf8_general_ci;
set @@session.collation_server=utf8_general_ci;
drop view tv;
drop table `收費明細表`;
--sync_slave_with_master
connection slave;
show tables;
# case6: create table like/as
set character_set_client = utf8;
set character_set_connection = utf8;
set character_set_database = utf8;
set character_set_results = utf8;
set character_set_server = utf8;
CREATE TABLE `t1` (
`id` int(11) NOT NULL,
`orderType` char(6) NOT NULL DEFAULT '已建立',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
create temporary table `tm` (c1 varchar(10) default '你好');
show create table t1;
## switch client charset
set character_set_client = latin1;
set character_set_connection = latin1;
set collation_server = utf8_bin;
CREATE TABLE t2 SELECT * FROM t1;
create table t3 like tm;
show create table t2;
show create table t3;
--sync_slave_with_master
connection slave;
show tables;
set character_set_client = utf8;
set character_set_connection = utf8;
set character_set_database = utf8;
set character_set_results = utf8;
set character_set_server = utf8;
show create table t1;
show create table t2;
show create table t3;
connection master;
drop table t1;
drop table t2;
drop table t3;
--sync_slave_with_master
--source include/rpl_end.inc
修復方法
對於create view/create procedure等一個query中包含兩種編碼的語句,可以將system_charset_info編碼的部分轉換為thread中的character_set_client。這裡的轉換需要考慮character_set_client不支援某些utf8字元的問題,當轉換失敗時需要報錯,否則主備會不一致。
對於完全由server生成的query:delete from和drop table語句,其query實際可以理解為system_charset_info,這種語句就可以直接使binlog記錄system_charset_info,而不是character_set_client。
該bug在MariaDB中也存在,可以見MDEV-14249,參考bug連結中的fix diff或者MariaDB的修復。