An example explaining schema evolution with the Avro (row-based) binary data format.
Prerequisites
- OS: Linux (RHEL 7.9)
- Hadoop: Cloudera (CDH 6.1.1)
- Authentication via Kerberos
- OpenJDK 64-Bit 1.8.0_292
Avro stores the data definition (the schema) in a JSON file (.avsc).
Basic structure of an Avro JSON schema file
{
  "type" : "record",
  "name" : "file_name_excluding_extension",
  "doc" : "documentation",
  "fields" : [{
    "name" : "field_name",
    "type" : "datatype",
    "columnName" : "hive_table_column_name"
  },{}],
  "tableName" : "hive_table_name"
}
Original schema: avro_schema_evolution.avsc
{
  "type":"record",
  "name":"avro_schema_evolution",
  "doc":"avro schema evolution",
  "fields":[
    {
      "name":"id",
      "type":"int",
      "columnName":"id"
    },
    {
      "name":"col_a",
      "type":"string",
      "columnName":"col_a"
    }
  ],
  "tableName":"AvroSchemaEvolution"
}
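A quick way to verify that an .avsc file is a valid Avro schema before uploading it is to parse it with the Avro Java API. This is a minimal sketch and not part of the walkthrough itself; it assumes the Avro jar shipped with CDH is on the classpath, and ValidateSchema is just an illustrative class name.
import java.io.File;
import org.apache.avro.Schema;

public class ValidateSchema {
    public static void main(String[] args) throws Exception {
        // Parse the schema file; an invalid schema throws a SchemaParseException here
        Schema schema = new Schema.Parser()
                .parse(new File("/home/<user>/schema/avro_schema_evolution.avsc"));

        // Print the parsed schema and its fields (extra attributes such as
        // columnName/tableName are kept as custom properties and otherwise ignored)
        System.out.println(schema.toString(true));
        System.out.println(schema.getFields());
    }
}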
# Kerberos
$ su - <user>
$ kinit -kt /etc/security/keytabs/<user>.keytab <user>
$ klist
$ hdfs dfs -put -f /home/<user>/schema/avro_schema_evolution.avsc /user/<user>
hive> CREATE DATABASE IF NOT EXISTS avro_schema_evolution LOCATION "/user/hive/databases/avro_schema_evolution.db";
hive> CREATE EXTERNAL TABLE avro_schema_evolution.avro_schema_evolution
STORED AS AVRO
LOCATION '/user/hive/databases/avro_schema_evolution.db'
TBLPROPERTIES ('avro.schema.url' = 'hdfs:///user/<user>/avro_schema_evolution.avsc');
hive> SHOW CREATE TABLE avro_schema_evolution.avro_schema_evolution;
OK
CREATE EXTERNAL TABLE `avro_schema_evolution.avro_schema_evolution`(
`id` int COMMENT '',
`col_a` string COMMENT '')
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
'hdfs://nameservice1/user/hive/databases/avro_schema_evolution.db'
TBLPROPERTIES (
'avro.schema.url'='hdfs:///user/<user>/avro_schema_evolution.avsc',
'transient_lastDdlTime'='1645356091')
hive> INSERT OVERWRITE TABLE avro_schema_evolution.avro_schema_evolution
VALUES (1, 'A1'), (2, 'A2');
hive> SELECT * FROM avro_schema_evolution.avro_schema_evolution;
id col_a
1 A1
2 A2
Updated schema (added new col_b): avro_schema_evolution.avsc
{
  "type":"record",
  "name":"avro_schema_evolution",
  "doc":"avro schema evolution",
  "fields":[
    {
      "name":"id",
      "type":"int",
      "columnName":"id"
    },
    {
      "name":"col_b",
      "type":"string",
      "columnName":"col_b",
      "default":"null"
    },
    {
      "name":"col_a",
      "type":"string",
      "columnName":"col_a"
    }
  ],
  "tableName":"AvroSchemaEvolution"
}
NOTE: the new field col_b declares a default value (here the string "null"); this is what allows records written with the old schema to still be read.
$ hdfs dfs -put -f /home/<user>/schema/new/avro_schema_evolution.avsc /user/<user>
hive> INSERT INTO TABLE avro_schema_evolution.avro_schema_evolution
VALUES (3, 'B3', 'A3'), (4, 'B4', 'A4');
hive> SELECT * FROM avro_schema_evolution.avro_schema_evolution;
id col_b col_a
1 null A1
2 null A2
3 B3 A3
4 B4 A4
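The "null" shown in col_b for rows 1 and 2 is the declared default: those rows were written with the original schema, and Avro schema resolution fills in the missing field when they are read with the updated reader schema. The same resolution can be reproduced outside Hive with the Avro Java API. A minimal sketch, assuming the Avro jar is on the classpath; old_data.avro stands for any Avro container file written with the original schema (for example one of the files Hive produced under the table location), and the class name is illustrative.
import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReadOldDataWithNewSchema {
    public static void main(String[] args) throws Exception {
        // Reader schema: the updated .avsc that adds col_b with a default value
        Schema readerSchema = new Schema.Parser()
                .parse(new File("/home/<user>/schema/new/avro_schema_evolution.avsc"));

        // The writer schema is read from the container file's header, so it is
        // passed as null here; the reader schema drives the resolution.
        GenericDatumReader<GenericRecord> datumReader =
                new GenericDatumReader<>(null, readerSchema);

        try (DataFileReader<GenericRecord> reader =
                     new DataFileReader<>(new File("old_data.avro"), datumReader)) {
            for (GenericRecord record : reader) {
                // Records written without col_b come back with its default value
                System.out.println(record);
            }
        }
    }
}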
Updated schema (renamed col_a -> col_aa via ALIAS): avro_schema_evolution.avsc
{
  "type":"record",
  "name":"avro_schema_evolution",
  "doc":"avro schema evolution",
  "fields":[
    {
      "name":"id",
      "type":"int",
      "columnName":"id"
    },
    {
      "name":"col_b",
      "type":"string",
      "columnName":"col_b",
      "default":"null"
    },
    {
      "name":"col_aa",
      "type":"string",
      "columnName":"col_aa",
      "aliases" : [ "col_a" ]
    }
  ],
  "tableName":"AvroSchemaEvolution"
}
$ hdfs dfs -put -f /home/<user>/schema/new_alias/avro_schema_evolution.avsc /user/<user>
hive> SELECT * FROM avro_schema_evolution.avro_schema_evolution;
id col_b col_aa
1 null A1
2 null A2
3 B3 A3
4 B4 A4
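The data files on HDFS are not rewritten for the rename: the "aliases" entry in the reader schema tells Avro that col_aa was previously named col_a, and schema resolution maps the old field onto the new name at read time. A minimal Java sketch of the same resolution outside Hive, under the same assumptions as above (old_data.avro is again a stand-in for a file written with one of the earlier schemas):
import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReadWithAliasedSchema {
    public static void main(String[] args) throws Exception {
        // Reader schema: col_a renamed to col_aa, with "aliases" : [ "col_a" ]
        Schema readerSchema = new Schema.Parser()
                .parse(new File("/home/<user>/schema/new_alias/avro_schema_evolution.avsc"));

        GenericDatumReader<GenericRecord> datumReader =
                new GenericDatumReader<>(null, readerSchema);

        try (DataFileReader<GenericRecord> reader =
                     new DataFileReader<>(new File("old_data.avro"), datumReader)) {
            for (GenericRecord record : reader) {
                // Values originally written under col_a are returned as col_aa
                System.out.println(record.get("id") + "\t" + record.get("col_aa"));
            }
        }
    }
}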
WARNING: You CANNOT change the datatype of an existing field this way; data already written with the old type would no longer be readable. Add a new column with the new datatype instead!