hadoop深入研究:(十六)——Avro序列化與反序列化
阿新 • • 發佈:2019-02-05
使用avro在很多情況下是對原有系統的改造,框架格式都已經定義好了,我們只能直接用avro對原有資料進行整合。(如果是新建系統,最好還是用avro的datafile,下一章講datafile)
準備工作
將以下schema儲存成檔案StringPair.avsc,放在src/test/resources目錄下(schema內容見本段末尾)。引入最新版本的avro時要注意,最新的avro包為1.7.4,依賴org.codehaus.jackson:jackson-core-asl:1.8.8包,但是maven庫中已經沒有該版本,所以要換成其他版本。
{ "type":"record", "name":"StringPair", "doc":"A pair of strings", "fields":[ {"name":"left","type":"string"}, {"name":"right","type":"string"} ] }
如果你用的是1.0.4版本的hadoop(或者其他版本),它依賴於jackson-mapper-asl;如果與jackson-core-asl版本不一致,就會產生找不到方法等異常,你需要引入相同版本:
<dependency> <groupId>org.codehaus.jackson</groupId> <artifactId>jackson-core-asl</artifactId> <version>1.9.9</version> </dependency>
<dependency> <groupId>org.codehaus.jackson</groupId> <artifactId>jackson-mapper-asl</artifactId> <version>1.9.9</version> </dependency>
generic方式
這一節我們用程式碼講解:
package com.sweetop.styhadoop;
import junit.framework.Assert;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.*;
import org.junit.Test;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
/**
 * Round-trips a record through Avro's binary encoding using the generic
 * mapping, i.e. driven by a schema loaded at runtime rather than by
 * generated classes.
 * <p>
 * Created with IntelliJ IDEA.
 * User: lastsweetop
 * Date: 13-8-5
 * Time: 7:59 PM
 */
public class TestGenericMapping {

    /** Classpath location of the schema the test is driven by. */
    private static final String SCHEMA_RESOURCE = "/StringPair.avsc";

    /**
     * Writes a {@code StringPair} record to an in-memory byte stream, reads it
     * back with the same schema and checks that both fields survived.
     *
     * @throws IOException if the schema cannot be read or encoding/decoding fails
     */
    @Test
    public void test() throws IOException {
        Schema schema = loadSchema();

        // Create a record instance that conforms to the schema.
        GenericRecord datum = new GenericData.Record(schema);
        datum.put("left", "L");
        datum.put("right", "R");

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // The DatumWriter translates the GenericRecord into calls the encoder understands.
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        // The encoder writes the data to the stream. The second argument of
        // binaryEncoder is an encoder to reuse; nothing is reused here, so pass null.
        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        writer.write(datum, encoder);
        encoder.flush();
        out.close();

        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        Decoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
        GenericRecord result = reader.read(null, decoder);

        // The decoded values are compared through toString() because the
        // reader does not hand back java.lang.String instances.
        Assert.assertEquals("L", result.get("left").toString());
        Assert.assertEquals("R", result.get("right").toString());
    }

    /**
     * Loads the schema from {@link #SCHEMA_RESOURCE} on the classpath.
     * <p>
     * The resource stream is always closed, and a missing resource is reported
     * as an assertion failure with the resource name instead of an opaque
     * failure inside the parser.
     *
     * @return the parsed schema
     * @throws IOException if the resource cannot be read or closed
     */
    private Schema loadSchema() throws IOException {
        InputStream in = getClass().getResourceAsStream(SCHEMA_RESOURCE);
        Assert.assertNotNull("schema resource not found on classpath: " + SCHEMA_RESOURCE, in);
        try {
            return new Schema.Parser().parse(in);
        } finally {
            in.close();
        }
    }
}
result.get返回的是Avro的Utf8物件(org.apache.avro.util.Utf8),而不是java.lang.String,需要呼叫toString方法,才能和字串比較。
specific方式
首先使用avro-maven-plugin生成程式碼,pom的配置如下:
<plugin>
<groupId>org.apache.avro</groupId>
<artifactId>avro-maven-plugin</artifactId>
<version>1.7.0</version>
<executions>
<execution>
<id>schemas</id>
<phase>generate-sources</phase>
<goals>
<goal>schema</goal>
</goals>
<configuration>
<includes>
<include>StringPair.avsc</include>
</includes>
<sourceDirectory>src/test/resources</sourceDirectory>
<outputDirectory>${project.build.directory}/generated-sources/java</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
avro-maven-plugin外掛繫結在generate-sources階段,呼叫mvn generate-sources即可生成原始碼,我們來看下生成的原始碼
package com.sweetop.styhadoop;
/**
 * Autogenerated by Avro
 * <p/>
 * DO NOT EDIT DIRECTLY
 */
/** A pair of strings */
@SuppressWarnings("all")
public class StringPair extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {

    /**
     * Schema of this record, parsed once from its JSON definition.
     * NOTE(review): only the 'left' field carries "avro.java.string":"String"
     * while both fields are declared as CharSequence below - confirm this
     * asymmetry is intended.
     */
    public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"StringPair\",\"doc\":\"A pair of strings\",\"fields\":[{\"name\":\"left\",\"type\":\"string\",\"avro.java.string\":\"String\"},{\"name\":\"right\",\"type\":\"string\"}]}");

    /** Value of the 'left' field; deprecated in favour of the accessor methods. */
    @Deprecated
    public java.lang.CharSequence left;

    /** Value of the 'right' field; deprecated in favour of the accessor methods. */
    @Deprecated
    public java.lang.CharSequence right;

    /**
     * Returns the schema shared by all instances of this record.
     */
    public org.apache.avro.Schema getSchema() {
        return SCHEMA$;
    }

    /**
     * Positional field read: 0 is 'left', 1 is 'right'.
     * Used by DatumWriter. Applications should not call.
     *
     * @param field$ zero-based field position
     * @return the current value of the addressed field
     * @throws org.apache.avro.AvroRuntimeException if the position is neither 0 nor 1
     */
    public java.lang.Object get(int field$) {
        if (field$ == 0) {
            return this.left;
        }
        if (field$ == 1) {
            return this.right;
        }
        throw new org.apache.avro.AvroRuntimeException("Bad index");
    }

    /**
     * Positional field write: 0 is 'left', 1 is 'right'.
     * Used by DatumReader. Applications should not call.
     *
     * @param field$ zero-based field position
     * @param value$ new value; cast to CharSequence before being stored
     * @throws org.apache.avro.AvroRuntimeException if the position is neither 0 nor 1
     */
    @SuppressWarnings(value = "unchecked")
    public void put(int field$, java.lang.Object value$) {
        if (field$ == 0) {
            this.left = (java.lang.CharSequence) value$;
        } else if (field$ == 1) {
            this.right = (java.lang.CharSequence) value$;
        } else {
            throw new org.apache.avro.AvroRuntimeException("Bad index");
        }
    }

    /**
     * Gets the value of the 'left' field.
     */
    public java.lang.CharSequence getLeft() {
        return this.left;
    }

    /**
     * Sets the value of the 'left' field.
     *
     * @param value the value to set.
     */
    public void setLeft(java.lang.CharSequence value) {
        left = value;
    }

    /**
     * Gets the value of the 'right' field.
     */
    public java.lang.CharSequence getRight() {
        return this.right;
    }

    /**
     * Sets the value of the 'right' field.
     *
     * @param value the value to set.
     */
    public void setRight(java.lang.CharSequence value) {
        right = value;
    }
}
為了相容之前的版本,生成了一組get、put方法;1.6.0之後生成的程式碼添加了getter/setter方法,還有一個Builder類,沒什麼用,已經被我刪掉。
另外上一篇文章有一點沒講到,就是schema裡的name可以使用名稱空間,如com.sweetop.styhadoop.StringPair,這樣生成的原始碼才會是帶package的。
那我們來看如何使用這個生成的類,以及它和generic方式有什麼不同:
package com.sweetop.styhadoop; import junit.framework.Assert; import org.apache.avro.Schema; import org.apache.avro.io.*; import org.apache.avro.specific.SpecificDatumReader; import org.apache.avro.specific.SpecificDatumWriter; import org.junit.Test; import java.io.ByteArrayOutputStream; import java.io.IOException; /** * Created with IntelliJ IDEA. * User: lastsweetop * Date: 13-8-6 * Time: 下午2:19 * To change this template use File | Settings | File Templates. */ public class TestSprecificMapping { @Test public void test() throws IOException { //因為已經生成StringPair的原始碼,所以不再使用schema了,直接呼叫setter和getter即可 StringPair datum=new StringPair(); datum.setLeft("L"); datum.setRight("R"); ByteArrayOutputStream out=new ByteArrayOutputStream(); //不再需要傳schema了,直接用StringPair作為範型和引數, DatumWriter<StringPair> writer=new SpecificDatumWriter<StringPair>(StringPair.class); Encoder encoder= EncoderFactory.get().binaryEncoder(out,null); writer.write(datum, encoder); encoder.flush(); out.close(); DatumReader<StringPair> reader=new SpecificDatumReader<StringPair>(StringPair.class); Decoder decoder= DecoderFactory.get().binaryDecoder(out.toByteArray(),null); StringPair result=reader.read(null,decoder); Assert.assertEquals("L",result.getLeft().toString()); Assert.assertEquals("R",result.getRight().toString()); } }不同點總結一下,schema->StringPair.class, GenericRecord->StringPair如果我的文章對您有幫助,請用支付寶打賞: