hadoop mapreduce讀取orcfile的java程式碼示例
ORCFile 在 Hive 0.11 版本後提供支援。相比 RCFile,ORCFile 具有更高的資料壓縮比:即使不啟用任何壓縮演算法,僅憑 ORCFile 的儲存格式,資料量就能縮小一半以上。
下面以hive 0.13版本為例,列舉了mapreduce讀取orcfile的java示例程式碼:
需要引入的 jar 包:hive-common-0.13.1.jar、hive-exec-0.13.1.jar
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
main函式關鍵程式碼:
/**
 * Driver for a map-only MapReduce job that reads ORC files via
 * {@code OrcNewInputFormat} and processes each row in {@code ExtractorMapper}.
 *
 * <p>Usage: {@code hadoop jar <jar> <input-path>}
 *
 * @param args args[0] is the HDFS input path containing the ORC files
 * @throws IOException            on job-submission I/O failure
 * @throws InterruptedException   if the job is interrupted while waiting
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
public static void main(String[] args) throws IOException,
URISyntaxException, InterruptedException, ClassNotFoundException {
    if (args.length < 1) {
        System.err.println("Usage: <input-path>");
        System.exit(2);
    }
    Configuration conf = new Configuration();
    // Job.getInstance(...) replaces the deprecated new Job(conf) constructor.
    Job job = Job.getInstance(conf, "orc-extractor");
    job.setInputFormatClass(OrcNewInputFormat.class);
    // Input path comes from the command line; the original referenced an
    // undefined `inputPath` variable and did not compile.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setMapperClass(ExtractorMapper.class);
    // Map-only job: no reducer is configured, so disable the reduce phase.
    job.setNumReduceTasks(0);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
map實現函式關鍵程式碼:
private static class ExtractorMapper extends
Mapper {
private static final String SCHEMA = "struct<column_name1:string,column_name2:string>"
protected void map(
NullWritable key,
Writable value,
Mapper.Context context)
throws IOException, InterruptedException {
OrcStruct struct = (OrcStruct)value;
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(SCHEMA);
StructObjectInspector inspector = (StructObjectInspector)
OrcStruct.createObjectInspector(typeInfo);
StringBuffer outputKey = new StringBuffer();
outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name1")).toString());
outputKey.append(TAB);
outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name2")).toString());
System.out.println(outputKey.toString());
}