
Java - Hive - reading and writing ORC files

Reading an ORC file

    @Test
    public void readOrc() throws IOException {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path("/tmp/Orc.orc"),
                OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
            System.out.println(batch.toString());
        }
        rows.close();
    }
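
Printing batch.toString() is mostly useful as a quick sanity check. To actually consume the values, the usual pattern is to cast each entry of batch.cols to its concrete vector type and index it by row. A minimal sketch (a hypothetical readOrcTyped test, using the same imports and assuming /tmp/Orc.orc holds the struct<x:int,y:int> layout written below):

    @Test
    public void readOrcTyped() throws IOException {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path("/tmp/Orc.orc"),
                OrcFile.readerOptions(conf));
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
            // Cast the column vectors once per batch, then read them row by row
            LongColumnVector x = (LongColumnVector) batch.cols[0];
            LongColumnVector y = (LongColumnVector) batch.cols[1];
            for (int row = 0; row < batch.size; row++) {
                System.out.println(x.vector[row] + "," + y.vector[row]);
            }
        }
        rows.close();
    }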

Writing an ORC file - a single row

    @Test
    public void writeLine3() throws IOException {
        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
        Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
                OrcFile.writerOptions(conf)
                        .setSchema(schema));
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        LongColumnVector y = (LongColumnVector) batch.cols[1];
        int row = batch.size++;
        x.vector[row] = 2;
        y.vector[row] = 2 * 3;
        if (batch.size != 0) {
            writer.addRowBatch(batch);
            batch.reset();
        }
        writer.close();
    }
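
Note that schema.createRowBatch() allocates room for 1024 rows by default, and batch.size++ simply claims the next free slot; nothing reaches disk until addRowBatch is called. Also, if /tmp/Orc.orc already exists from an earlier run, OrcFile.createWriter will typically fail unless the writer options include .overwrite(true), as the multi-row example below does.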

Writing an ORC file - multiple rows

    @Test
    public void writeLine2() throws IOException {
        String[] lines = new String[]{"1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd"};
//        String[] lines = new String[]{"1,2,4", "1,2,3", "1,2,3", "1,2,3", "1,2,3", "1,2,3", "1,2,3", "1,2,3"};


        Configuration conf = new Configuration();
        TypeDescription schema = TypeDescription.fromString("struct<field1:string,field2:string,field3:string>");
//        TypeDescription schema = TypeDescription.fromString("struct<field1:int,field2:int,field3:int>");
        Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
                OrcFile.writerOptions(conf)
                        .setSchema(schema).overwrite(true));
        VectorizedRowBatch batch = schema.createRowBatch();
        List<ColumnVector> columnVectors = new ArrayList<>();

        for (int i = 0; i < batch.numCols; i++) {
            columnVectors.add(batch.cols[i]);
        }

        for (String line : lines) {
            String[] columns = line.split(",");
            System.out.println(batch.size);
            int row = batch.size++;
            for (int i = 0; i < columns.length; i++) {
                switch (columnVectors.get(i).getClass().getName()) {
                    case "org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector":
                        BytesColumnVector bytesColumnVector = BytesColumnVector.class.cast(columnVectors.get(i));
                        bytesColumnVector.setVal(row, columns[i].getBytes(), 0, columns[i].getBytes().length);
                        break;
                    case "org.apache.hadoop.hive.ql.exec.vector.LongColumnVector":
                        LongColumnVector longColumnVector = LongColumnVector.class.cast(columnVectors.get(i));
                        longColumnVector.vector[row] = Long.parseLong(columns[i]);
                        break;
                }
            }
            // Flush the batch only after the whole row has been filled in,
            // otherwise a reset mid-row would drop the remaining columns
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        if (batch.size != 0) {
            writer.addRowBatch(batch);
            batch.reset();
        }
        writer.close();

    }
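
Dispatching on the fully qualified class name works, but an instanceof check is less brittle if the vector classes are ever relocated or subclassed. A sketch of the same column-filling step, pulled into a hypothetical setColumn helper:

    private void setColumn(ColumnVector vector, int row, String value) {
        // Same two cases as the switch above, keyed on the runtime type
        if (vector instanceof BytesColumnVector) {
            byte[] bytes = value.getBytes();
            ((BytesColumnVector) vector).setVal(row, bytes, 0, bytes.length);
        } else if (vector instanceof LongColumnVector) {
            ((LongColumnVector) vector).vector[row] = Long.parseLong(value);
        }
    }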

Imports

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.*;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
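
These imports typically come from the org.apache.orc:orc-core artifact (the vector classes under org.apache.hadoop.hive.ql.exec.vector live in org.apache.hive:hive-storage-api, which orc-core pulls in transitively) plus org.apache.hadoop:hadoop-common for Configuration and Path, and JUnit for @Test; the exact versions depend on your Hadoop/Hive distribution.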