Hadoop 中SequenceFile的簡介
阿新 • • 發佈:2019-02-01
- MapFile – 一個 key-value 對應的查詢資料結構,由資料檔案 /data 和索引檔案 /index 組成。資料檔案中包含所有需要儲存的 key-value 對,按 key 的順序排列;索引檔案包含一部分 key 值,用以指向資料檔案的關鍵位置。
- SetFile – 基於 MapFile 實現,它只有 key,value 為不可變的資料。
- ArrayFile – 也是基於 MapFile 實現,它就像我們使用的陣列一樣,key 值為序列化的數字。
- BloomMapFile – 它在 MapFile 的基礎上增加了一個 /bloom 檔案,包含的是二進位制的過濾表,在每一次寫操作完成時,會更新這個過濾表。
操作
// Read and write files (original section heading: 讀寫檔案)
package com.eric.hadoop.io.sequencefile;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

/**
 * Demo that writes a SequenceFile of (IntWritable, Text) records to
 * {@code ./output.seq} and then reads the same file back, printing every record.
 *
 * @author Eric.sunah 2014-12-10
 */
public class SequenceFileDemo {
    private static final String OPERA_FILE = "./output.seq";

    /**
     * Sample text (an arbitrary snippet taken from the web) used as record values.
     */
    private static final String[] testArray = {
        "<plugin> ",
        " <groupId>org.apache.avro</groupId> ",
        " <artifactId>avro-maven-plugin</artifactId> ",
        " <version>1.7.7</version> ",
        " <executions> ",
        " <execution> ",
        " <phase>generate-sources</phase> ",
        " <goals> ",
        " <goal>schema</goal> ",
        " </goals> ",
        " <configuration> ",
        " <sourceDirectory>${project.basedir}/src/main/avro/</sourceDirectory> ",
        " <outputDirectory>${project.basedir}/src/main/java/</outputDirectory> ",
        " </configuration> ",
        " </execution> ",
        " </executions> ",
        "</plugin> ",
        "<plugin> ",
        " <groupId>org.apache.maven.plugins</groupId> ",
        " <artifactId>maven-compiler-plugin</artifactId> ",
        " <configuration> ",
        " <source>1.6</source> ",
        " <target>1.6</target> ",
        " </configuration> ",
        "</plugin>"};

    /**
     * Writes the demo file and then reads it back.
     *
     * @param args unused
     * @throws IOException if writing or reading the file fails
     */
    public static void main(String[] args) throws IOException {
        writeSequenceFile(OPERA_FILE);
        readSequenceFile(OPERA_FILE);
    }

    /**
     * Reads every record of a SequenceFile and prints, per record, the reader
     * position before the record, a sync marker flag, the key and the value.
     *
     * @param inputFile path (or URI) of the SequenceFile to read
     * @throws IOException if the file cannot be opened, read or closed
     */
    private static void readSequenceFile(String inputFile) throws IOException {
        Configuration config = new Configuration();
        Path path = new Path(inputFile);
        FileSystem fs = FileSystem.get(URI.create(inputFile), config);

        // try-with-resources closes the reader and lets a close() failure propagate
        // instead of silently discarding it.
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, config)) {
            IntWritable key = new IntWritable();
            Text value = new Text();
            long position = reader.getPosition();

            // next() returns a boolean: true while another record was read,
            // false once the end of the file has been reached.
            while (reader.next(key, value)) {
                // Flag records for which the reader reports having seen a sync point.
                String syncMark = reader.syncSeen() ? "*" : "";
                System.out.printf("[%s\t%s]\t%s\t%s\n", position, syncMark, key, value);
                // Remember where the next record starts before reading it.
                position = reader.getPosition();
            }
        }
    }

    /**
     * Writes a SequenceFile containing 1999 records whose keys count down from
     * 1999 to 1 and whose values cycle through {@code testArray}. Each record is
     * also printed together with the writer length before the append.
     *
     * @param outputFile path (or URI) of the SequenceFile to create
     * @throws IOException if the file cannot be created, written or closed
     */
    private static void writeSequenceFile(String outputFile) throws IOException {
        Configuration config = new Configuration();
        Path path = new Path(outputFile);
        IntWritable key = new IntWritable();
        Text value = new Text();
        FileSystem fs = FileSystem.get(URI.create(outputFile), config);

        // A failed close() on a writer can mean lost data, so it must not be
        // swallowed; try-with-resources rethrows it to the caller.
        try (SequenceFile.Writer writer =
                SequenceFile.createWriter(fs, config, path, key.getClass(), value.getClass())) {
            for (int i = 1; i < 2000; i++) {
                key.set(2000 - i);
                value.set(testArray[i % testArray.length]);
                System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
                // Records are written with append().
                writer.append(key, value);
            }
        }
    }
}