1. 程式人生 > >Hadoop 中SequenceFile的簡介

Hadoop 中SequenceFile的簡介

  1. MapFile – 一個 key-value 對應的查詢資料結構,由資料檔案 /data 和索引檔案 /index 組成。資料檔案中包含所有需要儲存的 key-value 對,按 key 的順序排列;索引檔案包含一部分 key 值,用以指向資料檔案的關鍵位置。
  2. SetFile – 基於 MapFile 實現,它只有 key,value 為不可變的資料。
  3. ArrayFile – 也是基於 MapFile 實現,它就像我們使用的陣列一樣,key 值為序列化的數字。
  4. BloomMapFile – 它在 MapFile 的基礎上增加了一個 /bloom 檔案,包含的是二進位制的過濾表,在每一次寫操作完成時,會更新這個過濾表。

操作

讀寫檔案
package com.eric.hadoop.io.sequencefile;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
/**
 * Demonstrates writing a Hadoop {@link SequenceFile} of {@link IntWritable} keys and
 * {@link Text} values, then reading the same file back and printing every record
 * together with the reader position at which it starts.
 *
 * @author Eric.sunah 2014-12-10
 */
public class SequenceFileDemo {

    /** Location of the sequence file that {@link #main} writes and then reads back. */
    private static final String OPERA_FILE = "./output.seq";

    /**
     * Sample text lines (an arbitrary snippet taken from the web) that are cycled
     * through as record values. Constant data, hence {@code final}.
     */
    private static final String[] TEST_ARRAY = {
        "<plugin>                                                                     ",
        "  <groupId>org.apache.avro</groupId>                                         ",
        "  <artifactId>avro-maven-plugin</artifactId>                                 ",
        "  <version>1.7.7</version>                                                   ",
        "  <executions>                                                               ",
        "    <execution>                                                              ",
        "      <phase>generate-sources</phase>                                        ",
        "      <goals>                                                                ",
        "        <goal>schema</goal>                                                  ",
        "      </goals>                                                               ",
        "      <configuration>                                                        ",
        "        <sourceDirectory>${project.basedir}/src/main/avro/</sourceDirectory> ",
        "        <outputDirectory>${project.basedir}/src/main/java/</outputDirectory> ",
        "      </configuration>                                                       ",
        "    </execution>                                                             ",
        "  </executions>                                                              ",
        "</plugin>                                                                    ",
        "<plugin>                                                                     ",
        "  <groupId>org.apache.maven.plugins</groupId>                                ",
        "  <artifactId>maven-compiler-plugin</artifactId>                             ",
        "  <configuration>                                                            ",
        "    <source>1.6</source>                                                     ",
        "    <target>1.6</target>                                                     ",
        "  </configuration>                                                           ",
        "</plugin>"
    };

    /**
     * Writes the demo sequence file and then reads it back.
     *
     * @param args ignored
     * @throws IOException if writing or reading the sequence file fails
     */
    public static void main(String[] args) throws IOException {
        writeSequenceFile(OPERA_FILE);
        readSequenceFile(OPERA_FILE);
    }

    /**
     * Reads every {@code IntWritable}/{@code Text} record of a sequence file and prints
     * one line per record: the reader position before the record was read, a {@code *}
     * if a sync marker was seen while reading it, the key and the value.
     *
     * @param inputFile path or URI of the sequence file to read
     * @throws IOException if the file cannot be opened or read
     */
    private static void readSequenceFile(String inputFile) throws IOException {
        Configuration config = new Configuration();
        Path path = new Path(inputFile);
        SequenceFile.Reader reader = null;
        try {
            FileSystem fs = FileSystem.get(URI.create(inputFile), config);
            reader = new SequenceFile.Reader(fs, path, config);
            IntWritable key = new IntWritable();
            Text value = new Text();
            // Position of the record about to be read; captured before each next() so
            // the printed offset is where the record starts, not where it ends.
            long position = reader.getPosition();
            // next(key, value) fills key/value and returns true while records remain;
            // it returns false once the end of the file has been reached.
            while (reader.next(key, value)) {
                // Mark records whose read passed a sync point.
                String syncMark = reader.syncSeen() ? "*" : "";
                System.out.printf("[%s\t%s]\t%s\t%s\n", position, syncMark, key, value);
                position = reader.getPosition();
            }
        } finally {
            // Best-effort close: nothing is lost if closing a reader fails.
            IOUtils.closeStream(reader);
        }
    }

    /**
     * Writes 1999 records to a sequence file. Keys count down from 1999 to 1 and values
     * cycle through {@link #TEST_ARRAY}. Before each append, the current output length,
     * the key and the value are printed.
     *
     * @param outputFile path or URI of the sequence file to create
     * @throws IOException if the file cannot be created, written, or closed
     */
    private static void writeSequenceFile(String outputFile) throws IOException {
        Configuration config = new Configuration();
        Path path = new Path(outputFile);
        IntWritable key = new IntWritable();
        Text value = new Text();
        SequenceFile.Writer writer = null;
        try {
            FileSystem fs = FileSystem.get(URI.create(outputFile), config);
            writer = SequenceFile.createWriter(fs, config, path, key.getClass(), value.getClass());
            for (int i = 1; i < 2000; i++) {
                key.set(2000 - i);
                value.set(TEST_ARRAY[i % TEST_ARRAY.length]);
                System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
                // Records are added with append().
                writer.append(key, value);
            }
            // Close explicitly on the success path so that a failure while closing the
            // output is reported to the caller instead of being swallowed by
            // IOUtils.closeStream below.
            writer.close();
            writer = null;
        } finally {
            // Only reached with a non-null writer when something above threw; in that
            // case close quietly so the original exception is the one that propagates.
            IOUtils.closeStream(writer);
        }
    }
}