HBase幾種資料入庫(load)方式比較
1. 預先生成HFile入庫
2. 通過MapReduce入庫
/* MapReduce 讀取hdfs上的檔案,以HTable.put(put)的方式在map中完成資料寫入,無reduce過程*/
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class HBaseImport extends Configured implements Tool{
static final Log LOG = LogFactory.getLog(HBaseImport.class);
public static final String JOBNAME = "MRImport ";
public static class Map extends Mapper<LongWritable , Text, NullWritable, NullWritable>{
Configuration configuration = null;
HTable xTable = null;
private boolean wal = true;
static long count = 0;
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.cleanup(context);
xTable.flushCommits();
xTable.close();
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String all[] = value.toString().split("/t");
If(all.length==2){
put = new Put(Bytes.toBytes(all[0]))); put.add(Bytes.toBytes("xxx"),Bytes.toBytes("20110313"),Bytes.toBytes(all[1]));
}
if (!wal) {
put.setWriteToWAL(false);
}
xTable.put(put);
if ((++count % 100)==0) {
context.setStatus(count +" DOCUMENTS done!");
context.progress();
System.out.println(count +" DOCUMENTS done!");
}
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.setup(context);
configuration = context.getConfiguration();
xTable = new HTable(configuration,"testKang");
xTable.setAutoFlush(false);
xTable.setWriteBufferSize(12*1024*1024);
wal = true;
}
}
@Override
public int run(String[] args) throws Exception {
String input = args[0];
Configuration conf = HBaseConfiguration.create(getConf());
conf.set("hbase.master", "m0:60000");
Job job = new Job(conf,JOBNAME);
job.setJarByClass(HBaseImport.class);
job.setMapperClass(Map.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, input);
job.setOutputFormatClass(NullOutputFormat.class);
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws IOException {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
int res = 1;
try {
res = ToolRunner.run(conf, new HBaseImport (), otherArgs);
} catch (Exception e) {
e.printStackTrace();
}
System.exit(res);
}
}
3. 通過Java程式入庫
/* Java多執行緒讀取本地磁碟上的檔案,以HTable.put(put)的方式完成資料寫入*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
public class InsertContactJava {
// Wall-clock start of the run in seconds since the epoch; assigned in main().
public static long startTime;
public static long rowkey = 0; // starting rowkey
// NOTE(review): not referenced in the visible code; insert_one uses a literal batch threshold.
public static final int lineCount = 100000; // number of rows entered per commit
public static String tableName = "usercontact_kang"; // name of the destination table
public static int countLie = 8; // number of columns in the table
/**
 * Records the start time (epoch seconds), prints it, and starts a single
 * worker thread that loads the file at {@code /run/jar/123} via insert_one.
 * An IOException raised by the load is printed on the worker thread.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared for callers; not thrown by this method itself
 */
public static void main(String[] args) throws IOException {
    startTime = System.currentTimeMillis() / 1000;
    System.out.println("start time = " + startTime);

    // Alternative loaders previously tried at this call site:
    // loadByLieWithVector and loadByLieWithArrayList, with the same path.
    Runnable loadTask = new Runnable() {
        @Override
        public void run() {
            try {
                insert_one("/run/jar/123");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    };
    new Thread(loadTask).start();
}
/**
 * Loads one tab-separated contact file from local disk into the HBase table
 * named by {@code tableName}.
 *
 * <p>Each line is split on tabs into at least three fields: two contact
 * descriptors of the form {@code account~protocolid~domain} and an integer
 * count. Every well-formed line becomes one {@link Put} in the
 * {@code ucvalue} column family, keyed by {@code getIncreasRowKey()}. Puts are
 * buffered and sent in batches. Lines with too few fields are reported on
 * stderr and skipped.
 *
 * @param path local path of the input file
 * @throws IOException if the file cannot be read or a write to HBase fails
 */
public static void insert_one(String path) throws IOException {
    final int batchSize = 10000;
    final byte[] family = "ucvalue".getBytes();

    Configuration conf = HBaseConfiguration.create();
    HTable table = new HTable(conf, tableName);
    int count = 0;
    try {
        ArrayList<Put> list = new ArrayList<Put>();
        BufferedReader br = new BufferedReader(new FileReader(new File(path)));
        try {
            String tmp;
            while ((tmp = br.readLine()) != null) {
                count++;

                String[] arr_value = tmp.split("\t", 10);
                if (arr_value.length < 3) {
                    System.err.println(path + ": skipping line " + count
                            + ", expected at least 3 tab-separated fields");
                    continue;
                }
                String[] first = arr_value[0].split("~", 5);
                String[] second = arr_value[1].split("~", 5);
                if (first.length < 3 || second.length < 3) {
                    System.err.println(path + ": skipping line " + count
                            + ", expected account~protocolid~domain in both contacts");
                    continue;
                }

                String rowname = getIncreasRowKey();
                String inserttime = Utils.getToday("yyyyMMdd");
                // Parsing and re-printing normalises the count (and rejects non-numeric input).
                String timescount = Integer.valueOf(arr_value[2]).toString();

                Put p = new Put(rowname.getBytes());
                p.add(family, "FIRSTACCOUNT".getBytes(), first[0].getBytes());
                p.add(family, "FIRSTDOMAIN".getBytes(), first[2].getBytes());
                p.add(family, "FIRSTPROTOCOLID".getBytes(), first[1].getBytes());
                p.add(family, "INSERTTIME".getBytes(), inserttime.getBytes());
                p.add(family, "SECONDACCOUNT".getBytes(), second[0].getBytes());
                p.add(family, "SECONDDOMAIN".getBytes(), second[2].getBytes());
                p.add(family, "SECONDPROTOCOLID".getBytes(), second[1].getBytes());
                p.add(family, "TIMESCOUNT".getBytes(), timescount.getBytes());
                list.add(p);

                // Flush AFTER adding, so the line that fills the batch is never dropped.
                if (list.size() >= batchSize) {
                    table.put(list);
                    table.flushCommits();
                    list.clear();
                }
            }
        } finally {
            br.close();
        }

        if (!list.isEmpty()) {
            table.put(list);
            table.flushCommits();
        }
    } finally {
        // Release the table even when reading or writing fails part-way.
        table.close();
    }

    System.out.println("total = " + count);
    long endTime = System.currentTimeMillis() / 1000;
    long costTime = endTime - startTime;
    System.out.println("end time = " + endTime);
    System.out.println(path + ": cost time = " + costTime);
}
4. 入庫方式比較
Ø 生成HFile方式:
生成HFile的過程比較慢,生成HFile後寫入hbase非常快,基本上就是hdfs上的mv過程.對於生成HFile方式入庫的時候有一個改進的方案,就是先對資料排序,然後生成HFile。
HFile方式在所有的載入方案裡面是最快的,不過有個前提——資料是第一次匯入,表是空的。如果表中已經有了資料,HFile再匯入到HBase的表中會觸發split操作,最慢的時候這種操作會耗時1小時。
Ø MapReduce方式:
開始會很快,但是由於MapReduce和HBase競爭資源,到一個特定的時間點會變得很慢。
Ø Java程式方式:
多客戶端,多執行緒同時入庫,目前看來是最好的方式,client和regionserver分開,硬碟讀寫分開,瓶頸只在網路和記憶體上。諮詢了一些牛人,大多推薦這種方式,並且一定要多客戶端,多執行緒。關於入庫效率的調優,在我另一篇部落格中有說明。