Hbase幾種資料入庫（load）方式比較

阿新 • • 發佈：2019-01-02

1. 預先生成HFile入庫

2. 通過MapReduce入庫

/* MapReduce 讀取hdfs上的檔案，以HTable.put(put)的方式在map中完成資料寫入，無reduce過程*/

import java.io.IOException;

import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import

org.apache.hadoop.hbase.HBaseConfiguration;

import org.apache.hadoop.hbase.client.HTable;

import org.apache.hadoop.hbase.client.Put;

import org.apache.hadoop.hbase.util.Bytes;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import

org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

public class HBaseImport extends Configured implements Tool{

static final Log LOG = LogFactory.getLog(HBaseImport.class);

public static final String JOBNAME = "MRImport ";

public static class Map extends Mapper<LongWritable , Text, NullWritable, NullWritable>{

Configuration configuration = null;

HTable xTable = null;

private boolean wal = true;

static long count = 0;

@Override

protected void cleanup(Context context) throws IOException,

InterruptedException {

// TODO Auto-generated method stub

super.cleanup(context);

xTable.flushCommits();

xTable.close();

}

@Override

protected void map(LongWritable key, Text value, Context context)

throws IOException, InterruptedException {

String all[] = value.toString().split("/t");

If(all.length==2){

put = new Put(Bytes.toBytes(all[0]))); put.add(Bytes.toBytes("xxx"),Bytes.toBytes("20110313"),Bytes.toBytes(all[1]));

}

if (!wal) {

put.setWriteToWAL(false);

}

xTable.put(put);

if ((++count % 100)==0) {

context.setStatus(count +" DOCUMENTS done!");

context.progress();

System.out.println(count +" DOCUMENTS done!");

}

@Override

protected void setup(Context context) throws IOException,

InterruptedException {

// TODO Auto-generated method stub

super.setup(context);

configuration = context.getConfiguration();

xTable = new HTable(configuration,"testKang");

xTable.setAutoFlush(false);

xTable.setWriteBufferSize(12*1024*1024);

wal = true;

}

@Override

public int run(String[] args) throws Exception {

String input = args[0];

Configuration conf = HBaseConfiguration.create(getConf());

conf.set("hbase.master", "m0:60000");

Job job = new Job(conf,JOBNAME);

job.setJarByClass(HBaseImport.class);

job.setMapperClass(Map.class);

job.setNumReduceTasks(0);

job.setInputFormatClass(TextInputFormat.class);

TextInputFormat.setInputPaths(job, input);

job.setOutputFormatClass(NullOutputFormat.class);

return job.waitForCompletion(true)?0:1;

}

public static void main(String[] args) throws IOException {

Configuration conf = new Configuration();

String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

int res = 1;

try {

res = ToolRunner.run(conf, new HBaseImport (), otherArgs);

} catch (Exception e) {

e.printStackTrace();

}

System.exit(res);

}

3. 通過Java程式入庫

/* Java多執行緒讀取本地磁碟上的檔案，以HTable.put(put)的方式完成資料寫入*/

import java.io.BufferedReader;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.hbase.HBaseConfiguration;

import org.apache.hadoop.hbase.client.HTable;

import org.apache.hadoop.hbase.client.Put;

public class InsertContactJava {

public static long startTime;

public static long rowkey = 0; //起始rowkey

public static final int lineCount = 100000; //每次提交時錄入的行數

public static String tableName = "usercontact_kang"; //錄入目的表名

public static int countLie = 8; //表的列數

public static void main(String[] args) throws IOException {

startTime = System.currentTimeMillis() / 1000;

System.out.println("start time = " + startTime);

Thread t1 = new Thread() {

@Override

public void run() {

try {

insert_one("/run/jar/123");

//loadByLieWithVector("/run/jar/123");

//loadByLieWithArrayList("/run/jar/123");

} catch (IOException e) {

e.printStackTrace();

}

};

t1.start();

}

public static void insert_one(String path) throws IOException {

Configuration conf = HBaseConfiguration.create();

HTable table = new HTable(conf, tableName);

File f = new File(path);

ArrayList<Put> list = new ArrayList<Put>();

BufferedReader br = new BufferedReader(new FileReader(f));

String tmp = br.readLine();

int count = 0;

while (tmp != null) {

if (list.size() > 10000) {

table.put(list);

table.flushCommits();

list.clear();

} else {

String arr_value[] = tmp.toString().split("/t", 10);

String first[] = arr_value[0].split("~", 5);

String second[] = arr_value[1].split("~", 5);

String rowname = getIncreasRowKey();

String firstaccount = first[0];

String firstprotocolid = first[1];

String firstdomain = first[2];

String inserttime = Utils.getToday("yyyyMMdd");

String secondaccount = second[0];

String secondprotocolid = second[1];

String seconddomain = second[2];

String timescount = Integer.valueOf(arr_value[2]).toString();

Put p = new Put(rowname.getBytes());

p.add(("ucvalue").getBytes(), "FIRSTACCOUNT".getBytes(),

firstaccount.getBytes());

p.add(("ucvalue").getBytes(), "FIRSTDOMAIN".getBytes(),

firstdomain.getBytes());

p.add(("ucvalue").getBytes(), "FIRSTPROTOCOLID".getBytes(),

firstprotocolid.getBytes());

p.add(("ucvalue").getBytes(), "INSERTTIME".getBytes(),

inserttime.getBytes());

p.add(("ucvalue").getBytes(), "SECONDACCOUNT".getBytes(),

secondaccount.getBytes());

p.add(("ucvalue").getBytes(), "SECONDDOMAIN".getBytes(),

seconddomain.getBytes());

p.add(("ucvalue").getBytes(), "SECONDPROTOCOLID".getBytes(),

secondprotocolid.getBytes());

p.add(("ucvalue").getBytes(), "TIMESCOUNT".getBytes(),

timescount.getBytes());

list.add(p);

}

tmp = br.readLine();

count++;

}

if (list.size() > 0) {

table.put(list);

table.flushCommits();

}

table.close();

System.out.println("total = " + count);

long endTime = System.currentTimeMillis() / 1000;

long costTime = endTime - startTime;

System.out.println("end time = " + endTime);

System.out.println(path + ": cost time = " + costTime);

}

4. 入庫方式比較

Ø 生成HFile方式：

生成HFile的過程比較慢，但生成HFile後寫入hbase非常快，基本上就是hdfs上的mv過程。對於生成HFile方式入庫，有一個改進的方案：先對資料排序，然後再生成HFile。

HFile方式在所有的載入方案裡面是最快的，不過有個前提——資料是第一次匯入，表是空的。如果表中已經有了資料，HFile再匯入到hbase的表中會觸發split操作，最慢的時候這種操作會耗時1小時。

Ø MapReduce方式：

開始會很快，但是由於MapReduce和hbase競爭資源，到某個時間點之後會變得很慢。

Ø Java程式方式：

多客戶端，多執行緒同時入庫，目前看來是最好的方式，client和regionserver分開，硬碟讀寫分開，瓶頸只在網路和記憶體上。諮詢了一些牛人，大多推薦這種方式，並且一定要多客戶端，多執行緒。關於入庫效率的調優，在我另一篇部落格中有說明。

Hbase幾種資料入庫（load）方式比較

1. 預先生成HFile入庫 2. 通過MapReduce入庫 /* MapReduce 讀取hdfs上的檔案，以HTable.put(put)的方式在map中完成資料寫入，無reduce過程*/ import java.io.IOException; import or

Hbase幾種資料入庫方式比較

import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurati

javascript 面向物件的幾種常見寫法（轉）

//定義Circle類，擁有成員變數r，常量PI和計算面積的成員函式area() 文章轉自：http://www.iteye.com/topic/434462 1.工廠方式 var Circle = function() { var obj = new Object

JS中六種資料型別（一）——Undefined

轉載自：http://blog.csdn.net/a2296096931/article/details/51072448 侵刪 JS中有6種資料型別：Undefined、Null、Boolean、Number、String和Object。JS中不支援任何建立自定義型別

JS中六種資料型別（五）——String

String型別用於表示由零或多個16位Unicode字元組成的字元序列，即字串。字串可以由雙引號（“）或單引號（‘）表示，因此下面兩種字串的寫法都是有效的： var firstNam

JS中六種資料型別（六）——Object

ECMAScript中的物件是可變的鍵控集合（即一組資料和功能的集合）。它將很多值聚合在一起，可通過名字訪問這些值。物件也可看做屬性的容器，每個屬性都是一個名/值對。屬性的名字可以是包括空字串在內的任

JS中六種資料型別（四）——Number

Number型別應該是ECMAScript中最令人關注的資料型別了，這種型別使用IEEE754格式來表示整數和浮點數值（浮點數值在某些語言中也被稱為雙精度數值）。為支援各種數值型別，EC

計算幾何與圖形學有關的幾種常用演算法（二）

3.6 用向量的叉積判斷直線段是否有交向量叉積計算的另一個常用用途是直線段求交。求交演算法是計算機圖形學的核心演算法，也是體現速度和穩定性的重要標誌，高效並且穩定的求交演算法是任何一個CAD軟體都必需要重點關注的。求交包含兩層概念，一個是判斷是否相

算法系列之九：計算幾何與圖形學有關的幾種常用演算法（二）

3.6 用向量的叉積判斷直線段是否有交向量叉積計算的另一個常用用途是直線段求交。求交演算法是計算機圖形學的核心演算法，也是體現速度和穩定性的重要標誌，高效並且穩定的求交演算法是任何一個CAD軟體都必需要重點關注的。求交包含兩層概念，一個是判斷是否相交，另一個是

計算幾何與圖形學有關的幾種常用演算法（一）

我的專業是計算機輔助設計（CAD），算是一半機械一半軟體，《計算機圖形學》是必修課，也是我最喜歡的課程。熱衷於用程式碼擺平一切的我幾乎將這本教科書上的每種演算法都實現了一遍，這種重複勞動雖然意義不大，但是收穫很多，特別是丟棄了多年的數學又重新回到了腦袋中，算是最大的收

幾種排序總結（上）——堆排序

堆排序這幾天看了演算法導論的排序部分，作一下總結。堆排序的優點 1)最壞情況下o(nlgn)的時間複雜度 2)就地排序，不用輔助陣列幾種操作（以最大堆為例） 1.保持堆性質這是主要操作，對於節點A[i]，前提是以LEFT(

【微信開發】微信公眾號後臺底部選單欄json資料加入（獲取）方式操作

首先獲取微信公號的開發者ID(AppID) 開發者密碼(AppSecret) 登入公眾號找到以下選項找到獲得access_token的引數根據引數取得access_token URL： https://a

JS幾種資料型別轉換（最全）

一、轉為字串：使用 .toString或者String。 1、 .toString()方法：注意，不可以轉null和underfined //轉為字串-->toString方法 var bool=true; console.log(bool.toString()); //注意，toSt

模運算（附加幾種資料型別的資料範圍判斷）-hdu3123

題目連結：http://acm.hdu.edu.cn/showproblem.php?pid=3123 預備知識點： (a+b)%m=[(a%m)+(b%m)]%m(a*b)%m=[(a%m)*(b%m)]%m所以(0!+1!+2!+.....+n!)%m=[(0!%m+1!%m)%m+(1!%m)*(2