MapReduce的兩表join操作優化
注:優化前的分析過程詳見本博的上篇博文
案例
地址(Address)和人員(Person)的一對多關聯
原始資料
地址(Address)資料
id AddreName
1 beijing
2 shanghai
3 guangzhou
人員(Person)資料
1 zhangsan 1
2 lisi 2
3 wangwu 1
4 zhaoliu 3
5 maqi 3
優化前,我們通過構造一個通用的JavaBean來儲存兩張表的屬性。但是我們發現,最後reduce時需要用一個List來快取Person表的記錄(以便和Address地址表的記錄區分開來),這將造成大量的記憶體開銷,所以我們想到重新構造Map輸出的key型別,並在資料進入reduce前自定義group分組操作
優化前程式碼分析
1.自定義JavaBean程式碼
/* * 人員和地址的通用bean */ public class Bean implements WritableComparable<Bean> { private String userNo = ""; private String userName = ""; private String addreNo = ""; private String addreName = ""; private int flag; public Bean(Bean bean) { this.userName = bean.getUserName(); this.userNo = bean.getUserNo(); this.addreName = bean.getAddreName(); this.addreNo = bean.getAddreNo(); this.flag = bean.getFlag(); } public Bean() { super(); // TODO Auto-generated constructor stub } public Bean(String userNo, String userName, String addreNo, String addreName, int flag) { super(); this.userNo = userNo; this.userName = userName; this.addreNo = addreNo; this.addreName = addreName; this.flag = flag; } public String getUserNo() { return userNo; } public void setUserNo(String userNo) { this.userNo = userNo; } public String getUserName() { return userName; } public void setUserName(String userName) { this.userName = userName; } public String getAddreNo() { return addreNo; } public void setAddreNo(String addreNo) { this.addreNo = addreNo; } public String getAddreName() { return addreName; } public void setAddreName(String addreName) { this.addreName = addreName; } public int getFlag() { return flag; } public void setFlag(int flag) { this.flag = flag; } @Override public void write(DataOutput out) throws IOException { out.writeUTF(userNo); out.writeUTF(userName); out.writeUTF(addreNo); out.writeUTF(addreName); out.writeInt(flag); } @Override public void readFields(DataInput in) throws IOException { this.userNo = in.readUTF(); this.userName = in.readUTF(); this.addreNo = in.readUTF(); this.addreName = in.readUTF(); this.flag = in.readInt(); } @Override public int compareTo(Bean arg0) { // TODO Auto-generated method stub return 0; } @Override public String toString() { return "userNo=" + userNo + ", userName=" + userName + ", addreNo=" + addreNo + ", addreName=" + addreName; } }
2.Map操作
public class PersonAddrMap extends Mapper<LongWritable, Text, IntWritable, Bean> { @Override protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, IntWritable, Bean>.Context context) throws IOException, InterruptedException { String line = value.toString(); String str[] = line.split("\t"); if (str.length == 2) { // 地區資訊表 Bean bean = new Bean(); bean.setAddreNo(str[0]); bean.setAddreName(str[1]); bean.setFlag(0); // 0表示地區 context.write(new IntWritable(Integer.parseInt(str[0])), bean); } else {// 人員資訊表 Bean bean = new Bean(); bean.setUserNo(str[0]); bean.setUserName(str[1]); bean.setAddreNo(str[2]); bean.setFlag(1); // 1表示人員表 context.write(new IntWritable(Integer.parseInt(str[2])), bean); } } }
3.reduce操作
/**
 * Reducer of the pre-optimisation reduce-side join.
 *
 * <p>All beans sharing one address id arrive in a single call, in no particular
 * order, so every person bean has to be buffered in a list until the address
 * bean has been seen. That buffer is the memory cost the optimised version
 * removes: if the address were guaranteed to be the first value, each person
 * could be written out immediately.
 *
 * <p>Background on partitioning and shuffle: the partitioner splits the map
 * output according to the number of reducers. Its {@code getPartition} method
 * returns an int in the range 0..(numReducers - 1), which selects the reducer a
 * record is sent to. The default is the hash partitioner, which decides by the
 * key's {@code hashCode()}. The shuffle then sorts each partition by key and
 * groups values with equal keys into one {@code values} iterator, calling the
 * user's reduce method once per group. Keys therefore have to be comparable.
 * A custom key type lets the address record sort ahead of the person records
 * that share its address id.
 */
public class PersonAddreRedu extends
        Reducer<IntWritable, Bean, NullWritable, Text> {

    /**
     * Joins the person records of one address id with that address's name.
     *
     * <p>Inner-join semantics: if the group contains no address record, nothing
     * is written for it.
     *
     * @param key     the address id shared by every value in the group
     * @param values  the address bean (flag 0) and person beans (any other flag)
     * @param context sink for the joined output lines
     */
    @Override
    protected void reduce(IntWritable key, Iterable<Bean> values,
            Reducer<IntWritable, Bean, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        Bean address = null;
        List<Bean> people = new ArrayList<Bean>();

        for (Bean bean : values) {
            // Copies are taken because the framework may reuse the instance
            // handed out by the iterator.
            if (bean.getFlag() == 0) {
                address = new Bean(bean);
            } else {
                people.add(new Bean(bean));
            }
        }

        if (address == null) {
            // No address record for this id: there is nothing to join against.
            return;
        }

        Text line = new Text();
        for (Bean person : people) {
            person.setAddreName(address.getAddreName());
            line.set(person.toString());
            context.write(NullWritable.get(), line);
        }
    }
}
4.job操作
/**
 * Driver for the pre-optimisation join job.
 */
public class PersonAddreMain {

    /**
     * Configures and runs the job, then exits with a status that reflects the
     * job outcome (0 on success, 1 on failure, 2 on bad usage).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: PersonAddreMain <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(PersonAddreMain.class);

        job.setMapperClass(PersonAddrMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Bean.class);

        job.setReducerClass(PersonAddreRedu.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate job failure to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
具體優化分析
在Reduce類的reduce()方法中如果values的第一個元素資訊就是地址Addre的資訊的話,我們就不再需要一個List來快取person資訊了,values後面的全是人員資訊將減少巨大的記憶體空間。
partitioner和shuffle的過程
partitioner的主要功能是根據reduce的數量將map輸出的結果進行分塊,將資料送入到相應的reducer.
所有的partitioner都必須實現partitioner介面並實現getPartition方法,該方法的返回值為int型別,並且取值範圍在0~(numOfReducer-1),
從而能將map的輸出輸入到對應的reducer中,對於某個mapreduce過程,hadoop框架定義了預設的partitioner為HashPartitioner,
該partitioner使用key的hashCode來決定將該key輸送到哪個reducer;
shuffle將每個partitioner輸出的結果根據key進行group以及排序,將具有相同key的value構成一個values的迭代器,並根據key進行排序分別呼叫
開發者定義的reduce方法進行處理,因此mapreduce的所有key必須實現Comparable介面的compareTo()方法,從而能實現兩個key物件的比較
我們需要自定義key的資料結構(shuffle按照key進行排序和分組),使得在addreNo相同的情況下,地址表的key比人員表的key更小,從而排在values的最前面
優化後
1.JavaBean操作
/**
 * Bean shared by person records and address records; used as the map output
 * value in the optimised join.
 *
 * <p>Unlike the earlier version there is no flag field: the record kind is
 * carried by the map output key instead. All string fields default to a single
 * space.
 */
public class Bean implements WritableComparable<Bean> {

    private String userNo = " ";
    private String userName = " ";
    private String addreNo = " ";
    private String addreName = " ";

    /** Creates a bean whose string fields all hold a single space. */
    public Bean() {
    }

    /**
     * Copy constructor.
     *
     * @param bean the bean whose field values are copied
     */
    public Bean(Bean bean) {
        this(bean.getUserNo(), bean.getUserName(), bean.getAddreNo(),
                bean.getAddreName(), 0);
    }

    /**
     * Creates a fully populated bean.
     *
     * @param userNo    person id
     * @param userName  person name
     * @param addreNo   address id (the join column)
     * @param addreName address name
     * @param flag      accepted for signature compatibility only; it is not stored
     */
    public Bean(String userNo, String userName, String addreNo,
            String addreName, int flag) {
        this.userNo = userNo;
        this.userName = userName;
        this.addreNo = addreNo;
        this.addreName = addreName;
    }

    public String getUserNo() {
        return this.userNo;
    }

    public void setUserNo(String userNo) {
        this.userNo = userNo;
    }

    public String getUserName() {
        return this.userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getAddreNo() {
        return this.addreNo;
    }

    public void setAddreNo(String addreNo) {
        this.addreNo = addreNo;
    }

    public String getAddreName() {
        return this.addreName;
    }

    public void setAddreName(String addreName) {
        this.addreName = addreName;
    }

    /** Serialises the four string fields in a fixed order. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.userNo);
        out.writeUTF(this.userName);
        out.writeUTF(this.addreNo);
        out.writeUTF(this.addreName);
    }

    /** Reads the fields back in exactly the order {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.userNo = in.readUTF();
        this.userName = in.readUTF();
        this.addreNo = in.readUTF();
        this.addreName = in.readUTF();
    }

    /**
     * Placeholder ordering: every bean compares as equal to every other bean.
     *
     * @return always 0
     */
    @Override
    public int compareTo(Bean arg0) {
        return 0;
    }

    /** Renders the four fields as {@code name=value} pairs separated by ", ". */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("userNo=").append(this.userNo);
        text.append(", userName=").append(this.userName);
        text.append(", addreNo=").append(this.addreNo);
        text.append(", addreName=").append(this.addreName);
        return text.toString();
    }
}
2.自定義map輸出的key
/**
 * Map output key for the optimised join: the address id plus a marker telling
 * whether the accompanying value is the address record or a person record.
 *
 * <p>{@link #hashCode()} depends on the address id only, so a hash-based
 * partitioner sends the address record and all person records of one address id
 * to the same reducer. {@link #compareTo(BeanKey)} orders by address id first
 * and puts the address record ahead of the person records sharing that id.
 */
public class BeanKey implements WritableComparable<BeanKey> {

    private int addreNo;
    private boolean isPrimary; // true: address record, false: person record

    /** Creates a key with address id 0 that is marked as a person record. */
    public BeanKey() {
    }

    /**
     * @param addreNo   the address id (join column)
     * @param isPrimary {@code true} for an address record, {@code false} for a person record
     */
    public BeanKey(int addreNo, boolean isPrimary) {
        this.addreNo = addreNo;
        this.isPrimary = isPrimary;
    }

    /** Serialises the address id followed by the record-kind marker. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.addreNo);
        out.writeBoolean(this.isPrimary);
    }

    /** Reads the fields back in exactly the order {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.addreNo = in.readInt();
        this.isPrimary = in.readBoolean();
    }

    /**
     * Hash over the address id only, so both record kinds of one address id
     * share a hash value.
     */
    @Override
    public int hashCode() {
        return this.addreNo;
    }

    /**
     * Two keys are equal when both the address id and the record-kind marker
     * match, which is exactly when {@link #compareTo(BeanKey)} returns 0.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof BeanKey)) {
            return false;
        }
        BeanKey other = (BeanKey) obj;
        return this.addreNo == other.addreNo && this.isPrimary == other.isPrimary;
    }

    /**
     * Sort order used by the shuffle: ascending address id, and within one
     * address id the address record before any person record.
     *
     * <p>The ids are compared directly rather than by subtraction, because a
     * difference of two ints can overflow and report the wrong sign.
     *
     * @return -1, 0 or 1
     */
    @Override
    public int compareTo(BeanKey o) {
        int otherNo = o.getAddreNo();
        if (this.addreNo != otherNo) {
            return this.addreNo < otherNo ? -1 : 1;
        }
        if (this.isPrimary == o.isPrimary()) {
            return 0; // same id and same record kind
        }
        // The address record gets the smaller value so it leads the group.
        return this.isPrimary ? -1 : 1;
    }

    public int getAddreNo() {
        return this.addreNo;
    }

    public void setAddreNo(int addreNo) {
        this.addreNo = addreNo;
    }

    public boolean isPrimary() {
        return this.isPrimary;
    }

    public void setPrimary(boolean isPrimary) {
        this.isPrimary = isPrimary;
    }
}
3.重新構造shuffle的group分組
實現Group分組
shuffle的group過程預設使用key(BeanKey)的compareTo()方法
剛才我們新增的自定義的Key沒有辦法將具有相同AddressNo的地址和人員放到同一個group中(因為從compareTo()方法中可以看出他們是不相等的)
所以我們只需要自己定義一個分組比較器(grouping comparator)即可
實現比較器
/**
 * Grouping comparator for the optimised join.
 *
 * <p>{@code BeanKey.compareTo} distinguishes the address key from the person
 * keys of the same address id, so on its own it would never place them in one
 * reduce group. This comparator looks at the address id only, which makes all
 * keys with the same address id count as equal for grouping purposes.
 */
public class PKFKCompartor extends WritableComparator {

    /** Registers {@code BeanKey} as the key type and asks for key instances to be created. */
    protected PKFKCompartor() {
        super(BeanKey.class, true);
    }

    /**
     * Compares two {@code BeanKey}s by address id alone.
     *
     * @return 0 when the ids match, otherwise 1 or -1 for a larger or smaller first id
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final BeanKey left = (BeanKey) a;
        final BeanKey right = (BeanKey) b;
        final int leftNo = left.getAddreNo();
        final int rightNo = right.getAddreNo();
        if (leftNo < rightNo) {
            return -1;
        }
        if (leftNo > rightNo) {
            return 1;
        }
        return 0;
    }
}
4.實現map
/**
 * Mapper of the optimised join: builds the composite key and the value for
 * every input record.
 *
 * <p>Each input line is split on tab characters. A line with exactly two fields
 * is treated as an address record ({@code addreNo, addreName}); any longer line
 * is treated as a person record ({@code userNo, userName, addreNo}). The key
 * carries the numeric address id and whether the record is the address.
 *
 * <p>Lines that cannot be interpreted (fewer than two fields, or a join column
 * that is not an integer, such as a header row) are skipped and counted instead
 * of failing the whole task.
 */
public class PersonAddreMap extends Mapper<LongWritable, Text, BeanKey, Bean> {

    private static final String COUNTER_GROUP = "PersonAddreMap";
    private static final String MALFORMED_RECORDS = "MALFORMED_RECORDS";

    /**
     * Converts one input line into a (BeanKey, Bean) pair.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of the address file or the person file
     * @param context sink for the emitted pair and for the malformed-record counter
     */
    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, BeanKey, Bean>.Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        if (fields.length < 2) {
            // Blank or truncated line: neither an address nor a person record.
            context.getCounter(COUNTER_GROUP, MALFORMED_RECORDS).increment(1);
            return;
        }

        boolean isAddress = fields.length == 2;
        String rawAddreNo = isAddress ? fields[0] : fields[2];

        int addreNo;
        try {
            addreNo = Integer.parseInt(rawAddreNo);
        } catch (NumberFormatException ignored) {
            // Non-numeric join column (e.g. a header line); skip the record.
            context.getCounter(COUNTER_GROUP, MALFORMED_RECORDS).increment(1);
            return;
        }

        Bean record = new Bean();
        record.setAddreNo(rawAddreNo);
        if (isAddress) {
            record.setAddreName(fields[1]);
        } else {
            record.setUserNo(fields[0]);
            record.setUserName(fields[1]);
        }

        BeanKey joinKey = new BeanKey();
        joinKey.setAddreNo(addreNo);
        joinKey.setPrimary(isAddress); // true marks the address record
        context.write(joinKey, record);
    }
}
5.實現reduce
/**
 * Reducer of the optimised join.
 *
 * <p>The job's grouping comparator puts every record of one address id into a
 * single reduce call, and the key ordering places the address record ahead of
 * the person records. Person records can therefore be written out as they are
 * read, without buffering them in a list.
 */
public class PersonAddreRedu extends Reducer<BeanKey, Bean, NullWritable, Text> {

    private static final String COUNTER_GROUP = "PersonAddreRedu";
    private static final String UNMATCHED_PERSONS = "UNMATCHED_PERSONS";

    /**
     * Streams the person records of one address id, stamping each with the
     * address name taken from the address record of the group.
     *
     * <p>Inner-join semantics: a person record that arrives before any address
     * record of its group (i.e. the address id has no address record) is counted
     * and skipped rather than being mistaken for the address.
     *
     * @param key     composite key; with a grouping comparator the framework
     *                updates this object to the key of the value currently being
     *                iterated, so {@code key.isPrimary()} identifies the record kind
     * @param values  the records sharing the key's address id
     * @param context sink for the joined output lines and the unmatched counter
     */
    @Override
    protected void reduce(BeanKey key, Iterable<Bean> values,
            Reducer<BeanKey, Bean, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        String addreName = null;
        Text line = new Text();

        for (Bean bean : values) {
            if (key.isPrimary()) {
                // Address record: remember its name for the persons that follow.
                addreName = bean.getAddreName();
                continue;
            }
            if (addreName == null) {
                // Person record without a matching address record.
                context.getCounter(COUNTER_GROUP, UNMATCHED_PERSONS).increment(1);
                continue;
            }
            bean.setAddreName(addreName);
            line.set(bean.toString());
            context.write(NullWritable.get(), line);
        }
    }
}
6.實現job
/**
 * Driver for the optimised join job.
 */
public class PersonAddreMain {

    /**
     * Configures and runs the job, then exits with a status that reflects the
     * job outcome (0 on success, 1 on failure, 2 on bad usage).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: PersonAddreMain <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(PersonAddreMain.class);

        // Group reduce input by address id only, so the address record and the
        // person records of one id are handled by a single reduce call.
        job.setGroupingComparatorClass(PKFKCompartor.class);

        job.setMapperClass(PersonAddreMap.class);
        job.setMapOutputKeyClass(BeanKey.class);
        job.setMapOutputValueClass(Bean.class);

        job.setReducerClass(PersonAddreRedu.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate job failure to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}