mapreduce典型應用案例之倒排索引
阿新 • • 發佈:2019-01-03
一、倒排索引的介紹
通俗的講,就是根據單詞找到包含這個單詞的所有文件。
二、mapreduce實現框架
1、首先要確定map、reduce、combiner中的key和value是什麼型別
2、然後確定key和value具體是什麼?
Map : key為 單詞+檔名 value為空
combiner : key為單詞 value為“檔名=次數”
reduce: key為單詞 value為相同單詞的“檔名=次數”以逗號拼接而成
三、mapreduce程式碼實現
1、準備資料
a.txt | i love beijing and love china |
---|---|
b.txt | i love beijing and not like New York |
c.txt | i dot like anycity |
d.txt | you like where |
e.txt | love familiy and love china |
2、具體程式碼實現
package com.qyl.master;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop. mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
/**
 * MapReduce job that builds an inverted index: for every word it lists the
 * files containing that word together with the number of occurrences, e.g.
 * {@code love    a.txt=2,b.txt=1}.
 *
 * <p>Data flow (all keys and values are {@link Text}):
 * <ul>
 *   <li>map: {@code (word, "file=1")} for every token of every input line;</li>
 *   <li>combine: {@code (word, "file=n")}, one record per file, counts summed;</li>
 *   <li>reduce: {@code (word, "fileA=n,fileB=m,...")}.</li>
 * </ul>
 *
 * <p>The key is the bare word at every stage and the combiner consumes and
 * produces the same posting format. Hadoop may run a combiner zero, one or
 * several times, and partitioning happens on the map output key, so a combiner
 * must never rewrite the key or change the value format.
 */
public class MyMapReduce {

    /** Separates the file name from its occurrence count inside a posting. */
    private static final char COUNT_SEPARATOR = '=';

    /** Input directory used when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT = "C:\\data";

    /** Output directory used when fewer than two command-line arguments are supplied. */
    private static final String DEFAULT_OUTPUT = "C:\\data\\result";

    /**
     * Sums the occurrence counts of {@code "file=count"} postings per file.
     *
     * <p>The file name is everything before the <em>last</em> separator, so
     * file names that themselves contain {@code '='} are handled.
     *
     * @param postings postings belonging to a single word
     * @return file name to total count, sorted by file name so the output is deterministic
     * @throws IOException if a posting has no file name, no separator or a non-numeric count
     */
    private static Map<String, Integer> sumPerFile(Iterable<Text> postings) throws IOException {
        Map<String, Integer> totals = new TreeMap<String, Integer>();
        for (Text posting : postings) {
            String raw = posting.toString();
            int sep = raw.lastIndexOf(COUNT_SEPARATOR);
            if (sep <= 0) {
                throw new IOException("Malformed posting (expected file=count): " + raw);
            }
            String file = raw.substring(0, sep);
            int count;
            try {
                count = Integer.parseInt(raw.substring(sep + 1));
            } catch (NumberFormatException e) {
                throw new IOException("Malformed posting count: " + raw, e);
            }
            Integer previous = totals.get(file);
            totals.put(file, previous == null ? count : previous + count);
        }
        return totals;
    }

    /**
     * Emits {@code (word, "file=1")} for every whitespace-separated token of a line.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final Text okey = new Text();
        private final Text ovalue = new Text();

        /**
         * Builds the posting once per task: the input split, and therefore the
         * file name, does not change between calls to {@code map}.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            String filename = ((FileSplit) context.getInputSplit()).getPath().getName();
            ovalue.set(filename + COUNT_SEPARATOR + "1");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            for (String word : value.toString().split("\\s+")) {
                // Blank lines and leading whitespace yield empty tokens; they are not words.
                if (word.isEmpty()) {
                    continue;
                }
                okey.set(word);
                context.write(okey, ovalue);
            }
        }
    }

    /**
     * Pre-aggregates postings on the map side: sums the counts per file and
     * emits one {@code (word, "file=n")} record per file. Input and output
     * share the same format, so applying it repeatedly (or not at all) does
     * not change the final result.
     */
    public static class MyCombiner extends Reducer<Text, Text, Text, Text> {
        private final Text ovalue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Map.Entry<String, Integer> entry : sumPerFile(values).entrySet()) {
                ovalue.set(entry.getKey() + COUNT_SEPARATOR + entry.getValue());
                context.write(key, ovalue);
            }
        }
    }

    /**
     * Produces the final index line for a word: all of its per-file totals
     * joined with commas, e.g. {@code a.txt=2,b.txt=1}.
     */
    public static class MyReduce extends Reducer<Text, Text, Text, Text> {
        private final Text ovalue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Map.Entry<String, Integer> entry : sumPerFile(values).entrySet()) {
                if (sb.length() > 0) {
                    sb.append(',');
                }
                sb.append(entry.getKey()).append(COUNT_SEPARATOR).append(entry.getValue());
            }
            ovalue.set(sb.toString());
            context.write(key, ovalue);
        }
    }

    /**
     * Configures and runs the job.
     *
     * <p>An existing output directory is deleted first. The process exits with
     * status 1 if the job fails or cannot be submitted.
     *
     * @param args optional {@code [input [output]]}; missing entries fall back
     *             to {@link #DEFAULT_INPUT} and {@link #DEFAULT_OUTPUT}
     */
    public static void main(String[] args) {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        boolean succeeded = false;
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(MyMapReduce.class);
            job.setMapperClass(MyMapper.class);
            job.setCombinerClass(MyCombiner.class);
            job.setReducerClass(MyReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            FileInputFormat.addInputPath(job, new Path(input));

            // The job refuses to start if the output directory already exists.
            Path outPath = new Path(output);
            FileSystem fs = outPath.getFileSystem(conf);
            if (fs.exists(outPath)) {
                fs.delete(outPath, true);
            }
            FileOutputFormat.setOutputPath(job, outPath);

            succeeded = job.waitForCompletion(true);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (!succeeded) {
            System.exit(1);
        }
    }
}
3、結果
New b.txt=1
York b.txt=1
and b.txt=1,e.txt=1,a.txt=1
anycity c.txt=1
beijing b.txt=1,a.txt=1
china a.txt=1,e.txt=1
dot c.txt=1
familiy e.txt=1
i c.txt=1,a.txt=1,b.txt=1
like b.txt=1,c.txt=1,d.txt=1
love e.txt=2,b.txt=1,a.txt=2
not b.txt=1
where d.txt=1
you d.txt=1