MapReduce程式設計筆記(2)-WordCount程式
阿新 • • 發佈:2021-07-31
一、Hadoop自帶的WordCount程式
1.位置:$HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.3.jar
2.演示命令:hadoop jar hadoop-mapreduce-examples-2.7.3.jar wordcount [輸入檔案路徑] [輸出檔案路徑]
輸入檔案路徑:可以是個檔案,也可以是個目錄,如果是目錄將讀取目錄下所有檔案
二、分析WordCount資料處理的過程
2.1 處理的資料:data.txt
I Love Beijing I Love China I love Shanghai
2.2 處理流程
資料 --> <k1,v1> --> 分詞 --> <k2,v2>
三、開發自己的WordCount程式
3.1需要的jar包
- $HADOOP_HOME/share/hadoop/common/*.jar
- $HADOOP_HOME/share/hadoop/common/lib/*.jar
- $HADOOP_HOME/share/hadoop/mapreduce/*.jar
- $HADOOP_HOME/share/hadoop/mapreduce/lib/*.jar
或者可以直接去官網查詢我們需要的依賴包的配置pom,然後加到專案中。
官網地址:https://mvnrepository.com/
hadoop-common
hadoop-hdfs
hadoop-mapreduce-client-core
hadoop-mapreduce-client-jobclient
hadoop-mapreduce-client-common
3.2需要開發的程式
main主要程式
map程式
reduce程式
3.2.1 main主程式
import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class WordContMain { public static void main(String[] args) throws Exception { //1、建立任務Job,並且指定任務的入口 Job job = Job.getInstance(new Configuration()); job.setJarByClass(WordContMain.class); //2、指定任務的Map,Map的輸出型別 job.setMapperClass(WordCountMapper.class); job.setMapOutputKeyClass(Text.class); //k2 job.setMapOutputValueClass(IntWritable.class); //v2 //3、指定任務的Reduce,Reduce的輸出型別 job.setReducerClass(WordCountReducer.class); job.setOutputKeyClass(Text.class); //k4 job.setOutputValueClass(IntWritable.class); //v4 //4、指定任務的輸入路徑和輸出路徑 FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); //5、執行任務 job.waitForCompletion(true); } }
3.2.2 Map程式
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// k1 v1 k2 v2
// k1 = byte offset of the line, v1 = the line text,
// k2 = word, v2 = the count 1 for each occurrence.
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused across map() calls — the standard Hadoop idiom that avoids
    // allocating a fresh writable object for every word emitted.
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    /**
     * Emits (word, 1) for every whitespace-separated token of one input line.
     *
     * The context links this mapper to its surroundings:
     * upstream = the HDFS input split, downstream = the reducer.
     */
    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // e.g. value1 = "I love Beijing"
        String data = value1.toString();

        // Split on runs of whitespace. The original split(" ") produced
        // empty tokens for consecutive spaces/tabs and counted them as
        // words; a leading space still yields one empty token, skipped below.
        for (String w : data.split("\\s+")) {
            if (w.isEmpty()) {
                continue;
            }
            word.set(w);
            context.write(word, ONE); // k2 = word, v2 = 1
        }
    }
}
3.2.3 Reduce程式
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// k3 v3 k4 v4
// k3 = word, v3 = the partial counts from the mappers,
// k4 = word, v4 = total count.
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value — avoids allocating one IntWritable per distinct
    // word, matching the idiom in Hadoop's own WordCount example.
    private final IntWritable result = new IntWritable();

    /**
     * Sums every partial count for a word and emits (word, total).
     *
     * The context links this reducer to its surroundings:
     * upstream = the map output, downstream = the HDFS output files.
     */
    @Override
    protected void reduce(Text k3, Iterable<IntWritable> v3, Context context)
            throws IOException, InterruptedException {
        // Sum all v3 values for this key.
        int total = 0;
        for (IntWritable v : v3) {
            total += v.get();
        }
        result.set(total);
        context.write(k3, result); // k4 = the word k3, v4 = total
    }
}