Configuring an Eclipse Development Environment for Hadoop 2.7.2
First install and start Hadoop; see the earlier post at http://www.cnblogs.com/wuxun1997/p/6847950.html for how. This post covers setting up the IDE for writing Hadoop code. Make sure Eclipse is installed locally; then all you need is the Hadoop plugin for Eclipse. In detail:
1. Download the Eclipse plugin from http://download.csdn.net/detail/wuxun1997/9841487 and drop it into Eclipse's plugins directory, then restart Eclipse; DFS Locations appears in the Project Explorer;
2. Click Window -> Preferences -> Hadoop Map/Reduce, fill in D:\hadoop-2.7.2, and click OK;
3. Click Window -> Show View -> Map/Reduce Locations (under MapReduce Tools), then click the small elephant icon with a + sign in the corner ("New hadoop location"). Eclipse pre-fills default parameters, but the following need to be changed to match the two configuration files core-site.xml and hdfs-site.xml from the earlier post (a quick connection check in code follows this list):
General -> Map/Reduce(V2) Master -> Port: change to 9001
General -> DFS Master -> Port: change to 9000
Advanced parameters -> dfs.datanode.data.dir: change to file:/hadoop/data/dfs/datanode
Advanced parameters -> dfs.namenode.name.dir: change to file:/hadoop/data/dfs/namenode
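These values simply mirror what is in core-site.xml and hdfs-site.xml. If you want to confirm the DFS Master port is right before going further, here is a minimal sketch (the class name ConnectionCheck is my own; it assumes the pseudo-distributed setup from the earlier post, i.e. fs.defaultFS = hdfs://localhost:9000):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ConnectionCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // must match the DFS Master host/port entered in the plugin dialog
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        FileSystem fs = FileSystem.get(conf);
        // an actual RPC call, so it fails fast if the port is wrong
        System.out.println("HDFS root exists: " + fs.exists(new Path("/")));
        fs.close();
    }
}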
4. After clicking Finish, expand the triangle icon to the left of the entry under DFS Locations; the HDFS folders appear, and you can work on HDFS directly from here. Right-click a folder icon and choose "Create new directory" to add a directory, then right-click again and choose "Refresh" to see the result; the new directory also shows up under localhost:50070 -> Utilities -> Browse the file system;
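Everything DFS Locations does can also be done in code. A small sketch of the equivalent of "Create new directory" plus "Refresh" (the class name HdfsOps is my own, and /demo is just an example path):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsOps {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(new Path("/demo"));            // same effect as "Create new directory"
        for (FileStatus st : fs.listStatus(new Path("/"))) {
            System.out.println(st.getPath());    // what "Refresh" re-reads
        }
        fs.close();
    }
}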
5. Create a Hadoop project: File -> New -> Project -> Map/Reduce Project -> Next -> enter a project name of your choice, e.g. hadoop, and click Finish.
6. The code here demonstrates the classic word-count example with Chinese word segmentation: it counts the character names in a Chinese novel and sorts them in descending order of frequency. Segmentation needs one extra jar, downloadable from http://download.csdn.net/detail/wuxun1997/9841659. The project structure is as follows:
hadoop
|--src
|--com.wulinfeng.hadoop.wordsplit
|--WordSplit.java
|--IKAnalyzer.cfg.xml
|--myext.dic
|--mystopword.dic
WordSplit.java
package com.wulinfeng.hadoop.wordsplit;

import java.io.IOException;
import java.io.StringReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.InverseMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class WordSplit {

    /**
     * Map: tokenize each line with IK Analyzer.
     *
     * @author Administrator
     */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            StringReader input = new StringReader(value.toString());
            IKSegmenter ikSeg = new IKSegmenter(input, true); // smart segmentation
            for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) {
                this.word.set(lexeme.getLexemeText());
                context.write(this.word, one);
            }
        }
    }

    /**
     * Reduce: sum the counts for each token.
     *
     * @author Administrator
     */
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            this.result.set(sum);
            context.write(key, this.result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String inputFile = "/input/people.txt"; // input file
        Path outDir = new Path("/out"); // output directory
        Path tempDir = new Path("/tmp" + System.currentTimeMillis()); // temp directory

        // First job: tokenize and count
        System.out.println("start task...");
        Job job = Job.getInstance(conf, "word split");
        job.setJarByClass(WordSplit.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, tempDir);

        // The first job's output becomes the second job's input; start the sort job
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        if (job.waitForCompletion(true)) {
            System.out.println("start sort...");
            Job sortJob = Job.getInstance(conf, "word sort");
            sortJob.setJarByClass(WordSplit.class);
            sortJob.setMapperClass(InverseMapper.class);
            sortJob.setInputFormatClass(SequenceFileInputFormat.class);

            // Invert map keys and values so counts become keys, sorted in descending order
            sortJob.setMapOutputKeyClass(IntWritable.class);
            sortJob.setMapOutputValueClass(Text.class);
            sortJob.setSortComparatorClass(IntWritableDecreasingComparator.class);
            sortJob.setNumReduceTasks(1);

            // Write the result to the out directory
            sortJob.setOutputKeyClass(IntWritable.class);
            sortJob.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(sortJob, tempDir);

            // If the out directory already exists, delete it before recreating it
            FileSystem fileSystem = outDir.getFileSystem(conf);
            if (fileSystem.exists(outDir)) {
                fileSystem.delete(outDir, true);
            }
            FileOutputFormat.setOutputPath(sortJob, outDir);

            if (sortJob.waitForCompletion(true)) {
                System.out.println("finish and quit....");
                // Remove the temp directory
                fileSystem = tempDir.getFileSystem(conf);
                if (fileSystem.exists(tempDir)) {
                    fileSystem.delete(tempDir, true);
                }
                System.exit(0);
            }
        }
    }

    /**
     * Comparator that sorts IntWritable keys in descending order.
     *
     * @author Administrator
     */
    private static class IntWritableDecreasingComparator extends IntWritable.Comparator {
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }

        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }
}
IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- Configure your own extension dictionary here -->
    <entry key="ext_dict">myext.dic</entry>
    <!-- Configure your own extension stopword dictionary here -->
    <entry key="ext_stopwords">mystopword.dic</entry>
</properties>
myext.dic
高育良
祁同偉
陳海
陳巖石
侯亮平
高小琴
沙瑞金
李達康
蔡成功
mystopword.dic
你
我
他
是
的
了
啊
說
也
和
在
就
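Before running the full job, you can sanity-check the two dictionaries with the segmenter alone. A standalone sketch (the class name IKDemo is my own; it assumes the IK Analyzer jar plus IKAnalyzer.cfg.xml and the two .dic files are on the classpath, which they are when placed at the src root as in the project structure above):
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKDemo {
    public static void main(String[] args) throws Exception {
        // 侯亮平 is listed in myext.dic, so smart segmentation should keep the name whole
        StringReader input = new StringReader("侯亮平說這是人民的名義");
        IKSegmenter seg = new IKSegmenter(input, true); // true = smart mode, same as the mapper
        for (Lexeme lex = seg.next(); lex != null; lex = seg.next()) {
            System.out.println(lex.getLexemeText());
        }
    }
}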
Now run the WordSplit class right from Eclipse: right-click it and choose Run As -> Run on Hadoop. Because the input file path is hard-coded in the class, create an input directory on the D: drive and put a novel named people.txt inside; mine was the hit TV tie-in novel 《人民的名義》 (In the Name of the People), downloaded from the web. For segmentation to work, open people.txt in Notepad++ and choose Encoding -> Encode in UTF-8 without BOM. Put the names you do not want split apart into myext.dic, and the predicates and particles you want filtered out into mystopword.dic. When the run finishes, open part-r-00000 under D:\out to find out who the protagonist is.
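If you would rather not open the result file by hand, here is a tiny sketch that prints the top entries (the class name TopWords is my own; it assumes the run used the local file system, so /out landed in D:\out as described above):
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public class TopWords {
    public static void main(String[] args) throws Exception {
        // each line is "count<TAB>word"; the sort job wrote them in descending order of count
        try (BufferedReader r = new BufferedReader(new InputStreamReader(
                new FileInputStream("D:/out/part-r-00000"), StandardCharsets.UTF_8))) {
            String line;
            for (int i = 0; i < 10 && (line = r.readLine()) != null; i++) {
                System.out.println(line);
            }
        }
    }
}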