Hadoop自帶Sort例子分析

阿新 • • 發佈：2017-11-16

lan exit more double expr ogr oms lru sort

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
  
*/

package org.apache.hadoop.examples;

import java.io.IOException;
import java.net.URI;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import 
 org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * This is the trivial map/reduce program that does absolutely nothing
 * other than use the framework to fragment and sort the input values.
 *
 * To run: bin/hadoop jar build/hadoop-examples.jar sort
 *            [-r <i>reduces</i>]
 *            [-inFormat <i>input format class</i>]
 *            [-outFormat <i>output format class</i>]
 *            [-outKey <i>output key class</i>]
 *            [-outValue <i>output value class</i>]
 *            [-totalOrder <i>pcnt</i> <i>num samples</i> <i>max splits</i>]
 *            <i>in-dir</i> <i>out-dir</i>
 */
public class Sort<K,V> extends Configured implements Tool {
  public static final String REDUCES_PER_HOST =
    "mapreduce.sort.reducesperhost";
  private Job job = null;

  static int printUsage() {
    System.out.println("sort [-r <reduces>] " +
                       "[-inFormat <input format class>] " +
                       "[-outFormat <output format class>] " +
                       "[-outKey <output key class>] " +
                       "[-outValue <output value class>] " +
                       "[-totalOrder <pcnt> <num samples> <max splits>] " +
                       "<input> <output>");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }

  /**
   * The main driver for sort program.
   * Invoke this method to submit the map/reduce job.
   * @throws IOException When there is communication problems with the
   *                     job tracker.
   */
  public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
       num_reduces = cluster.getTaskTrackers() *
                       Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass =
      SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass =
      SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K,V> sampler = null;
    for(int i=0; i < args.length; ++i) {
      try {
        if ("-r".equals(args[i])) {
          num_reduces = Integer.parseInt(args[++i]);
        } else if ("-inFormat".equals(args[i])) {
          inputFormatClass =
            Class.forName(args[++i]).asSubclass(InputFormat.class);
        } else if ("-outFormat".equals(args[i])) {
          outputFormatClass =
            Class.forName(args[++i]).asSubclass(OutputFormat.class);
        } else if ("-outKey".equals(args[i])) {
          outputKeyClass =
            Class.forName(args[++i]).asSubclass(WritableComparable.class);
        } else if ("-outValue".equals(args[i])) {
          outputValueClass =
            Class.forName(args[++i]).asSubclass(Writable.class);
        } else if ("-totalOrder".equals(args[i])) {
          double pcnt = Double.parseDouble(args[++i]);
          int numSamples = Integer.parseInt(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
          sampler =
            new InputSampler.RandomSampler<K,V>(pcnt, numSamples, maxSplits);
        } else {
          otherArgs.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " +
            args[i-1]);
        return printUsage(); // exits
      }
    }
    // Set user-supplied (possibly default) job configs
    job = Job.getInstance(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setNumReduceTasks(num_reduces);

    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
      System.out.println("ERROR: Wrong number of parameters: " +
          otherArgs.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
      System.out.println("Sampling input to effect total-order sort...");
      job.setPartitionerClass(TotalOrderPartitioner.class);
      Path inputDir = FileInputFormat.getInputPaths(job)[0];
      inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
      Path partitionFile = new Path(inputDir, "_sortPartitioning");
      TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
      InputSampler.<K,V>writePartitionFile(job, sampler);
      URI partitionUri = new URI(partitionFile.toString() +
                                 "#" + "_sortPartitioning");
      DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " +
        cluster.getTaskTrackers() +
        " nodes to sort from " +
        FileInputFormat.getInputPaths(job)[0] + " into " +
        FileOutputFormat.getOutputPath(job) +
        " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " +
        (end_time.getTime() - startTime.getTime()) /1000 + " seconds.");
    return ret;
  }



  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new Sort(), args);
    System.exit(res);
  }

  /**
   * Get the last job that was run using this instance.
   * @return the results of the last job that was run
   */
  public Job getResult() {
    return job;
  }
}

看了源碼的第一印象就是，我啥時候寫MapReduce也這麽規範，這麽屌......

Hadoop自帶Sort例子分析

lan exit more double expr ogr oms lru sort /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor lic

使用Hadoop自帶的例子pi計算圓周率

Hadoop中自帶的hadoop-mapreduce-examples-2.7.6.jar含有一些事例，本文將用pi計算圓周率。若想了解其計算原理，參考：http://thinkinginhadoop.iteye.com/blog/710847。具體步驟如下： 1. 啟

運行hadoop自帶的wordcount例子程序

inpu art ces put 自帶 tput wordcount ems example 1.準備文件 [root@master ~]# cat input.txt hello java hello python hello c hello java hello js

hadoop自帶例子wordcount的具體執行步驟

1.在linux系統中，所在目錄“/home/kcm”下建立一個資料夾input [[email protected]~]$ mkdir input 2.在資料夾input中建立兩個文字檔案file1.txt和file2.txt，file1.txt中內容是“hel

hadoop mahout 執行自帶的例子

The 20 Newsgroups資料集合收集了大約20,000 個新聞群組的文件,它們被劃分成為大約20個新聞組。在機器學習領域，這個資料集合被廣泛的應用與分類和聚類測試資料集。下面是介紹如何利用這個資料集合來進行分類。準備工作：（1）首先確保安裝了hadoop和m

JVM自帶性能分析工具介紹——jmap和jhat

inf info weibo 介紹 oci mar style lan user 0L蛻投M口l形繃9http://huiyi.docin.com/sina_5847440681 RH姥胤1操士剮訊39鎂http://www.docin.com/sina_62699771

JVM自帶性能分析工具介紹——jstat

mfp ldd 性能分析工具 blank pxn qtp targe 自帶 mcs 炭撐pcdw1律v煌映40嘔http://tushu.docin.com/sina_6345212704 倩角72g傲28蛻iy墾84http://tushu.docin.com/sina_

利用Hadoop自帶example實現wordCount Failed to execute operation: No such file or directory(systemctl enable iptables.service)

上次雖然把環境搭好了，但是實際執行起來一堆錯誤，下面簡述一下踩的坑。 1、hadoop fs -put上傳檔案失敗報錯資訊：（test資料夾是已經成功建好的） [[email protected] ~]# hadoop fs -put test1.txt /

用hadoop自帶的wordcount測試

1、建立檔案example.txt,並將其拷貝到hdfs的/user/root中 ./hadoop fs -put /root/example.txt /user/root 2、執行hadoop-mapreduce-examples-2.8.0.jar

hadoop 自帶示例wordcount 詳細執行步驟

因為機器學習，接觸到了資料探勘；因為資料探勘，接觸到了大資料；因為大資料，接觸到了Hadoop。之前有過hadoop的簡單瞭解，但都是基於別人提供的hadoop來學習和使用，雖然也很好用，終究不如自己的使用起來方便。經過這兩天參考大量網上的經驗，終於成功的搭

Hadoop自帶WordCount.java程式

位置：{Hadoop_HOME}\hadoop-0.20.1\src\examples\org\apache\hadoop\examples\WordCount.java 其中{Hadoop_HOME}是安裝後的Hadoop所在的目錄 Hadoop自帶WordCount程式

JVM系列-04-JDK自帶的管理分析工具

宣告本篇文章是本人閱讀《深入理解JVM》和《java虛擬機器規範》時的筆記。記錄的都是一些概念性的東西。 JVM是HotSpot，jdk1.7。大神繞路，不喜勿噴。 1 概覽對於java程式設計師來說，java、javac、ja

Libevent0.1之測試Libevent自帶的例子

先解壓下載下來的原始碼Download$ tar xzvf libevent-0.1.tar.gz 編譯庫檔案Download$ cd libevent Download/libevent$ ./configure Download/libevent$ make 編譯自帶的sample，-I引數指定標頭檔案目

mahout自帶的例子 -------------------------分類

介紹這二十個新聞組資料集合是收集大約20,000新聞組文件，均勻的分佈在20個不同的集合。這20個新聞組集合採集最近流行的資料集合到文字程式中作為實驗，根據機器學習技術。例如文字分類，文字聚集。我們將使用Mahout的Bayes Classifier創造一個模型，它將一個

C++自帶sort函式對vector容器元素進行排序

1.包含標頭檔案 #include<algorithm>，然後using namespace std; 2.假如你定義的vector變數為vector<Type> num，則如下

MapReduce-Join中級優化-hadoop自帶datajoin的解決方法

接著上一篇《MapReduce-Join操作-初級優化》這一篇部落格繼續說明MapReduce對於Join的操作，這裡使用hadoop包中自帶的datajoin包來處理，如果是hadoop1.x則包在${HADOOP_HOME}/contrib/datajoin資料夾下。如果

星環大數據安全組件Guardian與hadoop自帶的安全組件區別

對象 linux 密碼 tro snap 方法輪廓 control 重要在進行講解之前，先帶大家學習下hadoop關於hdfs自己的安全如何實現的--------------------------- 名詞： ACL-訪問控制列表（Access Control Li

學習Hadoop MapReduce與WordCount例子分析

/* MapReduce框架一直圍繞著key-value這樣的資料結構，下面以官方自帶的WordCount為例子，自己分析MapReduce的工作機制。MapReduce可以分為Map和Reduce過程，程式碼實現了兩個類，分別是繼承Mapper和Reducer，Map

Hadoop自帶的一些程式示例

一、PiEstimator.java 位置：E:\Hadoop\hadoop-0.20.1\src\examples\org\apache\hadoop\examples /** * Licensed to the Apache Software Foundation (

Unity3D自帶案例AngryBots分析（三）——怪物啟用、攻擊、動作邏輯控制分析，第一個怪物KamikazeBuzzer的攻擊特效的實現原理

從Hierarchy檢視中可以看見，Enemies物件下面掛有很多子物件，很多都是Prefab。而點選這些子物件，其實發現它們的很多地方有很大的相同之處，就拿SimpleBuzzers來看，裡面的怪物KamikazeBuzzer都是相同的怪物Prefab，隨便點選一個，都可

Hadoop自帶Sort例子分析

相關推薦