大資料（hadoop-mapreduce程式設計應用）

阿新 • • 發佈：2019-06-03

package demo;
import  java.io.*;
import org.apache.hadoop.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;



public class dataRedunplication {
		public static class Map extends Mapper<Object,Text,Text,Text>{
			private static Text line = new Text();
			public void map(Object key, Text value, Context context)throws IOException,InterruptedException
			{
				line = value;
				context.write(line, new Text(""));
			}
		}
		
		public static class Reduce extends Reducer<Text,Text,Text,Text>
		{
			public void reduce(Text key, Iterable <Text>values, Context context) throws IOException, InterruptedException
			{
				context.write(key, new Text(""));
			}
		}
		public static void main(String[] args) throws Exception
		{
			Configuration conf = new Configuration();
			conf.set("mapred.job.tracker","192.168.1.2:9001");
			String[] ioArgs = new String[]{"input","output"};
			String[] otherArgs = new GenericOptionsParser(conf,ioArgs).getRemainingArgs();
			if(otherArgs.length!= 2)
			{
				System.err.println("Usage:Data redunplication <in><out>");
				System.exit(2);
			}
			Job job = new Job(conf,"Data Redunplication");
			job.setJarByClass(dataRedunplication.class);
			job.setMapperClass(Map.class);
			job.setReducerClass(Reduce.class);
			//job.setCombinerClass(Reduce.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(Text.class);
			 
			FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
			FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
			System.exit(job.waitForCompletion(true)?0:1);
		
		}
}

package demo;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import demo.dataRedunplication.Map;
import demo.dataRedunplication.Reduce;

public class hufen {
	public static class Map extends Mapper<LongWritable,Text,Text,NullWritable>{
		Text keyout =new Text();
		Text valueout =new Text();
		
		public void map(LongWritable key, Text value, Context context)throws IOException,InterruptedException
		{
			 String[] rdLine = value.toString().trim().split(":");
			 String username = rdLine[0];
			 String[] fans = rdLine[1].split(",");
			 for(int i =0;i<fans.length;i++)
			 { 
				 String fansname = fans[i];
				 String hufenzu = "";
				 if(username.compareTo(fansname)<0)
				 {
					hufenzu  = username+"-"+fansname;//  a-b
				 }
				 else
				 {
					 hufenzu = fansname+"-"+username;// b-a=> a-b
				 }
				 keyout.set(hufenzu);
				 context.write(keyout,NullWritable.get());
			 }
			 
		}
		public static class Reduce extends Reducer<Text,NullWritable,Text,NullWritable>
		{
			Text keyout =new Text();
			Text valueout =new Text();
			public void reduce(Text key, Iterable <NullWritable>values, Context context) throws IOException, InterruptedException
			{
				int count = 0;
				for(NullWritable text : values )
				{
					count++;
				}
				if(count == 2)
				{
					context.write(key, NullWritable.get());
				}
				
				
			}
		}
		public static void main(String[] args) throws Exception
		{
			Configuration conf = new Configuration();
			conf.set("mapred.job.tracker","192.168.1.2:9001");
			String[] ioArgs = new String[]{"input2","output2"};
			String[] otherArgs = new GenericOptionsParser(conf,ioArgs).getRemainingArgs();
			Job job = new Job(conf,"hufen");
			job.setJarByClass(hufen.class);
			job.setMapperClass(Map.class);
			job.setReducerClass(Reduce.class);
			//job.setCombinerClass(Reduce.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(NullWritable.class);
			FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
			FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
			System.exit(job.waitForCompletion(true)?0:1);
		}
	}
}

大資料（hadoop-mapreduce程式設計應用）

package demo; import java.io.*; import org.apache.hadoop.*; i

大資料（hadoop-mapreduce案例講解）

package com.vip; import java.io.IOException; import java.util

大資料（hadoop-mapreduce程式碼及程式設計模型講解）

MapReduce程式設計模型 MapReduce將整個執行過程分為兩個階段： Map階段和Reduce階段 Map階段由

大資料（hadoop-flume案例講解）

a2.cnf #定義agent名稱，source，channel，sink的名稱 #a1就是我們給agent起的名字，

大資料（hadoop-小檔案合併、Mapreduce原理）

hadoop-小檔案合併 package com.andy.merge; import org.apache.hadoo

分享知識-快樂自己：Linux-大資料（Hadoop）初始化環境搭建

大資料初始化環境搭建：一）：大資料（hadoop）初始化環境搭建二）：大資料（hadoop）環境搭建三）：執行wordcount案例四）：揭祕HDFS 五）：揭祕MapReduce 六）：揭祕HBase 七）：HBase程式設計 ----------------------------

分享知識-快樂自己：大資料（hadoop）環境搭建

大資料 hadoop 環境搭建：一）：大資料（hadoop）初始化環境搭建二）：大資料（hadoop）環境搭建三）：執行wordcount案例四）：揭祕HDFS 五）：揭祕MapReduce 六）：揭祕HBase 七）：HBase程式設計 -----------------------

大資料（hadoop-自定義資料型別、檔案格式）

自定義InputFormat OutputFormat 示例程式碼 package com.vip09;

大資料（hadoop分散式搭建和yarn）

分散式搭建步驟 1：克隆一臺機器完成後，按以下步驟進行修改（作為源克隆主機） 1）修改網

大資料（hadoop-flume的原理架構）

背景介紹 Hadoop提供了一個中央化的儲存系統有利於進行集中式的資料分析與資料共享 Hadoo

大資料（hadoop-資料入庫系統Sqoop原理架構）

Sqoop是什麼 Sqoop：SQL-to-Hadoop 連線傳統關係型資料庫和Hadoop的橋樑

大資料_Shuffle、MapReduce程式設計案例(資料去重、多表查詢、倒排索引、使用單元測試)

一、什麼是Shuffle（洗牌） ----> MapReduce核心 1、序列化 2、排序 3、分割槽 4、合併二、MapReduce程式設計案例 ------> 掌握方法：如何開發一個程式 1、資料

c++ fstream + string 處理大資料（與c 的fread）

一：起因（1）之前處理文字資料時，各種清洗資料用的都是java的File,FileReader/FileWriter,BufferedReader/BufferedWriter等類，（2）應用java的原因是java裡面的map非常靈活，eclipse編譯器更是給力，而且

大資料之Hadoop（MapReduce（四））------->企業優化

6.1 MapReduce 跑的慢的原因 Mapreduce 程式效率的瓶頸在於兩點： 1）計算機效能 CPU、記憶體、磁碟健康、網路 2）I/O 操作優化（1）資料傾斜（2）map和reduce數設定不合理（3）reduce等待過久（4）小檔案過多

網路程式設計基礎【day09】：socket接收大資料（五）

本節內容 1、概述 2、socket接收大資料 3、中文字元的坑一、概述　　上篇部落格寫到了，就是說當伺服器傳送至客戶端的資料，大於客戶端設定的資料，則就會把資料服務端發過來的資料剩餘資料存在IO緩衝區中，那我們如何解決這個問題呢？　　有的同學就說了：改大客戶端接收的資料的大小=&

大資料（十五）：Hadoop資料壓縮與壓縮/解壓縮例項

一、資料壓縮 1.概論壓縮技術能夠有效減少低層儲存系統（HDFS）讀寫位元組。壓縮提高了網路頻寬和磁碟空間的效率。在Hadoop下，尤其是資料規模很大和工作負載密集的情況下，使用資料壓縮顯得非常重要。在這種情況下，I/O操作

大資料（十三）：MapJoin（DistributedCache分散式快取）、資料清理例項與計數器應用

一、在map端表合併（DistributedCache分散式快取） 1.適用場景適合用於關聯表中有小表的情形。可以將小表分發到所有的

大資料之Hadoop學習（環境配置）——Hadoop偽分散式叢集搭建

title: Hadoop偽分散式叢集搭建 date: 2018-11-14 15:17:20 tags: Hadoop categories: 大資料點選檢視我的部落格: Josonlee’s Blog 文章目錄前言準備偽分

初識大資料（二. Hadoop是什麼）

hadoop是一個由Apache基金會所釋出的用於大規模叢集上的分散式系統並行程式設計基礎框架。目前已經是大資料領域最流行的開發架構。並且已經從HDFS、MapReduce、Hbase三大核心元件成長為一個具有60多個元件構成的龐大生態，可以滿足大資料採集、儲存、開發、分析、演算法、建模等方方面面。在ha

初識大資料（三. Hadoop與MPP資料倉庫）

　　MPP代表大規模並行處理，這是網格計算中所有單獨節點參與協調計算的方法。是將任務並行的分散到多個伺服器和節點上，在每個節點上計算完成後，將各自部分的結果彙總在一起得到最終的結果。 MPP DBMS是建立在這種方法之上的資料庫管理系統。在這些系統中的每個查詢都會被分解為由MPP網格的節點並行執行

大資料（hadoop-mapreduce程式設計應用）

相關推薦