
Map-side join in MapReduce
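In a map-side join, the small table is shipped to every map task through the DistributedCache and loaded into memory in setup(); each map() call then probes that in-memory table, so no reduce phase (and no shuffle) is needed. The listing below assumes records in both tables are delimited by ":::". As a hypothetical illustration (the file names and records here are invented, not from the original data):

author file (the small table, cached; authorId:::authorName):
    a001:::Tom White
    a002:::Doug Cutting

input file (the large table; bookId:::authorId):
    b101:::a002
    b102:::a001

For every input record whose authorId is present in the cached map, the mapper emits (bookId, authorName); unmatched records are dropped, i.e. this is an inner join.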

package com.inspur.mapreduce.join;

/*************************************
 * @author:	caolch
 * @date:	2013-12-31
 * @note:	A table join implemented entirely in the mapper;
 *          the small table is loaded into memory via the DistributedCache
 *************************************/

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MapJoin extends Configured implements Tool {

	public static class MyMapper extends Mapper<Object, Text, Text, Text> {
		// In-memory copy of the small (author) table, populated in setup()
		private HashMap<String, String> authorMap = new HashMap<String, String>();

		@Override
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// Each input line is expected to look like "id:::joinKey"
			String[] tokens = value.toString().split(":::");
			if (tokens.length < 2) {
				return;	// skip malformed lines
			}
			String joinData = authorMap.get(tokens[1]);

			// Emit only records with a match in the small table (inner join)
			if (joinData != null) {
				context.write(new Text(tokens[0]), new Text(joinData));
			}
		}

		// setup() runs once per task, before any calls to map()
		@Override
		public void setup(Context context) throws IOException,
				InterruptedException {
			// Get the local paths of the files placed in the distributed cache
			Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());

			// Load the small table from the cached file(s) into the in-memory map
			if (cacheFiles != null && cacheFiles.length > 0) {
				String line;
				String[] tokens;
				for (Path path : cacheFiles) {
					if (path.toString().contains("author")) {
						BufferedReader br = new BufferedReader(new FileReader(path.toString()));
						try {
							while ((line = br.readLine()) != null) {
								// Split into key and the rest of the line: "key:::value"
								tokens = line.split(":::", 2);
								if (tokens.length == 2) {
									authorMap.put(tokens[0], tokens[1]);
								}
							}
						} finally {
							br.close();
						}
					}
				}
			}
		}
		
	}
	
	@Override
	public int run(String[] args) throws Exception {
		Configuration conf = getConf();
		Job job = new Job(conf, "MapJoin");
		job.setJarByClass(MapJoin.class);
		job.setMapperClass(MyMapper.class);
		// Map-only job: the join happens entirely on the map side
		job.setNumReduceTasks(0);
		
		/* Register the small table file(s) with the distributed cache */
		Path cacheFilePath = new Path(args[0]);
		FileSystem hdfs = FileSystem.get(conf);
		FileStatus fileStatus = hdfs.getFileStatus(cacheFilePath);
		// args[0] may be a single file or a directory
		if (!fileStatus.isDir()) {
			// A single file: add it to the cache directly
			DistributedCache.addCacheFile(cacheFilePath.toUri(), job.getConfiguration());
		} else {
			// A directory: list it and add every file inside
			for (FileStatus fs : hdfs.listStatus(cacheFilePath)) {
				DistributedCache.addCacheFile(fs.getPath().toUri(), job.getConfiguration());
			}
		}
	
		Path in = new Path(args[1]);
		Path out = new Path(args[2]);
		// Set the input/output paths and formats
		FileInputFormat.setInputPaths(job, in);
		FileOutputFormat.setOutputPath(job, out);
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Return the job status instead of calling System.exit() here,
		// so ToolRunner (and main) can handle the exit code
		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new MapJoin(), args);
		System.exit(res);
	}

}
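A minimal sketch of how the job might be launched; the jar name and HDFS paths are hypothetical. args[0] is the small table to cache, args[1] the input path, args[2] the output directory:

hadoop jar mapjoin.jar com.inspur.mapreduce.join.MapJoin \
    /data/author \
    /data/books \
    /data/output

Note that org.apache.hadoop.filecache.DistributedCache is deprecated in Hadoop 2.x; on newer releases one would typically register cache files with job.addCacheFile(uri) and read them the same way in setup(). The listing above keeps the older API throughout for consistency.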