
MapReduce Data Cleaning

I. Simple Parsing Version

1. Requirements

Remove log records that contain 11 or fewer fields.

2. Input Data
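
The sample input file from the original post is not reproduced here. Purely for illustration, a hypothetical space-separated web access log line (Nginx-style) might look like the following; splitting it on spaces yields 13 fields, which is more than 11, so the record would be kept:

194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"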

3. Implementation

(1) Write the LogMapper

package com.bigdata.mapreduce.weblog;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class LogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

	Text k = new Text();

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {

		// 1. Get one line of input
		String line = value.toString();

		// 2. Parse the log line
		boolean result = parseLog(line, context);

		// 3. Skip the record if it is invalid
		if (!result) {
			return;
		}

		// 4. Set the output key
		k.set(line);

		// 5. Write the record out
		context.write(k, NullWritable.get());
	}

	// Parse one log line; count valid and invalid records with custom counters
	private boolean parseLog(String line, Context context) {
		// 1. Split the line into fields on spaces
		String[] fields = line.split(" ");

		// 2. A record with more than 11 fields is considered valid
		if (fields.length > 11) {
			// Custom counter for valid records
			context.getCounter("map", "true").increment(1);
			return true;
		} else {
			// Custom counter for invalid records
			context.getCounter("map", "false").increment(1);
			return false;
		}
	}
}

(2) Write the LogDriver

package com.bigdata.mapreduce.weblog;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LogDriver {

	public static void main(String[] args) throws Exception {

		// Local input and output paths for a quick test run
		args = new String[] { "e:/input/inputlog", "e:/output1" };

		// 1. Get the job instance
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// 2. Set the jar containing the driver class
		job.setJarByClass(LogDriver.class);

		// 3. Associate the mapper
		job.setMapperClass(LogMapper.class);

		// 4. Set the final output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// Set the number of reduce tasks to 0 (map-only job, no shuffle)
		job.setNumReduceTasks(0);

		// 5. Set the input and output paths
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 6. Submit the job and wait for it to finish
		job.waitForCompletion(true);
	}
}
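
To check how many records were kept or dropped, the custom counters incremented in parseLog can be read back from the Job object once the run finishes (Hadoop also prints them in the job summary). A minimal sketch, to be placed inside main right after job.waitForCompletion(true):

		// Read back the custom counters from the "map" group used in parseLog
		long kept = job.getCounters().findCounter("map", "true").getValue();
		long dropped = job.getCounters().findCounter("map", "false").getValue();
		System.out.println("valid records: " + kept + ", dropped records: " + dropped);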

II. Complex Parsing Version

1. Requirements

Identify and split the individual fields of the web access log.
Remove invalid records from the log.
According to the statistical requirements, generate filtered data for the various types of access requests.

2. Input Data
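
Again, the sample data from the original post is not included. Assuming the same hypothetical Nginx-style line shown in the simple version, the fields that the mapper below picks out by index would be:

fields[0]      -> remote_addr      (194.237.142.21)
fields[1]      -> remote_user      (-)
fields[3]      -> time_local       ("[18/Sep/2013:06:49:18", leading "[" stripped)
fields[6]      -> request          (/wp-content/uploads/2013/07/rstudio-git3.png)
fields[8]      -> status           (304)
fields[9]      -> body_bytes_sent  (0)
fields[10]     -> http_referer     ("-")
fields[11..12] -> http_user_agent  ("Mozilla/4.0 (compatible;)")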

3. Implementation

(1) Define a bean that holds the individual fields of a log record

package com.bigdata.mapreduce.log;

import lombok.Data;

@Data
public class LogBean {
	private String remote_addr; // client IP address
	private String remote_user; // client user name; "-" means the value is absent
	private String time_local; // access time and time zone
	private String request; // requested URL and HTTP protocol version
	private String status; // response status code; 200 means success
	private String body_bytes_sent; // size of the response body sent to the client
	private String http_referer; // page the request was linked from
	private String http_user_agent; // client browser information
	private boolean valid = true; // whether the record is valid

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append(this.valid);
		sb.append("\001").append(this.remote_addr);
		sb.append("\001").append(this.remote_user);
		sb.append("\001").append(this.time_local);
		sb.append("\001").append(this.request);
		sb.append("\001").append(this.status);
		sb.append("\001").append(this.body_bytes_sent);
		sb.append("\001").append(this.http_referer);
		sb.append("\001").append(this.http_user_agent);
		
		return sb.toString();
	}
}
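
The @Data annotation comes from Lombok (hence the import lombok.Data added above); it generates the getters and setters that the mapper below relies on (setRemote_addr, isValid, and so on). If Lombok is not on the classpath, the same accessors have to be written by hand. A minimal sketch for two of the fields; the remaining fields follow the same pattern:

	// Hand-written equivalent of what Lombok's @Data generates (two fields shown)
	public String getRemote_addr() { return remote_addr; }
	public void setRemote_addr(String remote_addr) { this.remote_addr = remote_addr; }

	public boolean isValid() { return valid; }
	public void setValid(boolean valid) { this.valid = valid; }
	// ...and likewise for remote_user, time_local, request, status,
	// body_bytes_sent, http_referer and http_user_agent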

(2) Write the LogMapper program

package com.bigdata.mapreduce.log;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class LogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

	Text k = new Text();

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// 1. Get one line of input
		String line = value.toString();

		// 2. Parse the line and check whether it is valid
		LogBean bean = pressLog(line);

		if (!bean.isValid()) {
			return;
		}

		k.set(bean.toString());

		// 3. Write the record out
		context.write(k, NullWritable.get());
	}

	// Parse one log line into a LogBean
	private LogBean pressLog(String line) {
		LogBean logBean = new LogBean();

		// 1. Split the line into fields on spaces
		String[] fields = line.split(" ");

		if (fields.length > 11) {
			// 2. Populate the bean
			logBean.setRemote_addr(fields[0]);
			logBean.setRemote_user(fields[1]);
			logBean.setTime_local(fields[3].substring(1));
			logBean.setRequest(fields[6]);
			logBean.setStatus(fields[8]);
			logBean.setBody_bytes_sent(fields[9]);
			logBean.setHttp_referer(fields[10]);

			if (fields.length > 12) {
				logBean.setHttp_user_agent(fields[11] + " " + fields[12]);
			} else {
				logBean.setHttp_user_agent(fields[11]);
			}

			// A status code of 400 or above indicates an HTTP error
			if (Integer.parseInt(logBean.getStatus()) >= 400) {
				logBean.setValid(false);
			}
		} else {
			logBean.setValid(false);
		}

		return logBean;
	}
}
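
One caveat: Integer.parseInt will throw a NumberFormatException and fail the map task if a malformed line happens to carry a non-numeric value in the status position. The original code does not guard against this; a defensive variant of that check (a sketch, not part of the post) would simply mark such records invalid:

		// Defensive status check (sketch): a non-numeric status marks the record
		// invalid instead of letting NumberFormatException kill the map task
		try {
			if (Integer.parseInt(logBean.getStatus()) >= 400) {
				logBean.setValid(false);
			}
		} catch (NumberFormatException e) {
			logBean.setValid(false);
		}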

(3) Write the LogDriver program

package com.bigdata.mapreduce.log;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LogDriver {

	public static void main(String[] args) throws Exception {
		// 1. Get the job instance
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// 2. Set the jar containing the driver class
		job.setJarByClass(LogDriver.class);

		// 3. Associate the mapper
		job.setMapperClass(LogMapper.class);

		// 4. Set the final output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// 5. Set the input and output paths
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 6. Submit the job and wait for it to finish
		job.waitForCompletion(true);
	}
}
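
Unlike the simple version, this driver takes the input and output paths from the command line. Assuming the classes are packaged into a jar (the jar name and HDFS paths below are only examples), the job could be submitted to a cluster with something like:

hadoop jar weblog-clean.jar com.bigdata.mapreduce.log.LogDriver /input/weblog /output/weblog_cleaned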