Hadoop Programming in Practice: Log Analysis
First, upload the log file into Hadoop's HDFS.
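For reference, here is a minimal sketch of doing that upload with the HDFS Java API instead of the hadoop fs -put command. The local source path is hypothetical; the hdfs://hadoop1:9000 NameNode address and the /week5/in target are taken from the job code further down:
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class UploadLog {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // NameNode address assumed from the HDFS paths used by the jobs below
        FileSystem fs = FileSystem.get(URI.create("hdfs://hadoop1:9000"), conf);
        // local path is hypothetical; the target matches the input path the jobs read
        fs.copyFromLocalFile(new Path("/tmp/access.20120104.log"),
                new Path("/week5/in/access.20120104.log"));
        fs.close();
    }
}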
I. From the log file above, compute for that day the number of unique IPs, the number of page views (PV; note that the log must be filtered, not every record is counted), and the total number of bytes sent for the requested pages.
1. Split each log record into 8 fields and build the KPI indicator object. (The line is split on single spaces, so the timestamp, the quoted request, and the user agent each span several tokens; that is why parser() reads indices 0, 1, 3, 6, 8, 9, 10 and 11, and appends index 12 to the user agent when it is present.)
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
/*
* KPI Object
*/
public class KPI {
private String remote_addr;// client IP address
private String remote_user;// client user name; "-" when not recorded
private String time_local;// access time and time zone
private String request;// requested URL and HTTP protocol
private String status;// request status; 200 on success
private String body_bytes_sent;// size of the response body sent to the client
private String http_referer;// page the request was linked from
private String http_user_agent;// client browser information
private boolean valid = true;// whether the record is valid
private static KPI parser(String line) {
KPI kpi = new KPI();
String[] arr = line.split(" ");
if (arr.length > 11) {
kpi.setRemote_addr(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatus(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
if (arr.length > 12) {
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
} else {
kpi.setHttp_user_agent(arr[11]);
}
try{
// some records have no status field at all; treat them as invalid
if (Integer.parseInt(kpi.getStatus()) >= 400) {// status >= 400 means an HTTP error
kpi.setValid(false);
}
}catch(Exception e){
System.out.println(line);
kpi.setValid(false);
}
} else {
kpi.setValid(false);
}
return kpi;
}
/**
* Filter for PV-by-page statistics
*/
public static KPI filterPVs(String line) {
/*KPI kpi = parser(line);
Set<String> pages = new HashSet<String>();
pages.add("/forum-46-1.html");
pages.add("/forum-58-1.html");
pages.add("/forum-61-1.html");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}
return kpi;*/
return parser(line);
}
/**
* Filter for unique-IP-by-page statistics
*/
public static KPI filterIPs(String line) {
/*KPI kpi = parser(line);
Set<String> pages = new HashSet<String>();
pages.add("/forum-46-1.html");
pages.add("/forum-58-1.html");
pages.add("/forum-61-1.html");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}
return kpi;*/
return parser(line);
}
/**
* Filter for PV-by-browser statistics
*/
public static KPI filterBroswer(String line) {
return parser(line);
}
/**
* Filter for PV-by-hour statistics
*/
public static KPI filterTime(String line) {
return parser(line);
}
/**
* Filter for PV-by-referring-domain statistics
*/
public static KPI filterDomain(String line) {
return parser(line);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("valid:" + this.valid);
sb.append("\nremote_addr:" + this.remote_addr);
sb.append("\nremote_user:" + this.remote_user);
sb.append("\ntime_local:" + this.time_local);
sb.append("\nrequest:" + this.request);
sb.append("\nstatus:" + this.status);
sb.append("\nbody_bytes_sent:" + this.body_bytes_sent);
sb.append("\nhttp_referer:" + this.http_referer);
sb.append("\nhttp_user_agent:" + this.http_user_agent);
return sb.toString();
}
public String getRemote_addr() {
return remote_addr;
}
public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}
public String getRemote_user() {
return remote_user;
}
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
public String getTime_local() {
return time_local;
}
public Date getTime_local_Date() throws ParseException {
SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",
Locale.US);
return df.parse(this.time_local);
}
public String getTime_local_Date_hour() throws ParseException {
SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH");
return df.format(this.getTime_local_Date());
}
public void setTime_local(String time_local) {
this.time_local = time_local;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getBody_bytes_sent() {
return body_bytes_sent;
}
public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}
public String getHttp_referer() {
return http_referer;
}
public String getHttp_referer_domain() {
if (http_referer.length() < 8) {
return http_referer;
}
String str = this.http_referer.replace("\"", "").replace("http://", "")
.replace("https://", "");
return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str;
}
public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}
public String getHttp_user_agent() {
return http_user_agent;
}
public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
}
public static void main(String args[]) {
String line = "112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] \"GET /data/cache/style_2_common.css?AZH HTTP/1.1\" 200 57752 \"http://f.dataguru.cn/forum-58-1.html\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406\"";
System.out.println(line);
KPI kpi = new KPI();
String[] arr = line.split(" ");
kpi.setRemote_addr(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatus(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
System.out.println(kpi);
try {
SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss z",Locale.US);
System.out.println(df.format(kpi.getTime_local_Date()));
System.out.println(kpi.getTime_local_Date_hour());
System.out.println(kpi.getHttp_referer_domain());
} catch (ParseException e) {
e.printStackTrace();
}
}
}
2. Code to compute unique IPs, PVs and total bytes
package week5;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class KPIIPVBYTE {
public static class ParseLogMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text> {
@Override
public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
KPI kpi = KPI.filterIPs(value.toString());
if (kpi.isValid()) {
output.collect(new Text("ip"), new Text(kpi.getRemote_addr()));
output.collect(new Text("pv"), new Text("1"));
output.collect(new Text("ps"), new Text("".equals(kpi.getBody_bytes_sent())?"0":kpi.getBody_bytes_sent()));
}
}
}
public static class ParseLogReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
private Set<String> count = new HashSet<String>();
private long sumPv=0;
private long sumPs=0;
@Override
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
String keys = key.toString();
if("ip".equals(keys.toLowerCase().trim())){
while (values.hasNext()) {
count.add(values.next().toString());
}
output.collect(new Text("IP總數:"), new Text(String.valueOf(count.size())));
}else if("pv".equals(keys.toLowerCase().trim())){
while (values.hasNext()) {
sumPv+= Long.parseLong(values.next().toString());
}
output.collect(new Text("PV數:"), new Text(String.valueOf(sumPv)));
}else if("ps".equals(keys.toLowerCase().trim())){
while (values.hasNext()) {
sumPs +=Integer.parseInt(values.next().toString());
}
sumPs = sumPs/1024/1024;// convert bytes to MB
output.collect(new Text("Total bytes:"), new Text(String.valueOf(sumPs)+"M"));
}
}
}
public static void main(String[] args) throws Exception{
String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
String output = "hdfs://hadoop1:9000/week5/out/";
JobConf conf = new JobConf(KPIIPVBYTE.class);
conf.setJobName("IPPVPS");
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(ParseLogMapper.class);
conf.setReducerClass(ParseLogReducer.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
JobClient.runJob(conf);
System.exit(0);
}
}
Results:
Total unique IPs: 34413
Total bytes: 67627M
Total PVs: 2910085
II. Count the referring sites: list each referring domain and the number of unique IPs it brought.
1. Referrer-count code
package week5;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SourceCount {
public static class MyMapper extends Mapper<Object, Text, Text, Text> {
private Text state = new Text();
private Text ip = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
KPI kpi = KPI.filterIPs(value.toString());
if (kpi.isValid()) {
state.set(kpi.getHttp_referer_domain());// use the referring domain rather than the full referer URL, since the task asks for domains
ip.set(kpi.getRemote_addr());
context.write(state, ip);
}
}
}
public static class SumReducer extends Reducer<Text, Text, Text, Text> {
private Text result = new Text();
public void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
Set<String> ips = new HashSet<String>();
for (Text val : values) {
ips.add(val.toString());
}
result.set(String.valueOf(ips.size()));
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(SourceCount.class);
job.setMapperClass(MyMapper.class);
// Note: SumReducer must not be used as a combiner here; it would replace the IP values with counts, and the reducer would then count distinct count strings instead of distinct IPs.
job.setReducerClass(SumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path("hdfs://hadoop1:9000/week5/in/access.20120104.log"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop1:9000/week5/out3"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
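Since the task asks for domains, the helper relied on here is KPI.getHttp_referer_domain(), which strips the surrounding quotes and the http:// or https:// prefix from the referer and keeps everything up to the first slash. A minimal sketch of what it returns, assuming the class is compiled alongside KPI in the week5 package:
package week5;
public class RefererDomainDemo {
    public static void main(String[] args) {
        KPI kpi = new KPI();
        // sample referer taken from the log line used in KPI.main()
        kpi.setHttp_referer("\"http://f.dataguru.cn/forum-58-1.html\"");
        System.out.println(kpi.getHttp_referer_domain()); // prints f.dataguru.cn
    }
}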
Results:
The output is too long to list in full; only a portion was kept as evidence.
III. Count the browser types used by visitors and compute each browser's percentage share.
1. Browser-share code
package week5;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class KPIBroswer {
// NOTE: a static field is only shared between the map and reduce phases when the whole job runs in one local JVM; on a real cluster each task has its own copy, so these percentages are only reliable in local/standalone runs (a counter-based alternative is sketched after the results below).
private static long total = 0;
public static class KPIBrowserMapper extends MapReduceBase implements
Mapper<Object, Text, Text, IntWritable> {
private Text browserInfo = new Text();
private IntWritable one = new IntWritable(1);
public void map(Object key, Text value,
OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
KPI kpi = KPI.filterBroswer(value.toString());
if (kpi.isValid()) {
browserInfo.set(kpi.getHttp_user_agent());
total++;
output.collect(browserInfo, one);
}
}
}
public static class KPIBrowserReducer extends MapReduceBase implements
Reducer<Text, IntWritable, Text, Text> {
private Text result = new Text();
public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
long sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
System.out.println("answer is over there");
System.out.println(sum);
System.out.println(total);
System.out.println(String.valueOf(((double) sum / total * 100) + "%"));
result.set(String.valueOf(((double) sum / total * 100) + "%"));
output.collect(key, result);
}
}
public static void main(String[] args) throws Exception {
String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
String output = "hdfs://hadoop1:9000/week5/out4";
JobConf conf = new JobConf(KPIBroswer.class);
conf.setJobName("KPIBrowser");
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(IntWritable.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(KPIBrowserMapper.class);
// conf.setCombinerClass(KPIBrowserReducer.class);
conf.setReducerClass(KPIBrowserReducer.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
JobClient.runJob(conf);
System.exit(0);
}
}
2. Results:
2.0617954458374926E-4%
"" 2.0617954458374926E-4%
"(C)Nokia6700s/SymbianOS/9.1 Series60/3.0" 3.4363257430624875E-5%
"-" 1.5351785257131665%
"-" "-" 6.872651486124975E-5%
"-" "Mozilla/4.0" 1.0308977229187463E-4%
"Amoi-F90/Plat-F/WAP2.0/MIDP1.0/CLDC1.0 UP.Browser/6.2.2.6.f.1.100 6.872651486124975E-5%
"AmoiE70/6.1.08/WAP2.0 Profile/MIDP2.0 3.4363257430624875E-5%
"AndroidDownloadManager" 6.872651486124975E-5%
"Apache-HttpClient/4.1 (java 3.4363257430624875E-5%
"Apache-HttpClient/4.1.1 (java 3.4363257430624875E-5%
"AppEngine-Google; (+http://code.google.com/appengine; 2.0617954458374926E-4%
"Apple-PubSub/28" 0.001683799614100619%
"Apple-PubSub/65" 3.4363257430624875E-5%
"Apple-PubSub/65.20" 3.4363257430624875E-5%
"Apple-PubSub/65.23" 6.185386337512477E-4%
"Apple-PubSub/65.28 AppEngine-Google; 3.4363257430624875E-5%
"Apple-PubSub/65.28" 0.015807098418087445%
"BGSY bot/1.0" 0.0041235908916749855%
"BaiduMobile/1.3.1 CFNetwork/485.12.7 3.4363257430624875E-5%
"BaiduMobile/1.3.2 CFNetwork/548.0.4 3.4363257430624875E-5%
"BaiduMobile/1.3.4 CFNetwork/485.12.7 6.872651486124975E-5%
"BaiduMobile/1.3.4 CFNetwork/485.13.9 6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/485.12.7 3.4363257430624875E-5%
"BaiduMobile/1.3.5 CFNetwork/485.13.9 2.4054280201437413E-4%
"BaiduMobile/1.3.5 CFNetwork/548.0.3 6.872651486124975E-5%
"BaiduMobile/1.3.5 CFNetwork/548.0.4 1.374530297224995E-4%
"Baiduspider" 1.374530297224995E-4%
"Baiduspider+(+http://www.baidu.com/search/spider.htm)" 0.01147732798182871%
"Baiduspider-news+(+http://www.baidu.com/search/spider.htm)" 0.009003173446823718%
"BlackBerry8820/2.7.0.105-4.5.0.174 Profile/MIDP-2.0 1.718162871531244E-4%
"BlackBerry8900/5.2.0.96 Profile/MIDP-2.0 6.872651486124975E-5%
"BlackBerry9000/5.2.0.89 Profile/MIDP-2.0 3.4363257430624875E-5%
"BlackBerry9700/5.0.0.862 Profile/MIDP-2.1 3.4363257430624875E-5%
"CoolPadF800/CMCC WindowsCEOS/6.0/(2009.10.30)10.01.F800/WAP2.0 1.374530297224995E-4%
"Dalvik/1.2.0 (Linux; 5.49812118889998E-4%
"Dalvik/1.4.0 (Linux; 3.4363257430624875E-5%
"DoCoMo/2.0 N905i(c100;TB;W24H16) 3.779958317368737E-4%
"DoCoMo/2.0 P900i(c100;TB;W24H11) 1.374530297224995E-4%
"Domnutch-Bot/Nutch-1.0 (Domnutch; 6.872651486124975E-5%
"Doubanbot/1.0 ([email protected] 9.278079506268717E-4%
"E63/SymbianOS/9.1 Series60/3.0" 2.4054280201437413E-4%
"E66/SymbianOS/9.1 Series60/3.0" 4.123590891674985E-4%
"E71/SymbianOS/9.1 Series60/3.0" 6.872651486124975E-5%
"FTRF: Friendly 6.185386337512477E-4%
"Feed43 Proxy/1.0 3.779958317368737E-4%
"FeedDemon/3.1 (http://www.feeddemon.com/; 6.872651486124975E-5%
"FeedDemon/4.0 (http://www.feeddemon.com/; 0.00402050111938311%
"FeedFetcher-Google-CoOp; (+http://www.google.com/coop/cse/cref)" 7.559916634737474E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)" 6.872651486124976E-4%
"Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 0.3223273546992614%
"Feedreader 3.14 0.0035394155153543627%
"GIONEE-L011/SW1.0.0/WAP2.0/MIDP2.1 Configuration/CLDC-1.1" 6.872651486124975E-5%
"GoodReaderIPad/3.12.0 CFNetwork/548.0.4 3.4363257430624875E-5%
"GoogleProducer" 0.0015119833269474948%
"Googlebot-Image/1.0" 1.0308977229187463E-4%
"Googlebot/2.1 (+http://www.google.com/bot.html)" 0.0017181628715312442%
"Googlebot/2.1 (+http://www.googlebot.com/bot.html)" 2.0617954458374926E-4%
"Googlebot/2.1 (http://www.googlebot.com/bot.html)" 1.0308977229187463E-4%
"GreatNews/1.0" 0.012576952219608707%
"HD_T8282 Mozilla/4.0 3.4363257430624875E-5%
"HTCT9188_TD/1.0 WindowsMobile/6.5 1.0308977229187463E-4%
"HTC_Touch_Diamond2_T5353 Mozilla/4.0 1.0308977229187463E-4%
"HTC_Touch_Pro_T7272 Mozilla/4.0 3.4363257430624875E-5%
"HTMLParser/1.6" 6.872651486124975E-5%
"HTTP Fetcher/HTTP/1.0" 3.4363257430624875E-5%
"HTTP_Request2/2.0.0 (http://pear.php.net/package/http_request2) 2.74906059444999E-4%
"Holleycomm-H8800/2.0 WAP2.0 3.436325743062488E-4%
"HuaweiSymantecSpider/[email protected]+(compatible; MSIE 0.04123590891674985%
"HuaweiT5211_TD/1.0 RTKE_OS/01.00 1.718162871531244E-4%
"HuaweiT8100_TD/1.0 Android/2.2 1.374530297224995E-4%
"HuaweiU7520/B000 Browser/NetFront/4.1 3.4363257430624875E-5%
"Huaweisymantecspider (compatible; 0.027490605944499907%
"IUC(U;iOS 3.1.3;Zh-cn;320*480;)" 3.4363257430624875E-5%
"IUC(U;iOS 4.1;Zh-cn;320*480;)/UCWEB8.1.0.104/41/997" 4.123590891674985E-4%
...
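Because a static field such as total is only shared between map and reduce when the whole job runs in a single local JVM, the percentages above are only trustworthy in local mode. A minimal sketch of one counter-based alternative, under the same package and input-path assumptions as the code above (the KPI counter group, the VALID_PV counter name and the out5 output directory are made up for illustration): the mapper increments a framework counter, the reducer writes raw per-user-agent counts, and the driver reads the counter after JobClient.runJob() so the counts can be converted to percentages afterwards.
package week5;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class KPIBrowserPercent {
    private static final String GROUP = "KPI";          // counter group name, chosen for this sketch
    private static final String VALID_PV = "VALID_PV";  // counter name, chosen for this sketch
    public static class BrowserMapper extends MapReduceBase implements Mapper<Object, Text, Text, IntWritable> {
        private final Text browser = new Text();
        private final IntWritable one = new IntWritable(1);
        public void map(Object key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            KPI kpi = KPI.filterBroswer(value.toString());
            if (kpi.isValid()) {
                reporter.incrCounter(GROUP, VALID_PV, 1); // global total is kept by the framework, not by a static field
                browser.set(kpi.getHttp_user_agent());
                output.collect(browser, one);
            }
        }
    }
    public static class BrowserReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum)); // raw count only; the percentage is computed after the job
        }
    }
    public static void main(String[] args) throws Exception {
        String input = "hdfs://hadoop1:9000/week5/in/access.20120104.log";
        String output = "hdfs://hadoop1:9000/week5/out5"; // hypothetical output directory
        JobConf conf = new JobConf(KPIBrowserPercent.class);
        conf.setJobName("KPIBrowserPercent");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setMapperClass(BrowserMapper.class);
        conf.setCombinerClass(BrowserReducer.class); // raw counts are safe to pre-aggregate
        conf.setReducerClass(BrowserReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));
        RunningJob job = JobClient.runJob(conf);
        long total = job.getCounters().findCounter(GROUP, VALID_PV).getCounter();
        // each browser's share = its count in the out5 output * 100.0 / total
        System.out.println("total valid records: " + total);
    }
}
Dividing each count in the out5 part files by the counter value then yields the same percentages without relying on JVM-local state.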