
cascading--wordcount


Running WordCount in Eclipse, using the Cascading wrapper

Prerequisites: a CentOS system, JDK, Hadoop, Eclipse, the Cascading lib jars (available from the official site), the WordCount example source bundled with Cascading, and the crawl data directory (data). All of these can be downloaded from the official site.

I downloaded the materials from the Cascading website and ran the example in Eclipse to obtain the test results.

Difficulty: your Cascading version may not match the bundled WordCount example, in which case you have to adapt the code yourself; my Cascading jars were not downloaded from the official site.
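If your jars are a Cascading 2.x release while the bundled example targets 1.x, the usual adjustments are to the Hadoop-specific taps, schemes and the flow connector, which moved to new packages. The sketch below only illustrates the kind of changes typically involved; the 2.x package names are quoted from memory and the helper class name ConnectorSetup is made up for illustration, so verify everything against the jars actually on your classpath.

// A minimal sketch of the import/setup changes usually needed when compiling the
// 1.x-style example against Cascading 2.x jars. Package names are the Cascading 2.x
// locations as I recall them; verify against your own jars.
import java.util.Properties;

import cascading.flow.FlowConnector;
import cascading.flow.hadoop.HadoopFlowConnector;   // replaces "new FlowConnector(properties)"
import cascading.property.AppProps;                 // replaces FlowConnector.setApplicationJarClass
import cascading.scheme.hadoop.SequenceFile;        // was cascading.scheme.SequenceFile
import cascading.scheme.hadoop.TextLine;            // was cascading.scheme.TextLine
import cascading.tap.hadoop.Hfs;                    // was cascading.tap.Hfs
import cascading.tap.hadoop.Lfs;                    // was cascading.tap.Lfs

public class ConnectorSetup {
    // build a flow connector the Cascading 2.x way
    static FlowConnector newConnector() {
        Properties properties = new Properties();
        AppProps.setApplicationJarClass(properties, ConnectorSetup.class);
        return new HadoopFlowConnector(properties);
    }
}

With Cascading 1.x jars, the original imports and the plain new FlowConnector(properties) in the full code below work unchanged.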

Screenshots of my run:

[three screenshots of the run results, omitted here]

The full code is as follows:

package com.zjf.cascading.example;

/*
 * WordCount example
 * zjf-pc
 * Copyright (c) 2007-2012 Concurrent, Inc. All Rights Reserved.
 * Project and contact information: http://www.concurrentinc.com/
 */

import java.util.Map;
import java.util.Properties;

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.Identity;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexGenerator;
import cascading.operation.regex.RegexReplace;
import cascading.operation.regex.RegexSplitter;
import cascading.operation.xml.TagSoupParser;
import cascading.operation.xml.XPathGenerator;
import cascading.operation.xml.XPathOperation;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Tap;
import cascading.tap.Hfs;
import cascading.tap.Lfs;
import cascading.tuple.Fields;

public class WordCount {

    @SuppressWarnings("serial")
    private static class ImportCrawlDataAssembly extends SubAssembly {
        public ImportCrawlDataAssembly(String name) {
            // split each text line into the "url" and "raw" fields
            RegexSplitter regexSplitter = new RegexSplitter(new Fields("url", "raw"));
            Pipe importPipe = new Each(name, new Fields("line"), regexSplitter);
            // drop all pdf documents
            importPipe = new Each(importPipe, new Fields("url"), new RegexFilter(".*\\.pdf$", true));
            // replace ":nl:" with "\n" and discard the unused fields
            RegexReplace regexReplace = new RegexReplace(new Fields("page"), ":nl:", "\n");
            importPipe = new Each(importPipe, new Fields("raw"), regexReplace, new Fields("url", "page"));
            // this call is required
            setTails(importPipe);
        }
    }

    @SuppressWarnings("serial")
    private static class WordCountSplitAssembly extends SubAssembly {
        public WordCountSplitAssembly(String sourceName, String sinkUrlName, String sinkWordName) {
            // create a new assembly that counts words across all pages and per page
            Pipe pipe = new Pipe(sourceName);
            // convert the HTML to XHTML with TagSoup, keeping only "url" and "xml"
            pipe = new Each(pipe, new Fields("page"), new TagSoupParser(new Fields("xml")), new Fields("url", "xml"));
            // apply an XPath (XML Path Language) expression to the "xml" field to extract the "body" element
            XPathGenerator bodyExtractor = new XPathGenerator(new Fields("body"), XPathOperation.NAMESPACE_XHTML, "//xhtml:body");
            pipe = new Each(pipe, new Fields("xml"), bodyExtractor, new Fields("url", "body"));
            // apply another XPath expression that removes all elements, keeping only text nodes
            // and dropping text nodes inside "script" elements
            String elementXPath = "//text()[ name(parent::node()) != 'script']";
            XPathGenerator elementRemover = new XPathGenerator(new Fields("words"), XPathOperation.NAMESPACE_XHTML, elementXPath);
            pipe = new Each(pipe, new Fields("body"), elementRemover, new Fields("url", "words"));
            // split the document into individual words with a regex and emit each word
            // as a new tuple into the current stream with the "url" and "word" fields
            RegexGenerator wordGenerator = new RegexGenerator(new Fields("word"), "(?<!\\pL)(?=\\pL)[^ ]*(?<=\\pL)(?!\\pL)");
            pipe = new Each(pipe, new Fields("words"), wordGenerator, new Fields("url", "word"));
            // group by "url"
            Pipe urlCountPipe = new GroupBy(sinkUrlName, pipe, new Fields("url", "word"));
            urlCountPipe = new Every(urlCountPipe, new Fields("url", "word"), new Count(), new Fields("url", "word", "count"));
            // group by "word"
            Pipe wordCountPipe = new GroupBy(sinkWordName, pipe, new Fields("word"));
            wordCountPipe = new Every(wordCountPipe, new Fields("word"), new Count(), new Fields("word", "count"));
            // this call is required
            setTails(urlCountPipe, wordCountPipe);
        }
    }

    public static void main(String[] args) {
        // set the application jar for the current job
        Properties properties = new Properties();
        FlowConnector.setApplicationJarClass(properties, WordCount.class);
        FlowConnector flowConnector = new FlowConnector(properties);

        /*
         * Set the run arguments as follows:
         * right-click this class, choose Run As > Run Configurations > Java Application > Arguments,
         * and enter the following in the Program arguments box:
         *     data/url+page.200.txt output local
         *
         * args[0] is data/url+page.200.txt. It lives under the current application directory
         * and must be a path on the local filesystem. Mine is
         * /home/hadoop/app/workspace/HadoopApplication001/data/url+page.200.txt.
         * The directory must be created by hand and url+page.200.txt must exist;
         * it can be downloaded from the official site.
         *
         * args[1] is the output directory, the second argument; it lives on HDFS.
         * Mine is hdfs://s104:9000/user/hadoop/output, which must be created by hand.
         * After a successful run, three directories are generated under output:
         * pages, urls and words, containing all pages, all urls and all words respectively.
         *
         * args[2] is local, the third argument; it lives on the local filesystem.
         * Mine is /home/hadoop/app/workspace/HadoopApplication001/local.
         * This directory does not need to be created by hand; it is generated automatically
         * after a successful run, and two directories, urls and words, appear under it
         * containing the url counts and the word counts.
         */
        String inputPath = args[0];
        String pagesPath = args[1] + "/pages/";
        String urlsPath = args[1] + "/urls/";
        String wordsPath = args[1] + "/words/";
        String localUrlsPath = args[2] + "/urls/";
        String localWordsPath = args[2] + "/words/";

        // import a text file with crawled pages from the local filesystem into the Hadoop distributed filesystem;
        // the imported file will be a native Hadoop sequence file with the fields "page" and "url".
        // note this example stores crawled pages as a tabbed file, with the first field being the "url"
        // and the second being the "raw" document that had all newline chars ("\n") converted to the text ":nl:".

        // initialize the pipe assembly that imports the crawl data, returning the "url" and "page" fields
        Pipe importPipe = new ImportCrawlDataAssembly("import pipe");
        // create the tap instances
        Tap localPagesSource = new Lfs(new TextLine(), inputPath);
        Tap importedPages = new Hfs(new SequenceFile(new Fields("url", "page")), pagesPath);
        // connect the pipe assembly to the tap instances
        Flow importPagesFlow = flowConnector.connect("import pages", localPagesSource, importedPages, importPipe);

        // split the wordcount pipe defined above into two new pipes, url and word;
        // these pipes could be retrieved via the getTails() method and added to new pipe instances
        SubAssembly wordCountPipe = new WordCountSplitAssembly("wordcount pipe", "url pipe", "word pipe");
        // create Hadoop SequenceFile taps to store the counted results
        Tap sinkUrl = new Hfs(new SequenceFile(new Fields("url", "word", "count")), urlsPath);
        Tap sinkWord = new Hfs(new SequenceFile(new Fields("word", "count")), wordsPath);
        // bind multiple pipes and taps; the keys given here are the pipe names
        Map<String, Tap> sinks = Cascades.tapsMap(new String[]{"url pipe", "word pipe"}, Tap.taps(sinkUrl, sinkWord));
        // wordCountPipe refers to an assembly
        Flow count = flowConnector.connect(importedPages, sinks, wordCountPipe);

        // create an assembly that exports the Hadoop sequence files to local text files
        Pipe exportPipe = new Each("export pipe", new Identity());
        Tap localSinkUrl = new Lfs(new TextLine(), localUrlsPath);
        Tap localSinkWord = new Lfs(new TextLine(), localWordsPath);
        // use the assembly above to connect the two sinks
        Flow exportFromUrl = flowConnector.connect("export url", sinkUrl, localSinkUrl, exportPipe);
        Flow exportFromWord = flowConnector.connect("export word", sinkWord, localSinkWord, exportPipe);

        // load the flows (in any order) and execute
        Cascade cascade = new CascadeConnector().connect(importPagesFlow, count, exportFromUrl, exportFromWord);
        cascade.complete();
    }
}
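With the Program arguments data/url+page.200.txt output local described in the comments above, inputPath resolves to data/url+page.200.txt, the HDFS sinks to output/pages/, output/urls/ and output/words/, and the local exports to local/urls/ and local/words/. If you want a quick look at the exported word counts without opening the files by hand, a minimal sketch follows; the part-00000 file name is only the usual Hadoop naming convention and PrintWordCounts is a throwaway helper of mine, so adjust both to whatever your run actually produced.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

// Throwaway helper (not part of the example) for inspecting the exported word counts.
public class PrintWordCounts {
    public static void main(String[] args) throws IOException {
        // assumes the default Hadoop part-file naming under the "local" output directory
        String partFile = "local/words/part-00000";

        // the TextLine sink writes one tab-separated record per line, e.g. "word<TAB>count"
        Files.lines(Paths.get(partFile))
             .limit(20)                    // show only the first 20 records
             .forEach(System.out::println);
    }
}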
