Hadoop通過HCatalog編寫Mapreduce任務訪問hive庫中schema資料

阿新 • • 發佈：2022-05-02

1、driver

package com.kangaroo.hadoop.drive;

import java.util.Map;
import java.util.Properties;

import com.kangaroo.hadoop.mapper.AggregateMapper;
import com.kangaroo.hadoop.reducer.AggregateReducer;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.kangaroo.hadoop.utils.PropertiesUtil;


public class DriveMain extends Configured implements Tool {

    private static final Logger logger = LoggerFactory.getLogger(DriveMain.class);
    private Configuration conf;
    private PropertiesUtil propUtil;

    public DriveMain() {
        this.conf = new Configuration();
        this.propUtil = new PropertiesUtil("configure.properties");
    }

    public int run(String[] args) throws Exception {
        try {
            logger.info("MapReduce Job Beginning.");
            String dbName = args[0];
            String tableName = args[1];
            String partition = args[2];
            String sumField = args[3];
            String outPath = args[4];
            String partFilter = partitionFormat(partition);
            logger.info("[Params] dbName:{}; tableName:{}, partition:{}, sumField:{}, outPath:{}, partFilter:{}",
                    dbName, tableName, partition, sumField, outPath, partFilter);
            this.conf.set("sumField", sumField);
            this.setMapRedConfiguration();
            Job job = this.setJobConfiguration(this.conf);
            HCatInputFormat.setInput(job, dbName, tableName, partFilter);
            logger.info("setInput successfully.");
            FileOutputFormat.setOutputPath(job, new Path(outPath));
            logger.info("setOutput successfully.");
            return (job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception ex) {
            logger.error(ex.getMessage());
            throw ex;
        }
    }

    private Job setJobConfiguration(Configuration conf) throws Exception {
        try {
            logger.info("enter setJobConfiguration");
            Job job = Job.getInstance(conf);
            job.setJarByClass(DriveMain.class);
            job.setInputFormatClass(HCatInputFormat.class);
            job.setMapperClass(AggregateMapper.class);
            job.setReducerClass(AggregateReducer.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setNumReduceTasks(1);
            logger.info("setJobConfiguration successfully.");
            return job;
        } catch (Exception ex) {
            logger.error("setJobConfiguration: " + ex.getMessage());
            throw new Exception(ex);
        }
    }

    private void setMapRedConfiguration() {
        try {
            Properties properties = propUtil.getProperties();
            logger.info("Load MapReduce Configuration Successfully.");
            for (Map.Entry entry : properties.entrySet()) {
                if (entry.getKey().toString().startsWith("mapred")) {
                    conf.set(entry.getKey().toString(), entry.getValue().toString());
                    logger.info("[MR][Config] key:{}, value:{}", entry.getKey().toString(), entry.getValue().toString());
                }
            }
            logger.info("[MR][Config] Set MapReduce Configuration Successfully.");
        } catch (Exception e) {

        }

    }


    private String partitionFormat(String partition) {
        String format = "";
        if(!partition.contains("pt") && ! partition.contains("dt")) {
            String[] items = partition.split("/");
            String[] keys = {"year","month","day", "hour"};
            for(int i=0; i<items.length; i++) {
                if (i == items.length-1) {
                    format += keys[i] + "='" + items[i] + "'";
                } else {
                    format += keys[i] + "='" + items[i] + "' and ";
                }
            }
        } else {
            format = partition;
        }
        return format;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new DriveMain(), args);
        System.exit(exitCode);
    }

}

2、Mapper

package com.kangaroo.hadoop.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Mapper that emits {@code (value of the configured column, "1")} for every input record.
 *
 * <p>The column name is read from the job configuration key {@code "sumField"}. Records whose
 * value for that column is {@code null} are not emitted; they are counted in the
 * {@code AggregateMapper / NULL_SUM_FIELD} counter so the omission is visible in the job report.
 */
@SuppressWarnings("rawtypes")
public class AggregateMapper extends Mapper<WritableComparable, HCatRecord, Text, Text> {

    private static final Logger logger = LoggerFactory.getLogger(AggregateMapper.class);

    private static final String COUNTER_GROUP = "AggregateMapper";
    private static final String NULL_FIELD_COUNTER = "NULL_SUM_FIELD";

    private HCatSchema schema;
    private String sumField;
    private final Text outKey = new Text();
    private final Text outValue = new Text("1");

    /**
     * Resolves the table schema and the grouping column once per task.
     *
     * @throws IOException if {@code "sumField"} is not configured or is not a column of the schema
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        schema = HCatInputFormat.getTableSchema(context.getConfiguration());
        // Looked up once here rather than once per record.
        sumField = context.getConfiguration().get("sumField");
        if (sumField == null) {
            throw new IOException("Job configuration key 'sumField' is not set");
        }
        if (!schema.getFieldNames().contains(sumField)) {
            throw new IOException("Field '" + sumField + "' not found in table schema, available fields: "
                    + schema.getFieldNames());
        }
        logger.info("sumField={}", sumField);
    }

    /**
     * Writes one {@code (fieldValue, "1")} pair for the record.
     *
     * @param key   input key, unused
     * @param value the HCatalog record
     */
    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context) throws IOException, InterruptedException {
        Object fieldValue = value.get(sumField, schema);
        if (fieldValue == null) {
            context.getCounter(COUNTER_GROUP, NULL_FIELD_COUNTER).increment(1);
            return;
        }
        outKey.set(fieldValue.toString());
        // Emit per record; writing only in cleanup() would keep just the last record of the split.
        context.write(outKey, outValue);
    }
}

3、Reducer

package com.kangaroo.hadoop.reducer;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Reducer that sums the numeric text values received for each key and writes
 * {@code (key, sum)} as text.
 */
public class AggregateReducer extends Reducer<Text, Text, Text, Text> {
    protected static final Logger logger = LoggerFactory.getLogger(AggregateReducer.class);

    /** Reused output value; initialised eagerly so reduce() never sees a null holder. */
    private final Text outValue = new Text();

    /**
     * Adds up all values of {@code key} and writes the total.
     *
     * @param key    grouping key, written through unchanged
     * @param values decimal integers encoded as text
     * @throws NumberFormatException if a value is not a valid integer
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // long rather than int so large groups do not overflow.
        long sum = 0;
        for (Text value : values) {
            sum += Long.parseLong(value.toString());
        }
        outValue.set(String.valueOf(sum));
        // Write once per key; deferring to cleanup() would emit only the final key.
        context.write(key, outValue);
    }
}

4、PropertiesUtil

package com.kangaroo.hadoop.utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;


import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;


/**
 * Loads a {@code .properties} resource from the classpath.
 */
public class PropertiesUtil {
    private final String filePath;

    /** Uses the default resource name {@code configure.properties}. */
    public PropertiesUtil() {
        this("configure.properties");
    }

    /**
     * @param filePath classpath-relative name of the properties resource
     */
    public PropertiesUtil(String filePath) {
        this.filePath = filePath;
    }

    /**
     * Reads the resource and returns its contents.
     *
     * @return a freshly loaded {@link Properties} instance
     * @throws IOException if no resource name was given, the resource is not on the classpath,
     *         or it cannot be read
     */
    public Properties getProperties() throws IOException {
        if (this.filePath == null) {
            throw new IOException("No properties resource name was provided");
        }
        ClassLoader loader = PropertiesUtil.class.getClassLoader();
        try (InputStream inStream = loader.getResourceAsStream(this.filePath)) {
            // getResourceAsStream signals "not found" with null; report that as the declared
            // IOException instead of letting Properties.load fail with a NullPointerException.
            if (inStream == null) {
                throw new IOException("Properties resource not found on classpath: " + this.filePath);
            }
            Properties prop = new Properties();
            prop.load(inStream);
            return prop;
        }
    }
}

5、配置

# Loaded from the classpath by PropertiesUtil; DriveMain copies every key that starts
# with "mapred" into the job Configuration. The XXX values look like placeholders.
mapred.job.queue.name=root.XXX
mapred.jar=./XXX.jar
mapred.map.tasks=300
# NOTE(review): DriveMain calls job.setNumReduceTasks(1) after these properties are applied,
# so this value presumably has no effect - confirm.
mapred.reduce.tasks=100
#mapred.map.capacity=1
#mapred.reduce.capacity=1
mapred.job.priority=HIGH
mapred.job.name=XXX

Hadoop通過HCatalog編寫Mapreduce任務訪問hive庫中schema資料

1、driver package com.kangaroo.hadoop.drive; import java.util.Map; import java.util.Properties; import com.kangaroo.hadoop.mapper.AggregateMapper;

hadoop 偽分散式執行 mapreduce 任務時報 running beyond physical memory或者beyond virtual memory limits

當執行中出現Container is running beyond physical memory這個問題出現主要是因為實體記憶體不足導致的，在執行mapreduce的時候，每個map和reduce都有自己分配到記憶體的最大值，當map函式需要的記憶體大於這個值就

Hadoop 用Java編寫MapReduce詞頻統計程式並提交到Hadoop叢集執行

一、MapReduce介紹 MapReduce是一個分散式計算框架，可以部署在Hadoop、Spark等大資料平臺上，實現海量資料的平行計算。它採用“分而治之”的思想，將一個計算任務交給叢集中的多臺機器共同完成，之後再彙總成最終結

大資料-hive-記錄一次不用sqoop將mysql庫的資料匯入到hive庫中

技術標籤：大資料大資料hivemysql 一、場景描述公司需要的一些資料是自己收集的，收集的時候儲存在mysql中，現在需要將mysql中的儲存的資料匯入到hive庫中。

訪問其他程式中的資料

ContentResolver（）中的增刪改查方法不接收表名引數，而使用的是一個Uri引數，這個引數被稱為內容URl。內容URl給內容提供器中的資料建立了唯一識別符號，它主要由兩部分組成：authority和path。authority是用於對不

IDEA SpringBoot整合hadoop執行環境，，本地啟動專案，GET請求介面觸發遠端提交MapReduce任務至生產叢集報錯

報錯明細 IDEA SpringBoot整合hadoop執行環境，，本地啟動專案，GET請求介面觸發遠端提交MapReduce任務至生產叢集報錯：

hadoop 裡執行 MapReduce 任務的幾種常見方式

說明：測試檔案： echo -e \"aatbb tccnbbtcctdd\" > 3.txt hadoop fs -put 3.txt /tmp/3.txt 全文的例子均以該檔案做測試用例，統計單詞出現的次數（WordCount）。

Spring boot如何通過@Scheduled實現定時任務及多執行緒配置

這篇文章主要介紹了Spring boot如何通過@Scheduled實現定時任務及多執行緒配置,文中通過示例程式碼介紹的非常詳細，對大家的學習或者工作具有一定的參考學習價值,需要的朋友可以參考下

通過Python編寫一個簡單登入功能過程解析

需求：寫一個登入的程式， 1、最多登陸失敗3次 2、登入成功，提示歡迎xx登入，今天的日期是xxx，程式結束

nuxt配置通過指定IP和埠訪問的實現

非常簡單，只要加一個配置！第1步，複製這個程式碼（埠可改） \"config\": { \"nuxt\": {

主流開源SQL（on Hadoop）總結，不斷改進的Hive始終遙遙領先

本文涵蓋了6個開源領導者：Hive、Impala、Spark SQL、Drill、HAWQ 以及Presto，還加上Calcite、Kylin、Phoenix、Tajo 和Trafodion。以及2個商業化選擇Oracle Big Data SQL 和IBM Big SQL，IBM 尚未將後者更名為“Wat

Win10 1903系統怎麼通過檔案資源管理器訪問Linux檔案

在最新版的Win10系統中，完美相容了Linux系統，而且從Win10 1903系統開始，加入了一項新功能，就是可以直接在預設的檔案資源管理器中訪問Linux系統下的檔案，那麼要怎麼操作呢，現在為大家帶來Win10 1903系統通過檔案

Hadoop基礎（四十三）：Hive 安裝（二）

1 MySql 安裝 1.1安裝包準備 1．檢視 mysql 是否安裝，如果安裝了，解除安裝 mysql （1）檢視

在Hadoop環境下執行MapReduce自帶的wordCount示例

首先先建立一個目標檔案words，將其儲存至/home/hadoop目錄下： cd /home/hadoopvim words# 向新檔案中新增內容，例如：data mining on data warehouse

python 訪問hive

1、安裝 pip install impyla==0.14.1 pip install pure_sasl==0.5.1 pip install thriftpy==0.3.9 pip install thrift-sasl==0.2.1

為什麼vue中data中的資料可以通過this.x 直接訪問到？

先來一段vue程式碼 import Vue from \'vue\' new Vue({ el: \'#app\', data: { message: \'hello world\' },

MapReduce任務執行到running job卡住

改變hadoop裡的yarn-site.xml檔案舊版本 <configuration>

Outlook通過RPC/RPC Over HTTPS訪問Exchange郵箱

轉載自：http://yuelei.blog.51cto.com/202879/75398 Outlook通過RPC/RPC Over HTTPS訪問Exchange郵箱我們在前面的文章中已經介紹了Exchange郵箱的建立和配置，現在我們來看看如何訪問Exchange郵箱。訪問

java後端通過資料庫地址等資訊訪問第三方資料庫，以及JdbcTemplate的in語法引數解決辦法

一、通過資料庫資訊訪問第三方資料庫本例子是從一張儲存資料庫資訊表中查出第三方資料庫資訊，再進行訪問的

spark 訪問 hive，不能獲取到資料資訊

前言 HDP version： 3.x HIVE version： 3.x 問題描述使用hdp版本的叢集服務，當安裝好spark以後，執行spark.sql ,不能查詢到hive的資料庫，只查詢到default資料庫，說明spark沒有連線到hive。

Hadoop通過HCatalog編寫Mapreduce任務訪問hive庫中schema資料

相關推薦