Hadoop Compression and Decompression
First, the code:
package com.huawei.hdfs.compress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;

import java.io.FileInputStream;
import java.io.FileOutputStream;

public class TestCompress {

    @Test
    public void deflateCompress() throws Exception {
        Class<DeflateCodec> codecClass = DeflateCodec.class;
        // Instantiate the codec reflectively, the same way Hadoop does internally
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, new Configuration());
        // Use the codec's own extension (.deflate); the original wrote .gz,
        // which actually belongs to GzipCodec
        FileOutputStream fos =
                new FileOutputStream("/Users/simmucheng/tmp/words" + codec.getDefaultExtension());
        // Wrap the file stream in a compression stream
        CompressionOutputStream zipOut = codec.createOutputStream(fos);
        IOUtils.copyBytes(new FileInputStream("/Users/simmucheng/tmp/words"), zipOut, 1024);
        zipOut.finish();
        zipOut.close();
    }

    public static void main(String[] args) throws Exception {
        // args[0]: source file to compress, args[1]: base path for the compressed outputs
        Class<?>[] zipClasses = {
                DeflateCodec.class,
                GzipCodec.class,
                BZip2Codec.class
        };
        for (Class<?> c : zipClasses) {
            manyCompress(c, args[0], args[1]);
        }
        // Decompress the files just written (the original passed args[0], the
        // uncompressed source, which would feed plain text into the codecs)
        for (Class<?> c : zipClasses) {
            manyDecompress(c, args[1]);
        }
    }

    // Compress src into dstBase plus the codec's default extension (.deflate/.gz/.bz2)
    public static void manyCompress(Class<?> codecClass, String src, String dstBase) throws Exception {
        CompressionCodec codec =
                (CompressionCodec) ReflectionUtils.newInstance(codecClass, new Configuration());
        FileOutputStream fos = new FileOutputStream(dstBase + codec.getDefaultExtension());
        CompressionOutputStream zipOut = codec.createOutputStream(fos);
        IOUtils.copyBytes(new FileInputStream(src), zipOut, 1024);
        zipOut.close();
        fos.close();
    }

    // Decompress srcBase plus the codec's extension; suffix each output with .txt
    // so the three codecs do not overwrite one another's result
    public static void manyDecompress(Class<?> codecClass, String srcBase) throws Exception {
        CompressionCodec codec =
                (CompressionCodec) ReflectionUtils.newInstance(codecClass, new Configuration());
        FileInputStream fis = new FileInputStream(srcBase + codec.getDefaultExtension());
        CompressionInputStream zipIn = codec.createInputStream(fis);
        IOUtils.copyBytes(zipIn,
                new FileOutputStream(srcBase + codec.getDefaultExtension() + ".txt"), 1024);
        zipIn.close();
        fis.close();
    }
}
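For completeness, decompression does not have to hard-code the codec class: Hadoop's CompressionCodecFactory can map a file's extension back to the codec that produced it. A minimal sketch of that approach (the .gz path is just an assumed example file):

package com.huawei.hdfs.compress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;

import java.io.FileInputStream;
import java.io.FileOutputStream;

public class TestCodecFactory {
    public static void main(String[] args) throws Exception {
        // Assumed example path; any .deflate/.gz/.bz2 file produced above would do
        String compressed = "/Users/simmucheng/tmp/words.gz";
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        // The factory resolves the codec from the file extension
        CompressionCodec codec = factory.getCodec(new Path(compressed));
        if (codec == null) {
            System.err.println("No codec found for " + compressed);
            return;
        }
        CompressionInputStream zipIn = codec.createInputStream(new FileInputStream(compressed));
        // Strip the codec's extension to recover the original file name
        String decompressed =
                CompressionCodecFactory.removeSuffix(compressed, codec.getDefaultExtension());
        IOUtils.copyBytes(zipIn, new FileOutputStream(decompressed + ".copy"), 1024);
        zipIn.close();
    }
}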
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.huawei</groupId>
    <artifactId>hdfs</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-antrun-plugin</artifactId>
                <version>1.7</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>run</goal>
                        </goals>
                        <configuration>
                            <tasks>
                                <echo>----- copy the jar to the target directory -----</echo>
                                <copy file="/Users/simmucheng/IDEA/idea-hadoop-4/hdfs/target/hdfs-1.0-SNAPSHOT.jar"
                                      toDir="/Users/simmucheng/tmp"/>
                            </tasks>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-common</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-client</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.anarres.lzo</groupId>
            <artifactId>lzo-hadoop</artifactId>
            <version>1.0.0</version>
        </dependency>
    </dependencies>
</project>
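Note that the pom already pulls in org.anarres.lzo:lzo-hadoop, whose codecs implement Hadoop's CompressionCodec interface, so LZO compression can reuse exactly the same reflection pattern. A minimal sketch, assuming lzo-hadoop 1.0.0 exposes the class org.anarres.lzo.hadoop.codec.LzopCodec (verify the class name and default extension against the jar you actually have):

package com.huawei.hdfs.compress;

import org.anarres.lzo.hadoop.codec.LzopCodec; // assumed class name from lzo-hadoop 1.0.0
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.FileInputStream;
import java.io.FileOutputStream;

public class TestLzoCompress {
    public static void main(String[] args) throws Exception {
        // Same reflection pattern as TestCompress; only the codec class differs
        CompressionCodec codec =
                ReflectionUtils.newInstance(LzopCodec.class, new Configuration());
        // getDefaultExtension() is expected to yield .lzo for the lzop format
        FileOutputStream fos = new FileOutputStream(args[1] + codec.getDefaultExtension());
        CompressionOutputStream zipOut = codec.createOutputStream(fos);
        IOUtils.copyBytes(new FileInputStream(args[0]), zipOut, 1024);
        zipOut.close();
        fos.close();
    }
}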
This project can run on a Hadoop cluster, but Hadoop itself ships without any codec for the LZO algorithm, so the related jars have to be placed on the cluster. Since this is a Maven project, those jars sit in the local Maven repository used by IntelliJ IDEA; they can be collected by running the following in the macOS terminal:
mvn -DoutputDirectory=/Users/simmucheng/hadoop_tmp_jar -DgroupId=com.huawei -DartifactId=hdfs -Dversion=1.0-SNAPSHOT dependency:copy-dependencies
This copies the Maven-downloaded dependency jars into the given directory, from where they can be gathered and moved onto the Hadoop cluster.
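From there, one way to launch the program on the cluster (the jar location matches the antrun copy step above; the input/output paths and the use of HADOOP_CLASSPATH to expose the LZO jars are assumptions, not taken from the original):

export HADOOP_CLASSPATH=/Users/simmucheng/hadoop_tmp_jar/*
hadoop jar /Users/simmucheng/tmp/hdfs-1.0-SNAPSHOT.jar com.huawei.hdfs.compress.TestCompress /Users/simmucheng/tmp/words /Users/simmucheng/tmp/words-out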