Spark2: Using SparkSession
Using SparkSession:
package com.jdjr.city.demo

import org.apache.spark.sql.SparkSession

/**
  * @Author: hongwei
  * @Date: 2018/11/9 16:31
  * @Description: SparkSession usage
  */
object Test4 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("ActionOperation")
      .master("local")
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    import spark.implicits._

    val employee = spark.read.json("D:\\jdWork\\code\\MySpark\\src\\main\\resources\\1.txt")

    // collect: bring all data of the distributed dataset (e.g. a Dataset) stored across the cluster back to the driver
    employee.collect().foreach { println(_) }

    // count: count the number of records in the Dataset
    println(employee.count())

    // first: get the first record of the Dataset
    println(employee.first())

    // foreach: iterate over every record of the Dataset and operate on it. Unlike collect, which fetches
    // the data to the driver and operates there, foreach pushes the computation out to the cluster and
    // runs it in a distributed way. Something like foreach(println(_)) is therefore useless when it
    // actually runs on a cluster: the output is produced on the distributed executors and we cannot see it.
    employee.foreach { println(_) }

    // reduce: fold all records of the Dataset into one, turning many rows into a single value
    // e.g. use reduce to count the number of records in the Dataset
    //println(employee.map(employee => 1).reduce(_ + _))

    // show: by default prints the first 20 rows of the Dataset
    employee.show()

    // take: fetch the specified number of records from the Dataset
    employee.take(3).foreach { println(_) }
  }
}
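The post does not show the contents of 1.txt. In Spark 2.x, spark.read.json expects JSON Lines by default (one complete JSON object per line), so a minimal input file for this demo might look like the lines below; the employee fields are made up purely for illustration:

{"name": "Tom", "age": 25, "depId": 1}
{"name": "Jack", "age": 30, "depId": 2}
{"name": "Marry", "age": 28, "depId": 1}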
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.jdjr.city</groupId>
    <artifactId>MySpark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>2.6.4</hadoop.version>
    </properties>

    <dependencies>
        <!--<dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>-->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.8</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <dependency>
            <groupId>org.ansj</groupId>
            <artifactId>ansj_seg</artifactId>
            <version>5.0.4</version>
        </dependency>
        <dependency>
            <groupId>com.geccocrawler</groupId>
            <artifactId>gecco</artifactId>
            <version>1.0.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <!-- not supported by Scala 2.11 -->
                                <!-- <arg>-make:transitive</arg> -->
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.jdjr.city.demo.SearchPoi2</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
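The original post stops at the pom.xml. A minimal build-and-run sketch for the demo above, under two assumptions not stated in the post (the jar name follows Maven's default artifactId-version.jar pattern, and --class is pointed at the Test4 object rather than the SearchPoi2 main class configured in the shade plugin), could look like this:

# build the shaded (fat) jar; maven-shade-plugin is bound to the package phase
mvn clean package

# run the demo locally via spark-submit, selecting the Test4 object as the main class
spark-submit --class com.jdjr.city.demo.Test4 --master local target/MySpark-1.0-SNAPSHOT.jar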