1. 程式人生 > 實用技巧 > Spark讀取parquet檔案

Spark讀取parquet檔案

[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ bin/hdfs dfs -ls /input/dept
Found 3 items
-rw-r--r--   1 root supergroup          0 2020-08-27 20:44 /input/dept/_SUCCESS
-rw-r--r--   1 root supergroup        484 2020-08-27 20:44 /input/dept/part-00000-247a5279-306d-4cae-a85b-4d0196f39ebc-c000.snappy.parquet
-rw-r--r--   1 root supergroup        472 2020-08-27 20:44 /input/dept/part-00001-247a5279-306d-4cae-a85b-4d0196f39ebc-c000.snappy.parquet
           
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
           
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]
 
/*
 * 方法1
 */ 
scala> val p = Seq("/input/dept")
p: Seq[String] = List(/input/dept)
 
scala> val df = spark.read.load(p:_*)
df: org.apache.spark.sql.DataFrame = [value: string]
 
scala> df.show(false)
+-----------------------+
|value                  |
+-----------------------+
|10 ACCOUNTING  NEW YORK|
|20 RESERACH    DALLAS  |
|30 SALES   CHICAGO     |
|40 OPREARIONS  BOSTON  |
+-----------------------+
 
scala> df.printSchema
root
 |-- value: string (nullable = true)
 
/*
 * 方法2
 */  
scala> val df2 = spark.read.parquet("/input/dept")
df2: org.apache.spark.sql.DataFrame = [value: string]
 
scala> df2.show(false)
+-----------------------+
|value                  |
+-----------------------+
|10 ACCOUNTING  NEW YORK|
|20 RESERACH    DALLAS  |
|30 SALES   CHICAGO     |
|40 OPREARIONS  BOSTON  |
+-----------------------+
 
scala> df2.printSchema
root
 |-- value: string (nullable = true)