Spark讀取parquet檔案
阿新 • 發佈:2020-08-27
[root@centos00 ~]$ cd /opt/cdh5.14.2/hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ bin/hdfs dfs -ls /input/dept
Found 3 items
-rw-r--r--   1 root supergroup          0 2020-08-27 20:44 /input/dept/_SUCCESS
-rw-r--r--   1 root supergroup        484 2020-08-27 20:44 /input/dept/part-00000-247a5279-306d-4cae-a85b-4d0196f39ebc-c000.snappy.parquet
-rw-r--r--   1 root supergroup        472 2020-08-27 20:44 /input/dept/part-00001-247a5279-306d-4cae-a85b-4d0196f39ebc-c000.snappy.parquet

[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &

[root@centos00 hadoop-2.6.0-cdh5.14.2]$ cd ../spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ bin/spark-shell --master local[2]

/*
 * 方法1
 */
scala> val p = Seq("/input/dept")
p: Seq[String] = List(/input/dept)

scala> val df = spark.read.load(p:_*)
df: org.apache.spark.sql.DataFrame = [value: string]

scala> df.show(false)
+----------------------+
|value                 |
+----------------------+
|10 ACCOUNTING NEW YORK|
|20 RESERACH DALLAS    |
|30 SALES CHICAGO      |
|40 OPREARIONS BOSTON  |
+----------------------+

scala> df.printSchema
root
 |-- value: string (nullable = true)

/*
 * 方法2
 */
scala> val df2 = spark.read.parquet("/input/dept")
df2: org.apache.spark.sql.DataFrame = [value: string]

scala> df2.show(false)
+----------------------+
|value                 |
+----------------------+
|10 ACCOUNTING NEW YORK|
|20 RESERACH DALLAS    |
|30 SALES CHICAGO      |
|40 OPREARIONS BOSTON  |
+----------------------+

scala> df2.printSchema
root
 |-- value: string (nullable = true)