
Hive: manipulating map data with a Python UDF, a detailed walkthrough

#1、Basic Hive operations
Show the detailed information of dw.full_h_usr_base_user; this tells you where its data files are stored:
desc formatted dw.full_h_usr_base_user;
List the data files:
dfs -ls hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user;
Remove the data of the external table full_h_usr_base_user:
dfs -rm -r hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user;
#2、Create an external table with a map-typed column
create external table dw.full_h_usr_base_user(
  user_id        string comment 'user id',
  reg_ip         string comment 'registration ip',
  reg_ip_geo_map map<string,string> comment 'city_id,city_name,isp,province_id,province_name,country_id,country_name,postzip,district,province'  -- how to declare a map column
)
comment 'user test table'
partitioned by (ds string comment 'current date, used as the partition field')
row format delimited
  fields terminated by '\t'
  collection items terminated by ','  -- map entries are separated by commas
  map keys terminated by ':'          -- keys and values are separated by colons
stored as TEXTFILE;                   -- stored as plain text

#3、Load the data (only user_id and reg_ip need to be supplied; reg_ip_geo_map will be computed by the UDF)
load data local inpath '/opt/data/dw.full_h_usr_base_user.del'
overwrite into table dw.full_h_usr_base_user partition(ds='2017-09-25');

#4、Custom function: the Python UDF ip_to_num.py
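Before walking through the full script, it helps to see the one non-obvious step it relies on: turning a dotted-quad IP string into an integer so it can be compared against the numeric ranges in the IP database. A minimal standalone sketch of that conversion (the sample address is made up):

import socket
import struct

def ip_to_int(ip):
    # inet_aton packs the dotted string into 4 bytes in network byte order;
    # unpack("I") reads them as a native unsigned int and ntohl fixes the byte order,
    # so the result is the same on little- and big-endian hosts.
    return socket.ntohl(struct.unpack("I", socket.inet_aton(ip))[0])

print(ip_to_int("1.2.3.4"))  # 16909060 == 1*256**3 + 2*256**2 + 3*256 + 4

The same expression appears inside the UDF below.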
#coding=utf-8
#Version:python3.5.2
#Tools:Pycharm
#Date:
__author__ = "Colby"
import socket
import struct
import sys

ipDB = '/opt/data/IP_utf-8.csv'

for line in sys.stdin:
    line = line.strip()
    user_id, reg_ip, reg_ip_geo_map, ds = line.split('\t')
    # Convert the dotted IP string into an integer so it can be compared
    # against the numeric ranges stored in the IP database.
    num_ip = int(socket.ntohl(struct.unpack("I", socket.inet_aton(str(reg_ip)))[0]))
    f = open(ipDB, 'r', encoding="utf-8")
    ipDict = {}
    count = 0
    for row in f:
        # Skip the leading lines of the IP database file (assumed header rows).
        if count < 9:
            count += 1
            continue
        fields = row.split(',')
        if int(fields[2]) <= num_ip <= int(fields[3]):
            ipDict['IP'] = reg_ip
            ipDict['nationality'] = fields[4]
            ipDict['province'] = fields[5]
            ipDict['city'] = fields[6]
            ipDict['Corporation'] = fields[8]
            # Serialize the dict as "k1:v1,k2:v2" so Hive's str_to_map(..., ',', ':')
            # can turn it back into a map<string,string>.
            reg_ip_geo_map = str(ipDict)[1:-1].replace('\'', '').replace(' ', '')
            print('\t'.join([user_id, reg_ip, reg_ip_geo_map, ds]))
            break  # stop at the first matching range
    f.close()
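Before registering the script in Hive, it can be exercised locally by piping a tab-separated row into it on stdin, which is essentially what TRANSFORM does. A minimal sketch, assuming the script has been copied to /opt/udf/ip_to_num.py and the IP database CSV exists at the path configured above; the user_id and IP in the sample row are made up:

import subprocess

# One fake input row in the table's text layout: user_id \t reg_ip \t reg_ip_geo_map \t ds
sample = "1000000015\t114.98.1.1\t\\N\t2017-09-25\n"

# Feed the row to the script on stdin, just as Hive's TRANSFORM clause would.
result = subprocess.run(
    ["python3", "/opt/udf/ip_to_num.py"],
    input=sample.encode("utf-8"),
    stdout=subprocess.PIPE,
)
print(result.stdout.decode("utf-8"))
# Expected shape: user_id \t reg_ip \t IP:...,nationality:...,province:...,city:...,Corporation:... \t ds

If the script prints nothing for a row, the IP simply did not fall into any range in the CSV; that row would be dropped by the TRANSFORM as well.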
#5、Upload the UDF file to the target directory on the server
/opt/udf/
/opt/udf/ip_to_num.py

#6、Enter the Hive CLI and add the file
add file /opt/udf/ip_to_num.py;
#Added resources: [/opt/udf/ip_to_num.py]

#7、Use the UDF and test it
SELECT
  TRANSFORM (user_id, reg_ip, reg_ip_geo_map, ds)
  USING 'python3 ip_to_num.py'
  AS (user_id, reg_ip, reg_ip_geo_map, ds)
FROM dw.full_h_usr_base_user;

#8、Process the data with the UDF and overwrite dw.full_h_usr_base_user; note the dynamic-partition setting
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dw.full_h_usr_base_user partition(ds)
select user_id
      ,reg_ip
      ,str_to_map(reg_ip_geo_map, ',', ':') as reg_ip_geo_map
      ,ds
from (
  SELECT
    TRANSFORM (user_id, reg_ip, reg_ip_geo_map, ds)
    USING 'python3 ip_to_num.py'
    AS (user_id, reg_ip, reg_ip_geo_map, ds)
  FROM dw.full_h_usr_base_user
) as a;

#9、Query the processed data and learn how to read map-typed columns
select user_id
      ,reg_ip_geo_map['province']
      ,reg_ip_geo_map['city']
      ,reg_ip_geo_map['nationality']
from dw.full_h_usr_base_user
where ds='2017-09-25' and user_id='1000000015';
Output:
OK
1000000015      安徽省  合肥市  中國
Time taken: 0.107 seconds, Fetched: 1 row(s)

#Dynamic partitioning: convert the string into a MAP
#set hive.exec.dynamic.partition.mode=nonstrict;
#insert into dw.full_h_usr_base_user partition(ds)
#select user_id
#, reg_ip
#, str_to_map(reg_ip_geo_map) reg_ip_geo_map
#, ds from dw.full_h_usr_base_user_tmp;
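To make the hand-off in step #8 concrete: the UDF emits the geo information as a plain string of the form key1:value1,key2:value2, and str_to_map(reg_ip_geo_map, ',', ':') turns that string back into a map<string,string>. A small Python sketch mimicking that parse, using a shortened example string with the values from the sample query above (this mirrors Hive's behaviour for illustration only, it is not Hive's implementation):

# A shortened example of the string shape produced by the UDF.
geo = "nationality:中國,province:安徽省,city:合肥市"

def str_to_map_like(s, entry_sep=",", kv_sep=":"):
    # Split the string into entries, then each entry into a key and a value,
    # matching what str_to_map(s, ',', ':') does in the query above.
    return dict(entry.split(kv_sep, 1) for entry in s.split(entry_sep))

m = str_to_map_like(geo)
print(m["province"], m["city"], m["nationality"])  # 安徽省 合肥市 中國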