hive> desc mytest;
OK
num int
name string
Time taken: 0.087 seconds
hive> select * from mytest;
OK
NULL NULL
22 world
33 hive
hive> select * from mytest a join mytest b on a.num=b.num;
OK
22 world 22 world
33 hive 33 hive
1.hive测试数据位置本地:/home/hadoop/testdata/hive
命名规则:表名.txt
-- tb1: tab-delimited text table; rows come from a local file via LOAD DATA.
create table tb1(id double,name string) ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
stored as textfile;
本地直接写绝对路径就好,load data local inpath
-- Load the local file into tb1, replacing any existing data (OVERWRITE).
load data local inpath '/home/hadoop/testdata/hive/tb1.txt' overwrite into table tb1;
hive> select * from tb1;
OK
100.0 hadoop
101.0 hive
102.0 hbase
103.0 memcachedb
Time taken: 0.236 seconds
因为字段类型为double,所以将id显示为double型
创建表tb2,并将tb1的数据插入tb2
-- tb2 mirrors tb1's layout (same delimiter and storage format);
-- populate it by copying every row from tb1.
create table tb2 (id double,name string) ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
stored as textfile;
insert overwrite table tb2 select * from tb1;
select * from tb2;
100.0 hadoop
101.0 hive
102.0 hbase
103.0 memcachedb
修改表tb1的id字段信息,由double修改为int
-- Change tb1.id from double to int ("tbt1" was a typo; the table created
-- above is tb1). ALTER ... CHANGE only rewrites metadata, not the data file:
-- tb1's file already holds integer-looking values, so the SELECT below works.
alter table tb1 change id id int;
100 hadoop
101 hive
102 hbase
103 memcachedb
修改表tb2的id字段信息,由double修改为int
-- Same metadata-only change on tb2. Its data file was written by the INSERT
-- and stores "100.0" etc.; those no longer parse as int, so subsequent
-- SELECTs return NULL for id (as shown below).
alter table tb2 change id id int;
hive>select *from tb2;
OK
NULL hadoop
NULL hive
NULL hbase
NULL memcachedb
此时由于tb2表的底层数据发生了变化
hadoop@ubuntu:~/testdata/hive$hadoop fs -cat /user/hive/warehouse/tb1/tb1.txt
100 hadoop
101 hive
102 hbase
103 memcachedb
hadoop@ubuntu:~/testdata/hive$hadoop fs -cat /user/hive/warehouse/tb2/000000_0
100.0 hadoop
101.0 hive
102.0 hbase
103.0 memcachedb
虚拟列
INPUT__FILE__NAME表示输入文件的路径
BLOCK__OFFSET__INSIDE__FILE表示的记录在文件中的偏移量
select id,INPUT__FILE__NAME,BLOCK__OFFSET__INSIDE__FILE from tb1;
100 hdfs://localhost:9000/user/hive/warehouse/tb1/tb1.txt 0
101 hdfs://localhost:9000/user/hive/warehouse/tb1/tb1.txt 11
102 hdfs://localhost:9000/user/hive/warehouse/tb1/tb1.txt 20
103 hdfs://localhost:9000/user/hive/warehouse/tb1/tb1.txt 30
一些函数的用法保证tb1存在
select concat('a','\001','b') from tb1;
Array数组构建:
语法:array(val1, val2, …)说明:根据输入的参数构建数组array类型
-- Build an array<string> column with the array() constructor (CTAS).
create table udftest as select array("tom","mary","tim") as t from tb1;
hive>desc udftest;
OK
t array<string>
hive>select *from udftest;
OK
["tom","mary","tim"]
["tom","mary","tim"]
["tom","mary","tim"]
["tom","mary","tim"]
array类型访问:A[n]
select t[0],t[1],t[2] from udftest;
tom mary tim
tom mary tim
tom mary tim
tom mary tim
select t[0],t[4] from udftest; 下标不存在时直接返回 null
Map类型构建
-- Build a map<string,string> column with the map() constructor (CTAS).
-- NOTE: the table cannot be named "map" -- that is a reserved word.
create table maptest as select map('100','tom','200','mary') as t from tb1;
hive>desc maptest;
OK
t map<string,string>
Time taken: 0.106 seconds
hive>select *from maptest;
OK
{"100":"tom","200":"mary"}
{"100":"tom","200":"mary"}
{"100":"tom","200":"mary"}
{"100":"tom","200":"mary"}
map类型访问:M[key]
select t['200'],t['100'] from maptest;
mary tom
mary tom
mary tom
mary tom
select t['200'],t['300'] from maptest;key—300不存在,返回null
mary NULL
mary NULL
mary NULL
mary NULL
Struct类型构建
create table structtest as select struct('tom','mary','tim') as t from tb1;
hive>desc structtest;
OK
t struct<col1:string,col2:string,col3:st
hive>select *from structtest;
OK
{"col1":"tom","col2":"mary","col3":"tim"}
{"col1":"tom","col2":"mary","col3":"tim"}
{"col1":"tom","col2":"mary","col3":"tim"}
{"col1":"tom","col2":"mary","col3":"tim"}
struct类型访问:S.x
说明:返回结构体S中的x字段。比如,对于结构体 struct foobar {int foo, int bar},foobar.foo 返回结构体中的foo字段;访问不存在的字段(如 t.col4)会直接报错。
select t.col1,t.col3 from structtest test;
tom tim
tom tim
tom tim
tom tim
select t.col1,t.col2 from structtest test;
tom mary
tom mary
tom mary
tom mary
select t.col1,t.col2,t.col3 from structtest test;
tom mary tim
tom mary tim
tom mary tim
tom mary tim
分割字符串函数: split
select split('abtcdtef','t') from tb1;
["ab","cd","ef"]
["ab","cd","ef"]
["ab","cd","ef"]
["ab","cd","ef"]
json解析函数:get_json_object
-- get_json_object: extract $.owner from an inline JSON literal.
select get_json_object('{"store":{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.95,"color":"red"}},"email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.owner') from tb1;
amy
amy
amy
amy
-- get_json_object: extract $.email from the same JSON literal.
select get_json_object('{"store":{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.95,"color":"red"}},"email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.email') from tb1;
amy@only_for_json_udf_test.net
amy@only_for_json_udf_test.net
amy@only_for_json_udf_test.net
amy@only_for_json_udf_test.net
URL解析函数:parse_url
明:返回URL中指定的部分。partToExtract的有效值为:HOST,PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO.
select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1','HOST') from tb1;
facebook.com
facebook.com
facebook.com
facebook.com
select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1','USERINFO') from tb1;
分隔符处理‘\001’
‘\001′是hive中默认的分隔符,hive对该字符有些特殊的处理.
-- Concatenate with Hive's default field delimiter \001 (Ctrl-A).
select concat('a','\001','b') from tb1;
load data local inpath '/home/hadoop/testdata/hive/ml-100k/u.data' overwrite into table u_data;
python脚本weekday_mapper.py
import sys
import datetime


def map_line(line):
    """Map one tab-separated 'userid movieid rating unixtime' record to
    'userid movieid rating weekday' (ISO weekday: 1=Monday .. 7=Sunday).

    NOTE(review): fromtimestamp() uses the local timezone of the machine
    running the mapper -- confirm that matches how weekday was intended.
    """
    userid, movieid, rating, unixtime = line.strip().split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([userid, movieid, rating, str(weekday)])


if __name__ == '__main__':
    # Hive's TRANSFORM streams rows on stdin; emit one mapped row per line.
    for line in sys.stdin:
        print(map_line(line))
-- Target table for the transformed MovieLens data (weekday replaces unixtime).
create table u_data_new(userid int,movieid int,rating int,weekday int)
row format delimited fields terminated by '\t'
stored as textfile;
add file /home/hadoop/testdata/hive/weekday_mapper.py;
hive>INSERT OVERWRITE TABLE u_data_new
>SELECT
>TRANSFORM (userid, movieid, rating, unixtime)
>USING 'python weekday_mapper.py'
>AS (userid, movieid, rating, weekday)
>FROM u_data;
-- One-line equivalent of the multi-line TRANSFORM statement above.
insert overwrite table u_data_new
select transform(userid,movieid,rating,unixtime)
using 'python weekday_mapper.py'
as (userid,movieid,rating,weekday)
from u_data;
select weekday,count(*) from u_data_new group by weekday;