hive基础语法 - 爱码网

1、内部表建表时指定表中字段分隔符
create table t_order(id int,name string…)
row format delimited
fields terminated by ',';

2、外部表的建立
create external table t_job(id int,name string…)
row format delimited
fields terminated by ','
location '/user/mytables';

3、分区表(注意这里的分区条件和字段无关)
/user/hive/warehouse/t_pv_log/day=2019-1-29
/day=2019-1-30
…
在一个表目录下还可以建立其他子目录用以分区，具体按照什么来分区要看实际场景
/user/hive/warehouse/t_consume_log/city=Beijing/
/city=Shanghai/
…
create table t_pv_log(ip string,url string,access_time string)
partitioned by(day string)
row format delimited
fields terminated by ',';
为了不必每次都手动把文件直接丢到hdfs的表目录中，可以在hive中使用以下语句导入本地文件：
load data local inpath '/root/hivetest/xxx.day.29' into table t_xxx_xxx partition(day='20190129'); -- day 是分区字段，不是表定义中的普通字段
load data local inpath '/root/hivetest/xxx/day.30' into table t_xxx_xxx partition(day='20190130');
不过在表中会多出一个分区字段（伪字段），例如：
hive基础语法
所以注意：分区字段不能是表定义中已存在的字段，可以用来作为查询条件
select count(1)
from t_pv_log
where url='http://sina.com/a' and day='20190129'; -- 这是查找某个分区中的数据
select count(1)
from t_pv_log
where url='http://sina.com/a'; -- 查询整个表中数据

4、CTAS建表
a、要建立与已有表相同的结构的一张新表可以使用以下语法
create table t_pv_log3 like t_pv_log;
新表的结构与t_pv_log完全相同：包括t_pv_log中的 n-1 个普通字段加上 1 个分区字段（伪字段），但新表中没有数据
b、要建立与已有表相同的结构的表并且新表中存在数据，且数据来自于旧表的一部分
create table t_pv_log4
as
select * from t_pv_log2 where ip>'192.168.33.2';
这里的字段包括和t_pv_log2中的所有字段，n个字段都是普通字段。
but：如果是这样
create table t_pv_log4
as select ip from t_pv_log2;
这样的话新表中只会有一个字段(ip字段)
5、将数据文件导入hive表中
a、手动用hdfs命令，将文件放入指定表目录下；
b、在hive的交互式shell中用hive命令来导入本地数据到表目录（复制）
hive>load data local inpath '/root/order.data.2' into table t_order [partition(…)]
c、用hive命令导入hdfs中的数据文件到表目录(移动)
hive>load data inpath '/access.log.2019-01-29.log' into table t_access partition(dt='20190129');

5、hive条件查询
基本查询示例：
select * from t_access;
select count(*) from t_access;
select max(ip) from t_access;
…

6、hive连接查询
先准备两个表
create table t_a(name string,numb int)
row format delimited
fields terminated by ',';

create table t_b(name string,nick string)
row format delimited
fields terminated by ',';

load data local inpath '/root/hivetest/a.txt' into table t_a;
load data local inpath '/root/hivetest/b.txt' into table t_b;

各类join
(1)、内连接/笛卡尔积(可以不要inner)
select a.*,b.*
from t_a as a inner join t_b as b;
hive基础语法

(2)、指定join条件(属于内连接,可以不要inner)
select a.*,b.*
from t_a as a inner join t_b as b on a.name=b.name;
hive基础语法

(3)、左外连接（左连接，可以不要outer）
select a.*,b.*
from
t_a as a left outer join t_b as b on a.name=b.name;
hive基础语法

(4)、右外连接（右连接,可以不要outer）
select a.*,b.*
from
t_a as a right outer join t_b as b on a.name=b.name;
hive基础语法
(5)、全外连接
select a.*,b.*
from
t_a as a full outer join t_b as b on a.name=b.name;

(6)、左半连接（只返回左表中能在右表找到匹配的行，且只能查询左表的字段）
这样会报错哦！（left semi join 的 select 中不能出现右表 b 的字段）
select a.*,b.*
from
t_a as a left semi join t_b as b on a.name=b.name;

select a.*
from
t_a as a left semi join t_b as b on a.name=b.name;
hive基础语法

7、hive分组聚合
(1)、针对每一行进行运算 --该表达式是对数据中的每一行进行逐行运算
select ip,upper(url),access_time
from
t_pv_log;
hive基础语法
(2)、求每条URL的访问总次数 --该表达式是对分好组的数据进行逐组运算
select url,count(1) as cnts
from
t_pv_log
group by url;

(3)、求每个URL的访问者中ip地址最大的
select url,max(ip)
from
t_pv_log
group by url;

(4)、求每个用户访问同一页面的所有记录中，时间最晚的一条
select ip,url,max(access_time)
from
t_pv_log
group by ip,url;
先按照ip分组，再在分组好的基础上再次按照url分组
hive基础语法