1、对原始数据进行预处理,格式为上面给出的预处理之后的示例数据。
通过观察原始数据形式,可以发现:每个字段之间使用“:”分割;一个视频可以有多个视频类别,类别之间使用“&”符号分割,且“&”两边各有空格字符;同时相关视频也可以有多个,多个相关视频之间同样使用“:”进行分割。为了方便后续分析数据,我们首先对数据进行重组清洗操作。
即:将每条数据的类别用“&”分割,同时去掉“&”两边的空格;多个“相关视频id”之间使用“,”进行分割。
map
reduce
configured
api
创建ori表
-- Text staging table: one record per line, ten ':'-separated fields in the
-- column order declared below.
-- A delimited table has a single collection delimiter shared by all array
-- columns. '&' matches the category separator described in the notes.
-- NOTE(review): relatedId is only split into elements if the preprocessing
-- output uses the same '&' separator for related ids - confirm against the file.
create table video_ori (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>
)
row format delimited
    fields terminated by ':'
    collection items terminated by '&'
stored as textfile;
创建orc表
-- ORC-backed table with the same ten columns as video_ori.
-- ORC is a self-describing binary format, so no text field or collection
-- delimiters are declared here.
create table video_orc (
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>
)
stored as orc;
导入语句
-- Append the local file /opt/part-r-00000 to the text table video_ori.
-- The path must be an ASCII-quoted string literal; typographic quotes are not parsed by Hive.
load data local inpath '/opt/part-r-00000' into table video_ori;
从原始表查询数据并插入对应表
-- Copy every row of video_ori into video_orc, column for column.
insert into table video_orc
select
    videoId,
    uploader,
    age,
    category,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from video_ori;
3.1 统计视频上传到平台的整天数大于500的视频,并按照video表的原格式(加载进去之前的格式,列的分隔符是“:”)保存到/opt/hiveData/video目录下
# Run the quoted query through the hive CLI and redirect its standard output
# to the local file /opt/hive/Data/Video/a. The query selects every column of
# video.video_orc for rows whose rate equals 5.
# NOTE(review): the task text directly above asks for rows uploaded more than
# 500 days ago, written with ':' as the column separator under
# /opt/hiveData/video; this command filters on rate and writes to a different
# path - confirm which requirement this command is meant to answer.
hive -e "select * from video.video_orc where rate=5 " > /opt/hive/Data/Video/a
创建外部表
-- External text table with tab-separated fields; every column is declared as string.
-- No location clause is given, so the table uses its default directory.
create external table rate (
    videoId   string,
    uploader  string,
    age       string,
    category  string,
    length    string,
    views     string,
    rate      string,
    ratings   string,
    comments  string,
    relatedId string
)
row format delimited
    fields terminated by '\t'
stored as textfile;
数据加载语句
-- Append the local file /opt/Video/xxx.txt to the table rate.
-- The path must be an ASCII-quoted string literal; typographic quotes are not parsed by Hive.
load data local inpath '/opt/Video/xxx.txt' into table rate;
创建映射表
-- Hive table stored in the HBase table "hbase_rate" through the HBase storage handler.
-- The column mapping is positional: videoId is mapped explicitly to the HBase
-- row key (:key), and each remaining column maps to the qualifier of the same
-- name in column family "cf". One mapping entry is required per Hive column.
create table video.hbase_rate (
    videoId   string,
    uploader  string,
    age       string,
    category  string,
    length    string,
    views     string,
    rate      string,
    ratings   string,
    comments  string,
    relatedId string
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties (
    'hbase.columns.mapping' = ':key,cf:uploader,cf:age,cf:category,cf:length,cf:views,cf:rate,cf:ratings,cf:comments,cf:relatedId'
)
tblproperties ('hbase.table.name' = 'hbase_rate');
请写出通过insert overwrite select插入hbase_rate表的语句
-- Write every row of rate into the HBase-backed table, using the
-- INSERT OVERWRITE ... SELECT form that the exercise statement asks for.
-- The target is qualified with its database, matching how it was created.
insert overwrite table video.hbase_rate
select
    videoId,
    uploader,
    age,
    category,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from rate;