lj-C

**1、组织数据 (需要处理每条数据开头和结尾的中括号)
(1)创建Hive表weibo_json(json string),表只有一个字段,导入所有数据,并验证查询前5条数据 **

    create table weibo_json(json string);
    load data local inpath \'/root/weibo.txt\' into table weibo_json;
    select * from weibo_json limit 5;

(2)解析完weibo_json当中的json格式数据到拥有19个字段的weibo表中,写出必要的SQL语句

    create table weibo as 
    select json_tuple(json,
    \'beCommentWeiboId\'
    ,\'beForwardWeiboId\'
    ,\'catchTime\'
    ,\'commentCount\'
    ,\'content\'
    ,\'createTime\'
    ,\'info1\'
    ,\'info2\'
    ,\'info3\'
    ,\'mlevel\'
    ,\'musicurl\'
    ,\'pic_list\'
    ,\'praiseCount\'
    ,\'reportCount\'
    ,\'source\'
    ,\'userId\'
    ,\'videourl\'
    ,\'weiboId\'
    ,\'weiboUrl\'
    ) as 
    (beCommentWeiboId
    ,beForwardWeiboId
    ,catchTime
    ,commentCount
    ,content
    ,createTime
    ,info1
    ,info2
    ,info3
    ,mlevel
    ,musicurl
    ,pic_list
    ,praiseCount
    ,reportCount
    ,source
    ,userId
    ,videourl
    ,weiboId
    ,weiboUrl
    )
    from weibo_json;

2、统计微博总量 和 独立用户数

    select count(*) from weibo;
    +----------+--+
    |   _c0    |
    +----------+--+
    | 1451868  |
    +----------+--+
    select count(*) as Independent from
    (select count(*) from weibo group by userId) tmp;
    +--------------+--+
    | independent  |
    +--------------+--+
    | 78540        |
    +--------------+--+

3、统计用户所有微博被转发的次数之和,输出top5用户,并给出次数

    #select source from weibo limit 10;
    reportCount
    select userId,sum(reportCount) as total from weibo group by userId order by total desc limit 5;
    +-------------+--------------+--+
    |   userid    |    total     |
    +-------------+--------------+--+
    | 1793285524  | 7.6454805E7  |
    | 1629810574  | 7.3656898E7  |
    | 2803301701  | 6.8176008E7  |
    | 1266286555  | 5.5111054E7  |
    | 1191258123  | 5.4808042E7  |
    +-------------+--------------+--+

4、统计带图片的微博数

    select count(*) as total from weibo where pic_list=\'[]\';
    +---------+--+
    |  total  |
    +---------+--+
    | 701356  |
    +---------+--+

5、统计使用iphone发微博的独立用户数

    select count(*) as total from
    (select count(*) as total from weibo where source=\'iPhone客户端\' group by userId) tmp;
    +------+--+
    | total  |
    +------+--+
    | 921  |
    +------+--+

6、将用户所有微博的点赞人数和转发人数相加求和,并将相加之和降序排列,取前10条记录,输出userid和总次数

    select userId,(sum(praiseCount)+sum(reportCount)) as total from weibo group by userId order by total desc limit 10;
    
    +-------------+---------------+--+
    |   userid    |     total     |
    +-------------+---------------+--+
    | 1793285524  | 1.14941096E8  |
    | 1629810574  | 9.761207E7    |
    | 1266286555  | 8.3789422E7   |
    | 2803301701  | 7.4208822E7   |
    | 1195242865  | 6.9292231E7   |
    | 1191258123  | 6.1985742E7   |
    | 1197161814  | 5.9093308E7   |
    | 2656274875  | 5.2380775E7   |
    | 2202387347  | 5.1623117E7   |
    | 1195230310  | 4.8321083E7   |
    +-------------+---------------+--+

7、统计微博中评论次数小于1000的用户ID与数据来源信息,将其放入视图,然后统计视图中数据来源是”ipad客户端”的用户数目

    #创建表接收微博中评论次数小于1000的用户ID与数据来源信息
    create table small1000 as
    select userId,source from weibo where commentCount<1000 group by userId,source;
    #查询数据来源是”ipad客户端”的用户数目
    select count(*) as total from small1000 where source=\'iPad客户端\';
    +--------+--+
    | total  |
    +--------+--+
    | 537    |
    +--------+--+

**8、统计微博内容中出现”iphone”次数最多的用户,
最终结果输出用户id和次数(注意:该次数是”iphone”的出现次数,
不是出现”iphone”的微博数目) **

    select userId,count(userid) from weibo where content like \'%iphone%\' group by userId;

9、求每天发微博次数最多的那个家伙的ID和发微博的条数

    #时间函数,原表存储的数据的时间是字符串的时间戳,所以需要将其转成长整型bigint
    from_unixtime(cast(createTime as bigint)
    #首先获取数据的userId,以及时间并转换为"yyyy-MM-dd"格式
    select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo;
    #然后按照时间,用户进行分组查询,获取每日用户发布微博数量,这里定义该查询结果为a
    select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time;
    #从a中按照获取的时间再进行分组查询,获取当日发布微博最大值,这里将查询结果定义为b
    select tmp1.Release_time as Release_time,max(tmp1.total) as big from
    (select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time) tmp1
    group by tmp1.Release_time;
    #最后b与a进行内连接,以获取导userId
    select a.Release_time as Release_time,b.userId as userId,a.big as total from
    (select tmp1.Release_time as Release_time,max(tmp1.total) as big from
    (select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time) tmp1
    group by tmp1.Release_time) a
    join 
    (select tmp.Release_time as Release_time,
    tmp.userId as userId,
    count(tmp.userId) as total
    from
    (select userId,from_unixtime(cast(createTime as bigint),"yyyy-MM-dd") as Release_time from weibo) tmp
    group by tmp.userId,tmp.Release_time) b
    on a.Release_time=b.Release_time and a.big=b.total;

在这里插入图片描述

分类:

技术点:

相关文章: