一、 通过spark实现点击流日志分析案例
1. 访问的pv
package cn.bw

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object PV {
def main(args: Array[String]): Unit = {
//todo:创建sparkconf,设置appName
//todo:setMaster(“local[2]”)在本地模拟spark运行 这里的数字表示 使用2个线程
val sparkConf: SparkConf = new SparkConf().setAppName(“PV”).setMaster(“local[2]”)
//todo:创建SparkContext
val sc: SparkContext = new SparkContext(sparkConf)
//todo:读取数据
val file: RDD[String] = sc.textFile(“d:\data\access.log”)
//todo:将一行数据作为输入,输出(“pv”,1)
val pvAndOne: RDD[(String, Int)] = file.map(x=>(“pv”,1))
//todo:聚合输出
val totalPV: RDD[(String, Int)] = pvAndOne.reduceByKey(+)
totalPV.foreach(println)
sc.stop()
}
}

2. 访问的uv
package cn.bw

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object UV {
def main(args: Array[String]): Unit = {
//todo:构建SparkConf和 SparkContext
val sparkConf: SparkConf = new SparkConf().setAppName(“UV”).setMaster(“local[2]”)
val sc: SparkContext = new SparkContext(sparkConf)
//todo:读取数据
val file: RDD[String] = sc.textFile(“d:\data\access.log”)
//todo:对每一行分隔,获取IP地址
val ips: RDD[(String)] = file.map(.split(" ")).map(x=>x(0))
//todo:对ip地址进行去重,最后输出格式 (“UV”,1)
val uvAndOne: RDD[(String, Int)] = ips.distinct().map(x=>(“UV”,1))
//todo:聚合输出
val totalUV: RDD[(String, Int)] = uvAndOne.reduceByKey(
+_)
totalUV.foreach(println)
//todo:数据结果保存
totalUV.saveAsTextFile(“d:\data\out”)
sc.stop()
}
}

3. 访问的topN
package cn.bw

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**

  • 求访问的topN
    */
    object TopN {
    def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName(“TopN”).setMaster(“local[2]”)
    val sc: SparkContext = new SparkContext(sparkConf)
    sc.setLogLevel(“WARN”)
    //读取数据
    val file: RDD[String] = sc.textFile(“d:\data\access.log”)
    //将一行数据作为输入,输出(来源URL,1)
    val refUrlAndOne: RDD[(String, Int)] = file.map(.split(" ")).filter(.length>10).map(x=>(x(10),1))
    //聚合 排序–>降序
    val result: RDD[(String, Int)] = refUrlAndOne.reduceByKey(+).sortBy(_._2,false)
    //通过take取topN,这里是取前5名
    val finalResult: Array[(String, Int)] = result.take(5)
    println(finalResult.toBuffer)

sc.stop()

}

}

二、 通过Spark实现ip地址查询
1. 需求分析
在互联网中,我们经常会见到城市热点图这样的报表数据,例如在百度统计中,会统计今年的热门旅游城市、热门报考学校等,会将这样的信息显示在热点图中。
PvUv具体操作
因此,我们需要通过日志信息(运行商或者网站自己生成)和城市ip段信息来判断用户的ip段,统计热点经纬度。
2. 技术调研
因为我们的需求是完成一张报表信息,所以对程序的实时性没有要求,所以可以选择内存计算spark来实现上述功能。
3. 架构设计
搭建spark集群
4. 开发流程
4.1. 数据准备
4.2. ip日志信息
在ip日志信息中,我们只需要关心ip这一个维度就可以了,其他的不做介绍
PvUv具体操作
4.3. 城市ip段信息
PvUv具体操作
5. 代码开发
5.1. 思路
1、 加载城市ip段信息,获取ip起始数字和结束数字,经度,维度
2、 加载日志数据,获取ip信息,然后转换为数字,和ip段比较
3、 比较的时候采用二分法查找,找到对应的经度和维度
4、 然后对经度和维度做单词计数
5.2. 代码
package cn.bw.IPlocaltion

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**

  • ip地址查询
    */
    object IPLocaltion_Test {

def main(args: Array[String]): Unit = {
//todo:创建sparkconf 设置参数
val sparkConf: SparkConf = new SparkConf().setAppName(“IPLocaltion_Test”).setMaster(“local[2]”)

//todo:创建SparkContext
val sc = new SparkContext(sparkConf)

//todo:读取基站数据
val data: RDD[String] = sc.textFile(“d:\data\ip.txt”)

//todo:对基站数据进行切分 ,获取需要的字段 (ipStart,ipEnd,城市位置,经度,纬度)
val jizhanRDD: RDD[(String, String, String, String, String)] = data.map(_.split("\|")).map(
x => (x(2), x(3), x(4) + “-” + x(5) + “-” + x(6) + “-” + x(7) + “-” + x(8), x(13), x(14)))

//todo:获取RDD的数据
val jizhanData: Array[(String, String, String, String, String)] = jizhanRDD.collect()

//todo:广播变量,一个只读的数据区,所有的task都能读到的地方
val jizhanBroadcast: Broadcast[Array[(String, String, String, String, String)]] = sc.broadcast(jizhanData)

//todo:读取目标数据
val destData: RDD[String] = sc.textFile(“d:\data\20090121000132.394251.http.format”)

//todo:获取数据中的ip地址字段
val ipData: RDD[String] = destData.map(_.split("\|")).map(x=>x(1))

//todo:把IP地址转化为long类型,然后通过二分法去基站数据中查找,找到的维度做wordCount

val result=ipData.mapPartitions(iter=>{
//获取广播变量中的值
val valueArr: Array[(String, String, String, String, String)] = jizhanBroadcast.value

//todo:操作分区中的itertator
iter.map(ip=>{
//将ip转化为数字long
val ipNum:Long=ipToLong(ip)

//拿这个数字long去基站数据中通过二分法查找,返回ip在valueArr中的下标
val index:Int=binarySearch(ipNum,valueArr)

//根据下标获取对一个的经纬度
val tuple = valueArr(index)
//返回结果 ((经度,维度),1)
((tuple._4,tuple._5),1)

})

})

//todo:分组聚合
val resultFinal: RDD[((String, String), Int)] = result.reduceByKey(+)

//todo:打印输出
resultFinal.foreach(println)

//todo:将结果保存到mysql表中

resultFinal.map(x=>(x._1._1,x._1._2,x._2)).foreachPartition(data2Mysql)
sc.stop()

}

//todo:ip转为long类型
def ipToLong(ip: String): Long = {
//todo:切分ip地址。
val ipArray: Array[String] = ip.split("\.")
var ipNum=0L

for(i <- ipArray){
ipNum=i.toLong | ipNum << 8L
}
ipNum
}

//todo:通过二分查找法,获取ip在广播变量中的下标
def binarySearch(ipNum: Long, valueArr: Array[(String, String, String, String, String)]): Int ={

//开始下标
var start=0
//结束下标
var end=valueArr.length-1

while(start<=end){
val middle=(start+end)/2
if(ipNum>=valueArr(middle)._1.toLong && ipNum<=valueArr(middle)._2.toLong){
return middle
}

if(ipNum > valueArr(middle)._2.toLong){
start=middle
}

if(ipNum<valueArr(middle)._1.toLong){
end=middle
}
}

-1
}

//todo:数据保存到mysql表中
def data2Mysql(iterator:Iterator[(String,String, Int)]):Unit = {
//todo:创建数据库连接Connection
var conn:Connection=null
//todo:创建PreparedStatement对象
var ps:PreparedStatement=null
//todo:采用拼占位符问号的方式写sql语句。
var sql=“insert into iplocaltion(longitude,latitude,total_count) values(?,?,?)”
//todo:获取数据连接
conn=DriverManager.getConnection(“jdbc:mysql://bw01:3306/spark”,“root”,“root123”)

//todo: 选中想被try/catch包围的语句 ctrl+alt+t 快捷键选中try/catch/finally
try {
iterator.foreach(line=> {

//todo:预编译sql语句
ps = conn.prepareStatement(sql)

//todo:对占位符设置值,占位符顺序从1开始,第一个参数是占位符的位置,第二个参数是占位符的值。
ps.setString(1, line._1)
ps.setString(2, line._2)
ps.setLong(3, line._3)
//todo:执行
ps.execute()
})
} catch {
case e:Exception =>println(e)
} finally {
if(ps!=null){
ps.close()
}
if (conn!=null){
conn.close()
}
}

}

}

数据

ip.txt

1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.0.8.0|1.0.15.255|16779264|16781311|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.0.32.0|1.0.63.255|16785408|16793599|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.1.0.0|1.1.0.255|16842752|16843007|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.1.2.0|1.1.7.255|16843264|16844799|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.1.8.0|1.1.63.255|16844800|16859135|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.2.0.0|1.2.1.255|16908288|16908799|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302

ip.format

20090121000132095572000|125.213.100.123|show.51.com|/shoplist.php?phpfile=shoplist2.php&style=1&sex=137|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Mozilla/4.0(Compatible Mozilla/4.0(Compatible-EmbeddedWB 14.59 http://bsalsa.com/ EmbeddedWB- 14.59 from: http://bsalsa.com/ )|http://show.51.com/main.php|
20090121000132124542000|117.101.215.133|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/index.php?uidhash=d1c3b69e9b8355a5204474c749fb76ef|__tkist=0; myloc=50%7C5008; myage=2009; PROFILE=14469674%3A%E8%8B%A6%E6%B6%A9%E5%92%96%E5%95%A1%3Am%3Aphotos2.love21cn.com%2F45%2F1b%2F388111afac8195cc5d91ea286cdd%3A1%3A%3Ahttp%3A%2F%2Fimages.love21cn.com%2Fw4%2Fglobal%2Fi%2Fhykj_m.jpg; last_login_time=1232454068; SESSION_HASH=8176b100a84c9a095315f916d7fcbcf10021e3af; RAW_HASH=008a1bc48ff9ebafa3d5b4815edd04e9e7978050; COMMON_HASH=45388111afac8195cc5d91ea286cdd1b; pop_1232093956=1232468896968; pop_time=1232466715734; pop_1232245908=1232469069390; pop_1219903726=1232477601937; LOVESESSID=98b54794575bf547ea4b55e07efa2e9e; main_search:14469674=%7C%7C%7C00; registeruid=14469674; REG_URL_COOKIE=http%3A%2F%2Fphoto.jiayuan.com%2Fshowphoto.php%3Fuid_hash%3D0319bc5e33ba35755c30a9d88aaf46dc%26total%3D6%26p%3D5; click_count=0%2C3363619
20090121000132406516000|117.101.222.68|gg.xiaonei.com|/view.jsp?p=389|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; CIBA)|http://home.xiaonei.com/Home.do?id=229670724|r01=1; __utma=204579609.31669176.1231940225.1232462740.1232467011.145; __utmz=204579609.1231940225.1.1.utmccn=(direct)
20090121000132581311000|115.120.36.118|tj.tt98.com|/tj.htm|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://www.tt98.com/|
20090121000132864647000|123.197.64.247|cul.sohu.com|/20071227/n254338813_22.shtml|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://cul.sohu.com/20071227/n254338813_22.shtml|ArticleTab=visit:1; IPLOC=unknown; SUV=0901080709152121; vjuids=832dd37a1.11ebbc5d590.0.b20f858f14e918; club_chat_ircnick=JaabvxC4aaacQ; spanel=%7B%22u%22%3A%22%22%7D; vjlast=1232467312,1232467312,30
20090121000133296729000|222.55.57.176|down.chinaz.com|/|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; iCafeMedia; TencentTraveler 4.0)||cnzz_a33219=0; vw33219=%3A18167791%3A; sin33219=http%3A//www.itxls.com/wz/wyfx/it.html; rtime=0; ltime=1232464387281; cnzz_eid=6264952-1232464379-http%3A//www.itxls.com/wz/wyfx/it.html
20090121000133331104000|123.197.66.93|www.pkwutai.cn|/down/downLoad-id-45383.html|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7)|http://www.baidu.com/s?tn=b1ank_pg&ie=gb2312&bs=%C3%C0%C6%BC%B7%FE%D7%B0%B9%DC%C0%ED%C8%ED%BC%FE&sr=&z=&cl=3&f=8&wd=%C6%C6%BD%E2%C3%C0%C6%BC%B7%FE%D7%B0%B9%DC%C0%ED%C8%ED%BC%FE&ct=0|
20090121000133446262000|115.120.12.157|v.ifeng.com|/live/|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; CIBA)|http://www.ifeng.com/|userid=1232466610953_4339; location=186; sclocationid=10002; vjuids=22644b162.11ef4bc1624.0.63ad06717b426; vjlast=1232466614,1232467297,13

access.log

194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] “GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1” 304 0 “-” “Mozilla/4.0 (compatible;)”
183.49.46.228 - - [18/Sep/2013:06:49:23 +0000] “-” 400 0 “-” “-”
163.177.71.12 - - [18/Sep/2013:06:49:33 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0”
163.177.71.12 - - [18/Sep/2013:06:49:36 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0”
101.226.68.137 - - [18/Sep/2013:06:49:42 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0”
101.226.68.137 - - [18/Sep/2013:06:49:45 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0”
60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] “GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0” 200 185524 “http://cos.name/category/software/packages/” “Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36”
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] “GET /images/my.jpg HTTP/1.1” 200 19939 “http://www.angularjs.cn/A00n” “Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36”
222.68.172.190 - - [18/Sep/2013:06:50:08 +0000] “-” 400 0 “-” “-”
183.195.232.138 - - [18/Sep/2013:06:50:16 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0”
183.195.232.138 - - [18/Sep/2013:06:50:16 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0”

相关文章:

  • 2021-07-23
  • 2021-08-05
  • 2021-09-27
  • 2021-06-28
  • 2021-09-18
  • 2021-12-12
  • 2021-07-02
猜你喜欢
  • 2021-07-06
  • 2021-07-23
  • 2021-07-26
  • 2021-05-16
  • 2021-08-07
  • 2021-11-10
  • 2021-07-02
相关资源
相似解决方案