抓取百度新闻的时候出现了乱码现象。

Golang 爬虫 GBK 转 UTF-8

原因:Golang 的字符串默认按 UTF-8 处理,标准库不直接支持 GBK 等其他字符集,而该页面使用 GBK 编码。
解决:将字符串的编码转换成UTF-8

需要用到的库

Shell
go get github.com/axgle/mahonia
1
2
go get github.com/axgle/mahonia
 

创建函数

Go
# 常规方式 func ConvertToString(src string, srcCode string, tagCode string) string { srcCoder := mahonia.NewDecoder(srcCode) srcResult := srcCoder.ConvertString(src) tagCoder := mahonia.NewDecoder(tagCode) _, cdata, _ := tagCoder.Translate([]byte(srcResult), true) result := string(cdata) return result } # **这种方式更简洁** func decoderConvert(name string, body string) string { return mahonia.NewDecoder(name).ConvertString(body) }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 常规方式
func ConvertToString(src string, srcCode string, tagCode string) string {
    srcCoder := mahonia.NewDecoder(srcCode)
    srcResult := srcCoder.ConvertString(src)
    tagCoder := mahonia.NewDecoder(tagCode)
    _, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
    result := string(cdata)
    return result
}
# **这种方式更简洁**
func decoderConvert(name string, body string) string {
    return mahonia.NewDecoder(name).ConvertString(body)
}
 
 

使用函数

Go
func main() { url:= "http://top.baidu.com/news?fr=topbuzz_b4_c2" html := GetHtml(url,"pc") //Println(html) html = ConvertToString(html, "gbk", "utf-8") Println(html) ExtractData(html) //Println(html) }
1
2
3
4
5
6
7
8
9
10
func main() {
    url:= "http://top.baidu.com/news?fr=topbuzz_b4_c2"
    html := GetHtml(url,"pc")
    //Println(html)
    html  = ConvertToString(html, "gbk", "utf-8")
    Println(html)
    ExtractData(html)
    //Println(html)
}
 

效果如下

Golang 爬虫 GBK 转 UTF-8

Golang 爬虫 GBK 转 UTF-8


相关文章: