抓取百度新闻时出现乱码现象
原因:Go 的字符串默认按 UTF-8 处理,而抓取到的页面是 GBK 编码,Go 不会自动转换 UTF-8 以外的字符集
解决:将抓取到的字符串从 GBK 编码转换成 UTF-8
需要用到的库
Shell
go get github.com/axgle/mahonia
|
1
2
|
go get github.com/axgle/mahonia
|
创建函数
Go
# 常规方式
func ConvertToString(src string, srcCode string, tagCode string) string {
srcCoder := mahonia.NewDecoder(srcCode)
srcResult := srcCoder.ConvertString(src)
tagCoder := mahonia.NewDecoder(tagCode)
_, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
result := string(cdata)
return result
}
# **这种方式更简洁**
func decoderConvert(name string, body string) string {
return mahonia.NewDecoder(name).ConvertString(body)
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# 常规方式
func ConvertToString(src string, srcCode string, tagCode string) string {
srcCoder := mahonia.NewDecoder(srcCode)
srcResult := srcCoder.ConvertString(src)
tagCoder := mahonia.NewDecoder(tagCode)
_, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
result := string(cdata)
return result
}
# **这种方式更简洁**
func decoderConvert(name string, body string) string {
return mahonia.NewDecoder(name).ConvertString(body)
}
|
使用函数
Go
// main downloads the Baidu news page, converts the HTML from GBK to UTF-8,
// prints the converted text and then hands it to ExtractData.
func main() {
	const pageURL = "http://top.baidu.com/news?fr=topbuzz_b4_c2"

	// NOTE(review): the meaning of the "pc" argument is defined by GetHtml,
	// which is not shown here; presumably it selects a desktop client.
	raw := GetHtml(pageURL, "pc")

	// Printing raw at this point would show the mojibake; convert it first.
	decoded := ConvertToString(raw, "gbk", "utf-8")
	Println(decoded)

	ExtractData(decoded)
}
|
1
2
3
4
5
6
7
8
9
10
|
// main downloads the Baidu news page, converts the HTML from GBK to UTF-8,
// prints the converted text and then hands it to ExtractData.
func main() {
	const pageURL = "http://top.baidu.com/news?fr=topbuzz_b4_c2"

	// NOTE(review): the meaning of the "pc" argument is defined by GetHtml,
	// which is not shown here; presumably it selects a desktop client.
	raw := GetHtml(pageURL, "pc")

	// Printing raw at this point would show the mojibake; convert it first.
	decoded := ConvertToString(raw, "gbk", "utf-8")
	Println(decoded)

	ExtractData(decoded)
}
|
效果如下
-
zeropython 微信公众号
5868037 QQ号
[email protected] QQ邮箱