**python3爬取华为应用市场APP安装包实战**
一、用浏览器访问华为应用市场,搜索APP并观察URL规律
比如我们要下载微信,那么在搜索框输入微信并回车,会发现url变成http://app.hicloud.com/search/%E5%BE%AE%E4%BF%A1,再搜索微博,发现又会跳转至http://app.hicloud.com/search/%E5%BE%AE%E5%8D%9A,那么我们就可以知道前面的http://app.hicloud.com/search/网址不会变,后面的才会变化,后面的这串字符其实就是微信或者微博的中文字符的urlencode编码,所以,我们在代码中只需要将所要搜索的APP中文名进行urlencode编码后再与前面不变的进行拼接就可以了。
def __init__(self):
'''
设置请求头,要下载的APP存放在列表中
'''
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}
self.url = 'http://app.hicloud.com/search/'
self.AppList = ['微信', 'QQ', '微博']
def searchApp(self):
'''
将要下的APP名字转化成ascii码,并拼接在查找网址后面,组成所需APP下载界面
:return:
'''
SearchLink_list = []
for AppName in self.AppList:
SearchLink = self.url + quote(AppName)
SearchLink_list.append(SearchLink)
self.loadPage(SearchLink_list)
二、获取下载链接并进行下载
拿微信举例,我们要在搜索页面利用开发者工具找到下载链接,如下图所示的就是下载链接存放标签:
我们只需要获取下图的内容就可以得到下载链接了,利用Xpathhelper谷歌插件写出对应的Xpath:“//*/div[1]/div[4]/div[1]/div/div/div[2]/div[2]/div[2]/a/@onclick”
对应代码如下:
def loadPage(self, UrlList):
    """Fetch each search page and extract the APK download info by XPath.

    The app metadata sits inside an onclick="fn(...)" attribute; the regex
    pulls out the argument list, which is then parsed as a literal tuple.

    :param UrlList: list of search-result page URLs to visit.
    :return: None; delegates each hit to self.downLoad.
    """
    for url in UrlList:
        req = urllib.request.Request(url, headers=self.headers)
        html = urllib.request.urlopen(req).read().decode('utf-8')
        content = etree.HTML(html)
        down = content.xpath('//*/div[1]/div[4]/div[1]/div/div/div[2]/div[2]/div[2]/a/@onclick')
        if not down:
            # Page layout changed or app not found: skip instead of IndexError.
            continue
        # Grab everything between the parentheses of the onclick call.
        p = re.compile(r'[(](.*?)[)]', re.S)
        matches = re.findall(p, down[0])
        if not matches:
            continue
        # ast.literal_eval only parses literals; the original eval() would
        # execute arbitrary code coming from an untrusted web page.
        appInfo = tuple(ast.literal_eval(matches[0]))
        print("正在爬取%s" % appInfo[1])
        # appInfo[1] is the app name, appInfo[5] the download link.
        self.downLoad(appInfo[1], appInfo[5])
三、下载APP
获取下载链接并访问,将获得的内容以二进制格式存入文件即可。
def downLoad(self, name, link):
    """Download the APK at *link* and save it as package/<name>.apk.

    Fixes the original snippet, which used curly quotes (a SyntaxError in
    Python) and assumed the 'package' directory already existed.

    :param name: app display name, used as the file name.
    :param link: direct APK download URL.
    """
    import os  # local import: the snippet shows no module import section
    os.makedirs('package', exist_ok=True)  # avoid FileNotFoundError on open()
    PackageName = 'package/' + name + '.apk'
    App = requests.get(link)
    AppInfo = App.content  # raw binary payload of the APK
    with open(PackageName, 'wb') as f:
        f.write(AppInfo)
    print('%s爬取完成' % name)
运行效果如下:
四、源码
import ast
import os
import re
import urllib.request
from urllib.parse import quote

import requests
from lxml import etree
class Spider:
    """Crawl Huawei AppGallery search pages and download APK packages."""

    def __init__(self):
        """Prepare the request headers, base search URL and target app list."""
        # Desktop-Chrome User-Agent so the site serves its normal web page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
        }
        # Fixed search prefix; the percent-encoded app name is appended to it.
        self.url = 'http://app.hicloud.com/search/'
        # Apps to fetch (Chinese names are percent-encoded by searchApp).
        self.AppList = ['微信', 'QQ', '微博']

    def searchApp(self):
        """Build one search-page URL per app and hand the list to loadPage.

        Each app name is percent-encoded with urllib.parse.quote and
        appended to the fixed search prefix.
        """
        SearchLink_list = [self.url + quote(AppName) for AppName in self.AppList]
        self.loadPage(SearchLink_list)

    def loadPage(self, UrlList):
        """Fetch each search page and extract the APK download info by XPath.

        :param UrlList: list of search-result page URLs to visit.
        :return: None; delegates each hit to self.downLoad.
        """
        for url in UrlList:
            req = urllib.request.Request(url, headers=self.headers)
            html = urllib.request.urlopen(req).read().decode('utf-8')
            content = etree.HTML(html)
            # The app metadata sits inside an onclick="fn(...)" attribute.
            down = content.xpath('//*/div[1]/div[4]/div[1]/div/div/div[2]/div[2]/div[2]/a/@onclick')
            if not down:
                # Page layout changed or app not found: skip instead of IndexError.
                continue
            # Grab everything between the parentheses of the onclick call.
            p = re.compile(r'[(](.*?)[)]', re.S)
            matches = re.findall(p, down[0])
            if not matches:
                continue
            # ast.literal_eval only parses literals; eval() here would execute
            # arbitrary code coming from an untrusted web page.
            appInfo = tuple(ast.literal_eval(matches[0]))
            print("正在爬取%s" % appInfo[1])
            # appInfo[1] is the app name, appInfo[5] the download link.
            self.downLoad(appInfo[1], appInfo[5])

    def downLoad(self, name, link):
        """Download the APK at *link* and save it as package/<name>.apk.

        :param name: app display name, used as the file name.
        :param link: direct APK download URL.
        """
        os.makedirs('package', exist_ok=True)  # avoid FileNotFoundError on open()
        PackageName = 'package/' + name + '.apk'
        App = requests.get(link)
        AppInfo = App.content  # raw binary payload of the APK
        with open(PackageName, 'wb') as f:
            f.write(AppInfo)
        print('%s爬取完成' % name)
if __name__ == '__main__':
    # Script entry point: search every app in AppList and download each APK.
    Spider().searchApp()