爬取目标:
https://sj.qq.com/myapp/category.htm?orgame=1&categoryId=122
网页结构分析:
由于这个网站的页面是懒加载的,我们需要拖动滚动条,才能让页面的内容加载出来。
拖动滚动条的代码如下:
# Locate the "load more" button and scroll it into view so the page loads
# its next batch of content.  The generic find_element(by, value) call is
# used because the find_element_by_* helpers were removed in Selenium 4.3;
# 'class name' is the value of By.CLASS_NAME, so no extra import is needed.
target = self.browser.find_element('class name', 'load-more-btn')
self.browser.execute_script("arguments[0].scrollIntoView();", target)
此方式是拖动到指定元素。
爬取流程:
模拟浏览器发送请求,获取页面源码:
def get_response(self, num, poll_interval=0.5, max_wait=None):
    """Open a category page and scroll until every entry has been loaded.

    Args:
        num: Category id appended to the ``categoryId`` query parameter.
        poll_interval: Seconds to pause between scroll attempts, so the page
            can load the next batch instead of being polled in a busy loop.
        max_wait: Optional upper bound, in seconds, on the total time spent
            scrolling.  ``None`` (the default) keeps waiting indefinitely.

    Returns:
        The page source, captured once the "load more" button reads
        '没有更多了'.

    Raises:
        TimeoutError: If ``max_wait`` is set and elapses before the button
            reports that nothing is left to load.
    """
    url = 'https://sj.qq.com/myapp/category.htm?orgame=1&categoryId=' + str(num)
    self.browser.get(url)
    deadline = None if max_wait is None else time.monotonic() + max_wait
    while True:
        # Look the button up again on each pass and scroll to it; its text
        # changes once there is nothing left to load.  find_element(by, value)
        # replaces find_element_by_class_name, which Selenium 4.3 removed.
        target = self.browser.find_element('class name', 'load-more-btn')
        self.browser.execute_script("arguments[0].scrollIntoView();", target)
        if target.text == '没有更多了':
            return self.browser.page_source
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                'category %s did not finish loading within %s seconds'
                % (num, max_wait)
            )
        time.sleep(poll_interval)
获取信息:
def get_info(self, html):
    """Extract name, size and download count for every app in the page.

    Args:
        html: Page source to parse.

    Returns:
        A list of JSON strings, one per ``.app-info-desc`` element, each
        holding the keys ``name``, ``size`` and ``download``.
    """
    document = pq(html)
    return [
        # Each record is serialised right away; ensure_ascii=False keeps
        # non-ASCII text readable instead of escaping it.
        json.dumps(
            {
                'name': node.children('.com-install-btn ').attr('appname'),
                'size': node.children('.size').text(),
                'download': node.children('.download').text(),
            },
            ensure_ascii=False,
        )
        for node in document('.app-info-desc').items()
    ]
保存内容:
def save_book(self, list_info):
    """Print the scraped records and write them to ``pythontest1.txt``.

    The file is overwritten on every call and receives one record per line.

    Args:
        list_info: Iterable of strings, one serialised record per item.
    """
    print(list_info)
    # The encoding is pinned to UTF-8 so that non-ASCII records are written
    # the same way on every platform instead of depending on the locale.
    with open("pythontest1.txt", "w", encoding="utf-8") as f:
        for info in list_info:
            f.write(info + '\n')
def save_yingyong(self, list_info):
    """Print the scraped records and write them to ``pythontest2.txt``.

    The file is overwritten on every call and receives one record per line.

    Args:
        list_info: Iterable of strings, one serialised record per item.
    """
    print(list_info)
    # The encoding is pinned to UTF-8 so that non-ASCII records are written
    # the same way on every platform instead of depending on the locale.
    with open("pythontest2.txt", "w", encoding="utf-8") as f:
        for info in list_info:
            f.write(info + '\n')
代码运行入口:
def run(self, num):
    """Scrape one category end to end and then close the browser.

    Args:
        num: Category id.  102 is saved through ``save_book`` and 122
            through ``save_yingyong``; any other id is scraped but not saved.
    """
    try:
        html = self.get_response(num)
        info = self.get_info(html)
        if num == 102:
            self.save_book(info)
        elif num == 122:
            self.save_yingyong(info)
    finally:
        # Close the browser even when scraping or saving fails, so a failed
        # run does not leave a browser window behind.
        self.close()
全部代码如下:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
import time
import json
from concurrent.futures import ThreadPoolExecutor
import threading
class shouji():
    """Selenium-based scraper for the app category pages of sj.qq.com.

    One instance owns one Chrome browser session.  ``run`` performs a complete
    scrape of a single category and always shuts the browser down afterwards.
    """

    def __init__(self):
        """Start a Chrome session and set up the helper attributes."""
        self.browser = webdriver.Chrome()
        # ``wait`` and ``headers`` are not read by any method of this class;
        # they are kept so existing callers can still access them.
        self.wait = WebDriverWait(self.browser, 10)
        self.headers = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

    def get_response(self, num, poll_interval=0.5, max_wait=None):
        """Open a category page and scroll until every entry has been loaded.

        Args:
            num: Category id appended to the ``categoryId`` query parameter.
            poll_interval: Seconds to pause between scroll attempts, so the
                page can load the next batch instead of being polled in a
                busy loop.
            max_wait: Optional upper bound, in seconds, on the total time
                spent scrolling.  ``None`` (the default) waits indefinitely.

        Returns:
            The page source, captured once the "load more" button reads
            '没有更多了'.

        Raises:
            TimeoutError: If ``max_wait`` is set and elapses before the
                button reports that nothing is left to load.
        """
        url = 'https://sj.qq.com/myapp/category.htm?orgame=1&categoryId=' + str(num)
        self.browser.get(url)
        deadline = None if max_wait is None else time.monotonic() + max_wait
        while True:
            # Look the button up again on each pass and scroll to it; its
            # text changes once there is nothing left to load.
            # find_element(by, value) replaces find_element_by_class_name,
            # which Selenium 4.3 removed.
            target = self.browser.find_element('class name', 'load-more-btn')
            self.browser.execute_script("arguments[0].scrollIntoView();", target)
            if target.text == '没有更多了':
                return self.browser.page_source
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    'category %s did not finish loading within %s seconds'
                    % (num, max_wait)
                )
            time.sleep(poll_interval)

    def get_info(self, html):
        """Extract name, size and download count for every app in the page.

        Args:
            html: Page source to parse.

        Returns:
            A list of JSON strings, one per ``.app-info-desc`` element, each
            holding the keys ``name``, ``size`` and ``download``.
        """
        doc = pq(html)
        list_info = []
        for item in doc('.app-info-desc').items():
            info = {
                'name': item.children('.com-install-btn ').attr('appname'),
                'size': item.children('.size').text(),
                'download': item.children('.download').text(),
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            list_info.append(json.dumps(info, ensure_ascii=False))
        return list_info

    def _save(self, filename, list_info):
        """Print the records and write them to *filename*, one per line."""
        print(list_info)
        # The encoding is pinned to UTF-8 because the records contain
        # unescaped non-ASCII text; the platform default may not handle it.
        with open(filename, "w", encoding="utf-8") as f:
            for info in list_info:
                f.write(info + '\n')

    def save_book(self, list_info):
        """Save the records of the reading category to ``pythontest1.txt``."""
        self._save("pythontest1.txt", list_info)

    def save_yingyong(self, list_info):
        """Save the records of the application category to ``pythontest2.txt``."""
        self._save("pythontest2.txt", list_info)

    def close(self):
        """Shut down the browser session.

        ``quit()`` is used rather than ``close()`` so that the whole session
        ends, not only the current window.
        """
        self.browser.quit()

    def run(self, num):
        """Scrape one category end to end and then close the browser.

        Args:
            num: Category id.  102 is saved through ``save_book`` and 122
                through ``save_yingyong``; any other id is scraped but not
                saved.
        """
        try:
            html = self.get_response(num)
            info = self.get_info(html)
            if num == 102:
                self.save_book(info)
            elif num == 122:
                self.save_yingyong(info)
        finally:
            # Close the browser even when scraping or saving fails.
            self.close()
if __name__ == '__main__':
    # Earlier multi-threading attempt, kept for reference (not executed):
    # sj = shouji()
    # num1 = 102
    # num2 = 122
    # threads = []
    # t1 = threading.Thread(target=sj.aaa(num1))
    # threads.append(t1)
    # t2 = threading.Thread(target=sj.aaa(num2))
    # threads.append(t2)
    # for t in threads:  # walk the list of threads
    #     t.setDaemon(True)
    #     t.start()
    # NOTE(review): ``target=sj.aaa(num1)`` calls the method immediately and
    # hands its result to Thread instead of passing a callable, ``aaa`` is not
    # a method of the class shown in this file, and both threads would share
    # one browser instance.

    # Scrape the application category (122).
    shouji().run(122)
    # Scrape the reading category instead:
    # shouji().run(102)
难点总结:
因为这个网站是懒加载的页面,所以既可以通过获取ajax请求的接口来爬取信息,也可以使用selenium模拟浏览器操作来获取。
我采用的是selenium模拟操作,需要解决的难点就是滚动条的下拉:只有把滚动条拉动到指定元素的位置,页面才会继续加载内容。
这个页面和别的页面稍有不同,我选中的元素出现了四次,所以设置了while循环,不断拉动到指定元素的位置;
当该元素的文字变为“没有更多了”时,说明内容已经全部加载,此时返回html。通过这种方式获取完整的html。
多线程的处理:因为使用的是selenium模拟浏览器操作,我刚开始设置的多线程要么始终只执行一个线程,要么
同一个线程执行两次。多次调试之后发现,原因是第二个线程开始执行的时候,浏览器还没来得及关闭,
所以会一直重复执行。