1. 爬校花网图片:
这是一个很基础的爬虫小例子,爬取校花网的图片。其中用了requests-html库:
先获取每页的url,再爬取每页中的图片进行解析处理,最后存入到文件中
from requests_html import HTMLSession
import os
class Spider():
    """Crawl the first gallery pages of xiaohuar.com and save every image.

    Workflow: build the listing-page URLs, extract (image URL, file name)
    pairs from each page, then download each image into ``pictures/``.
    """

    # Characters removed from an image title before it is used as a file name.
    _NAME_STRIP_TABLE = str.maketrans('', '', '【】|\\/')

    def __init__(self):
        # One shared session so connections are reused across requests.
        self.session = HTMLSession()

    @staticmethod
    def _clean_name(title):
        """Return *title* without 【 】 | \\ / characters, with '.jpg' appended."""
        return title.translate(Spider._NAME_STRIP_TABLE) + '.jpg'

    def get_index_page(self):
        """Yield the URLs of listing pages 1 to 3.

        The first page has no numeric suffix; later pages use ``index_<n>``.
        """
        for i in range(1, 4):
            if i == 1:
                page_url = "http://www.xiaohuar.com/meinv/index.html"
            else:
                page_url = "http://www.xiaohuar.com/meinv/index_%s.html" % i
            yield page_url

    def get_image_name(self, page_url):
        """Yield ``(image_url, image_name)`` for every item on *page_url*.

        Items that lack an ``<img>`` with a ``src`` or a title link are
        skipped instead of raising AttributeError.
        """
        r = self.session.get(url=page_url)
        for element in r.html.find('#images .items'):
            image = element.find('img', first=True)
            title = element.find('.p_title a', first=True)
            # find(..., first=True) returns None when nothing matches.
            if image is None or title is None:
                continue
            image_url = image.attrs.get('src')
            if not image_url:
                continue
            yield image_url, self._clean_name(title.text)

    def save(self, image_url, image_name):
        """Download *image_url* and write it to ``pictures/<image_name>``.

        The ``pictures`` directory is created on demand. A site-relative
        *image_url* is prefixed with the site root before the request.
        """
        # Without this the very first open() fails on a fresh checkout.
        os.makedirs('pictures', exist_ok=True)
        image_name = os.path.join('pictures', image_name)
        if not image_url.startswith('http'):
            image_url = 'http://www.xiaohuar.com' + image_url
        r = self.session.get(url=image_url)
        with open(image_name, 'wb') as f:
            f.write(r.content)
        print('%s下载完成' % image_name)

    def run(self):
        """Crawl every listing page and save every image found on it."""
        for page_url in self.get_index_page():
            for image_url, image_name in self.get_image_name(page_url):
                self.save(image_url, image_name)
# Script entry point: crawl the image listing pages when run directly.
if __name__ == '__main__':
    xiaohua = Spider()
    xiaohua.run()
2. 豆瓣电影排行信息
用爬虫获取豆瓣电影信息:这里是通过url携带的参数对电影进行筛选的,所以要先获取参数信息。
from requests_html import HTMLSession
class Spider:
    """Query Douban's movie search endpoint page by page and print the JSON."""

    def __init__(self):
        self.api = "https://movie.douban.com/j/new_search_subjects?"
        self.session = HTMLSession()
        # Created here so get_message() cannot hit a missing attribute when
        # it is called before get_params().
        self.params = {'start': 0}

    def get_params(self):
        """Prompt for the sort key and year range and store them as query params."""
        sort = input('请输入按什么排序(S评分)').strip()
        year_range = input('请输入年份:').strip()
        self.params = {
            'sort': sort,
            'year_range': year_range,
            'start': 0,
        }

    def get_message(self, pages=10):
        """Request *pages* consecutive result pages and print each JSON reply.

        ``start`` is advanced by 20 per page. ``self.params`` is updated in
        place, so it holds the last requested offset afterwards.
        """
        for i in range(pages):
            self.params['start'] = i * 20
            r = self.session.get(url=self.api, params=self.params)
            print(r.json())

    def run(self):
        """Collect the filter parameters interactively, then fetch the results."""
        self.get_params()
        self.get_message()
# Script entry point: prompt for filters and print the Douban results.
if __name__ == '__main__':
    douban = Spider()
    douban.run()
3. 爬取校花视频
校花视频是通过m3u8格式。有的视频会员有反爬机制,查看元素的播放链接是unknown,所以获取不到资源。
先获取到m3u8格式的播放列表链接,然后发送请求,获取到的文件内容是一行行.ts文件名(.ts也是一种文件格式)。对m3u8文件内容进行处理,发送请求下载.ts文件并保存。
from requests_html import HTMLSession
import os
class spider():
    """Download xiaohuar.com videos by fetching each HLS playlist and its segments.

    Workflow: listing pages -> detail pages -> the m3u8 URL embedded in the
    detail page -> every ``.ts`` segment named in the playlist.
    """

    def __init__(self):
        # One shared session so connections are reused across requests.
        self.session = HTMLSession()

    def get_index_page(self):
        """Yield the URLs of video listing pages 0 to 6."""
        for i in range(7):
            yield 'http://www.xiaohuar.com/list-3-%s.html' % i

    def parse_index_page(self, index_page):
        """Yield the detail-page link of every video item on *index_page*.

        Anchors without an ``href`` are skipped rather than yielded as None.
        """
        r = self.session.get(url=index_page)
        for element in r.html.find('#images .items a[class="imglink"]'):
            href = element.attrs.get('href')
            if href:
                yield href

    def parse_detail_page(self, detail_page):
        """Yield ``(m3u8_url, m3u8_name)`` when the page embeds a playlist URL.

        The name is the page title with backslashes removed. It is later used
        as the download directory. Nothing is yielded, and a notice is
        printed, when the page has no ``vHLSurl`` variable.
        """
        r = self.session.get(url=detail_page)
        r.html.encoding = 'GBK'
        result_obj = r.html.search('var vHLSurl = "{}";')
        if not result_obj:
            print("匹配失败,无资源")
            return
        m3u8_url = result_obj[0]
        m3u8_name = r.html.find('title', first=True).text.replace('\\', '')
        yield m3u8_url, m3u8_name

    @staticmethod
    def _segment_urls(m3u8_url, playlist_text):
        """Yield ``(segment_url, file_name)`` for each ``.ts`` entry in a playlist.

        Entries are resolved against *m3u8_url*, so relative, nested and
        absolute segment URIs all work. Blank lines and ``#`` tag lines are
        ignored. The file name is the last path component without any query
        string.
        """
        from urllib.parse import urljoin, urlsplit

        for line in playlist_text.splitlines():
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            path = urlsplit(line).path
            if not path.endswith('.ts'):
                continue
            yield urljoin(m3u8_url, line), path.rsplit('/', 1)[-1]

    def save_m3u8(self, m3u8_url, m3u8_name):
        """Save the playlist and all of its segments under ``<m3u8_name>/``.

        The playlist text is stored as ``playlist.m3u8``. Each segment is
        then downloaded into the same directory.
        """
        m3u8_dir = m3u8_name
        os.makedirs(m3u8_dir, exist_ok=True)
        print(m3u8_url)
        playlist_text = self.session.get(url=m3u8_url).text
        m3u8_path = os.path.join(m3u8_dir, 'playlist.m3u8')
        with open(m3u8_path, 'wt', encoding='utf-8') as f:
            f.write(playlist_text)
        # Parse from memory so the playlist file is closed before downloads start.
        for ts_url, ts_name in self._segment_urls(m3u8_url, playlist_text):
            r = self.session.get(url=ts_url)
            ts_path = os.path.join(m3u8_dir, ts_name)
            with open(ts_path, 'wb') as f1:
                f1.write(r.content)
            print('%s下载完毕' % ts_name)

    def run(self):
        """Walk listing pages -> detail pages -> playlists and download them all."""
        for url in self.get_index_page():
            for detail_page in self.parse_index_page(url):
                for m3u8_url, m3u8_name in self.parse_detail_page(detail_page):
                    self.save_m3u8(m3u8_url, m3u8_name)
# Script entry point: download every video playlist when run directly.
if __name__ == '__main__':
    xiaohua = spider()
    xiaohua.run()