一.校花网图片
from requests_html import HTMLSession
import os


class spider():
    """Crawler that downloads gallery images from xiaohuar.com listing pages."""

    def __init__(self):
        self.session = HTMLSession()
        # Browser-like UA: the listing pages are fetched with it below.
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
        }

    def get_index_url(self):
        """Yield the URLs of listing pages 1-3 (page 1 has a different path)."""
        for i in range(1, 4):
            if i == 1:
                yield 'http://www.xiaohuar.com/meinv/index.html'
            else:
                yield 'http://www.xiaohuar.com/meinv/index_%s.html' % i

    def get_img_name(self, index_url):
        """Yield (img_url, img_name) for every item on one listing page."""
        r = self.session.get(url=index_url, headers=self.headers)
        elements_list = r.html.find('#images .items')
        for element in elements_list:
            img_url: str = element.find('img', first=True).attrs.get('src')
            if not img_url.startswith('http'):
                # Some src attributes are site-relative paths.
                img_url = 'http://www.xiaohuar.com' + img_url
            # Strip path separators so the title is a safe file name.
            img_name = element.find('.p_title>a', first=True).text.replace('\\', '').replace('/', '') + '.jpg'
            yield img_url, img_name

    def save_img(self, img_url, img_name):
        """Download one image into the output directory.

        BUG FIX: create the output directory first — the original crashed
        with FileNotFoundError when '校花图片' did not already exist.
        """
        r = self.session.get(url=img_url)
        os.makedirs('校花图片', exist_ok=True)
        img_path = os.path.join('校花图片', img_name)
        with open(img_path, 'wb') as f:
            f.write(r.content)
        print('%s下载完毕' % img_name)

    def run(self):
        """Drive the pipeline: listing pages -> (url, name) pairs -> files."""
        for index_url in self.get_index_url():
            for img_url, img_name in self.get_img_name(index_url):
                self.save_img(img_url, img_name)


if __name__ == '__main__':
    xiaohua = spider()
    xiaohua.run()
二.豆瓣
from requests_html import HTMLSession


class spider():
    """Interactive crawler for douban.com's movie-search JSON endpoint."""

    def __init__(self):
        # Query parameters are appended to this base URL by the session.
        self.api = 'https://movie.douban.com/j/new_search_subjects?'
        self.session = HTMLSession()

    def get_params(self):
        """Prompt the user for a year range and sort rule; seed query params."""
        year_range = input('输入年份')  # a year interval, comma separated
        sort = input('输入排序规则(S按评分)')  # e.g. 'S' sorts by rating
        self.params = {
            'year_range': year_range,
            'sort': sort,
            'start': 0,
        }

    def get_data(self):
        """Fetch and print the first 10 result pages (20 items per page)."""
        for page in range(10):
            self.params['start'] = page * 20
            response = self.session.get(url=self.api, params=self.params)
            print(response.json())

    def run(self):
        self.get_params()
        self.get_data()


if __name__ == '__main__':
    douban = spider()
    douban.run()
三.校花网视频
from requests_html import HTMLSession
import os


class spider():
    """Crawler for xiaohuar.com video pages: locates each video's HLS
    playlist and downloads the playlist plus every .ts segment it lists."""

    def __init__(self):
        self.session = HTMLSession()

    def get_index_page(self):
        """Yield the URLs of the 7 listing pages (list-3-0 .. list-3-6)."""
        for i in range(7):
            url = 'http://www.xiaohuar.com/list-3-%s.html' % i
            yield url

    def parse_index_page(self, index_page):
        """Yield the detail-page href of every video on one listing page."""
        r = self.session.get(url=index_page)
        elements_list = r.html.find('#images .items a[class="imglink"]')
        for element in elements_list:
            yield element.attrs.get('href')

    def parse_detail_page(self, detail_page):
        """Yield (m3u8_url, m3u8_name) scraped from a detail page, if any."""
        r = self.session.get(url=detail_page)
        r.html.encoding = 'GBK'  # detail pages are GBK-encoded
        # The playlist URL is embedded in inline JS: var vHLSurl = "...";
        result_obj = r.html.search('var vHLSurl = "{}";')
        if result_obj:
            m3u8_url = result_obj[0]
            # BUG FIX: also strip '/' from the title — a slash would make
            # os.mkdir() in save_m3u8() fail or create a nested path.
            # (Matches the sanitization used by the image crawler.)
            m3u8_name = r.html.find('title', first=True).text.replace('\\', '').replace('/', '')
            yield m3u8_url, m3u8_name
        else:
            print("匹配失败,无资源")

    def save_m3u8(self, m3u8_url, m3u8_name):
        """Download the playlist into a per-video directory, then fetch
        every .ts segment the playlist references."""
        m3u8_dir = m3u8_name
        if not os.path.exists(m3u8_dir):
            os.mkdir(m3u8_dir)
        print(m3u8_url)
        r = self.session.get(url=m3u8_url)
        m3u8_path = os.path.join(m3u8_dir, 'playlist.m3u8')
        # 'wt+' lets us write the playlist and immediately re-read it.
        with open(m3u8_path, 'wt+', encoding='utf-8') as f:
            f.write(r.text)
            f.seek(0, 0)  # rewind to parse what was just written
            for line in f:
                line = line.strip()
                if line.endswith('.ts'):
                    # Segment names are relative to the playlist's directory.
                    ts_url = os.path.dirname(m3u8_url) + '/%s' % line
                    r = self.session.get(url=ts_url)
                    ts_path = os.path.join(m3u8_dir, line)
                    with open(ts_path, 'wb') as f1:
                        f1.write(r.content)
                    print('%s下载完毕' % line)

    def run(self):
        """Drive the pipeline: listings -> detail pages -> playlist + segments."""
        for url in self.get_index_page():
            for detail_page in self.parse_index_page(url):
                for m3u8_url, m3u8_name in self.parse_detail_page(detail_page):
                    self.save_m3u8(m3u8_url, m3u8_name)


if __name__ == '__main__':
    xioahua = spider()
    xioahua.run()
四.tmall
from requests_html import HTMLSession


class spider():
    """Interactive crawler that prints product titles and prices from a
    Tmall keyword search."""

    def __init__(self):
        self.session = HTMLSession()
        self.api = 'http://list.tmall.com/search_product.htm?'

    def get_params(self):
        """Prompt the user for a search keyword and seed the query params."""
        pro = input("输入你要爬取的商品:")
        self.params = {
            'q': pro,          # search keyword
            'totalPage': 1,    # placeholder, fixed up by get_totalPage()
            'jumpto': 1,       # current page number
        }

    def get_totalPage(self):
        """Read the real page count from the hidden form field on page 1."""
        response = self.session.get(url=self.api, params=self.params)
        total = response.html.find('[name="totalPage"]', first=True).attrs.get('value')
        self.params['totalPage'] = int(total)

    def get_pro_info(self):
        """Walk every result page, printing each product's title and price."""
        for page in range(1, self.params['totalPage'] + 1):
            self.params['jumpto'] = page
            response = self.session.get(url=self.api, params=self.params)
            for product in response.html.find('.product'):
                title = product.find('.productTitle a', first=True).text
                price = product.find('.productPrice em', first=True).attrs.get('title')
                print(title)
                print(price)
                print('-' * 30)

    def run(self):
        self.get_params()
        self.get_totalPage()
        self.get_pro_info()


if __name__ == '__main__':
    tmall = spider()
    tmall.run()