爬取校花网 - 爱码网

1.拿到索引页的链接
import requests  #pip3 install requests  请求库  requests相较于urlibra 的封装程度更高。
import  re
'''
http://www.xiaohuar.com/list-3-0.html  第1页
http://www.xiaohuar.com/list-3-1.html  第2页
http://www.xiaohuar.com/list-3-2.html  第3页
http://www.xiaohuar.com/list-3-3.html  第4页
http://www.xiaohuar.com/list-3-4.html  第5页
'''



#1.发起请求，获取索引
def get_index_page(url, timeout=10):
    """Request an index page and return its HTML source.

    Per the original author's note, this site answers without a custom
    User-Agent header; other sites may require one to be sent.

    :param url: address of the index page to request.
    :param timeout: seconds to wait for the server, forwarded to
        ``requests.get`` so a stalled connection cannot hang the crawl.
    :return: the response body as text when the status code is 200,
        otherwise ``None``.
    :raises requests.exceptions.RequestException: propagated from
        ``requests.get`` on connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        # Hand the HTML back to the caller instead of only printing it,
        # so it can be passed on to the parsing step.
        return response.text
    return None

#2.解析索引页
def parse_index(index_page, pattern=""):
    """Return every match of ``pattern`` found in the index page HTML.

    :param index_page: HTML text of an index page.
    :param pattern: regular expression to search for. The default is the
        empty placeholder pattern from the original code, which matches the
        empty string at every position; pass a real rule (for example one
        capturing ``href`` values) to extract links.
    :return: the list produced by ``re.findall`` (strings, or tuples when
        the pattern has several groups).
    """
    # TODO: replace the placeholder default with the site-specific rule that
    # captures the detail-page URLs.
    # re.S lets "." match newlines, so a rule can span several lines of HTML.
    return re.findall(pattern, index_page, re.S)



#3.爬取详情页的功能
def parse_detail_page():
    """Placeholder with no implementation yet; always returns ``None``.

    NOTE(review): presumably meant to fetch a detail page -- confirm intent.
    """

def parse_detail():
    """Placeholder with no implementation yet; always returns ``None``.

    NOTE(review): presumably meant to parse a detail page -- confirm intent.
    """


def get_movie():
    """Placeholder with no implementation yet; always returns ``None``.

    NOTE(review): presumably meant to download a video -- confirm intent.
    """

def main():
    """Request index pages 0 through 4 and print what each request returns."""
    # URL template: only the trailing page index changes between pages.
    url_template = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    page_urls = (url_template.format(page_num=index) for index in range(5))
    for page_url in page_urls:
        index_html = get_index_page(page_url)
        print(index_html)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
View Code