ArvinShaffer

用 Python 实现爬取极客学院(jikexueyuan.com)课程数据。

#-*-coding:utf8-*-
import requests
import re
import sys

# Python 2 only: force the implicit str<->unicode codec to UTF-8 so the
# scraped text can be concatenated and written without explicit encoding.
# ``sys.setdefaultencoding`` is removed from ``sys`` at interpreter start-up,
# which is why the module has to be reloaded first.  On Python 3 neither
# ``reload`` (as a builtin) nor ``setdefaultencoding`` exists and the default
# is already UTF-8, so the block is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

class spider(object):
    """Regex-based scraper for listing pages addressed by ``pageNum=<n>``.

    The methods are independent steps of one pipeline: build the page URLs
    (``changepage``), download a page (``getsource``), cut it into per-item
    HTML fragments (``geteveryclass``), pull the fields out of each fragment
    (``getinfo``) and append the collected records to ``info.txt``
    (``saveinfo``).  No instance state is kept.
    """

    def __init__(self):
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print(u'开始爬取内容。。。。')

    def getsource(self, url):
        """Download ``url`` with ``requests.get`` and return the body as text."""
        html = requests.get(url)
        return html.text

    def changepage(self, url, total_page):
        """Return the page URLs from the page named in ``url`` up to ``total_page``.

        Args:
            url: A URL containing ``pageNum=<digits>``; that number is the
                first page of the returned range.
            total_page: Last page number, inclusive.

        Returns:
            A list of URLs, one per page, each being ``url`` with its
            ``pageNum`` value replaced.  Empty when the starting page is
            greater than ``total_page``.

        Raises:
            AttributeError: If ``url`` contains no ``pageNum=<digits>``.
        """
        now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
        # The fourth positional argument of re.sub is ``count``, not
        # ``flags``; passing re.S there capped the substitution at 16
        # replacements.  No flag is needed because the pattern has no ``.``.
        return [
            re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            for i in range(now_page, total_page + 1)
        ]

    def geteveryclass(self, source):
        """Split page HTML into one fragment per ``<li id="...">...</li>`` item.

        Each returned string starts right after ``<li id="`` and ends right
        before the first following ``</li>``; fragments may span lines.
        """
        everyclass = re.findall('<li id="(.*?)</li>', source, re.S)
        return everyclass

    def getinfo(self, eachclass):
        """Extract the fields of one item fragment into a dict.

        Args:
            eachclass: One fragment as produced by ``geteveryclass``.

        Returns:
            A dict with the keys ``title``, ``content``, ``classtime``,
            ``classlevel`` and ``learnnum``; all values are strings.

        Raises:
            AttributeError: If the title, content or learn-number pattern
                does not match.
            IndexError: If fewer than two bare ``<em>...</em>`` elements
                are present.
        """
        info = {}
        # NOTE(review): the capture runs up to ``alt="``, so anything between
        # the title value and the alt attribute (closing quote, whitespace)
        # is part of the stored title - confirm this is intended.
        info['title'] = re.search('title="(.*?)alt="', eachclass, re.S).group(1)
        info['content'] = re.search('display: none;">(.*?)</p>', eachclass, re.S).group(1)
        # Only attribute-less <em> tags match here: first is taken as the
        # class time, second as the class level.
        timeandlevel = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def saveinfo(self, classinfo):
        """Append every record in ``classinfo`` to ``info.txt``.

        Args:
            classinfo: Iterable of dicts as returned by ``getinfo``.

        The file is opened in append mode in the current working directory.
        Each record is written as five ``key:value`` lines followed by a
        blank line.
        """
        # ``with`` guarantees the handle is closed even if a record is
        # missing a key and the loop raises part-way through.
        with open('info.txt', 'a') as f:
            for each in classinfo:
                f.write('title:' + each['title'] + '\n')
                f.write('content:' + each['content'] + '\n')
                f.write('classtime:' + each['classtime'] + '\n')
                f.write('classlevel:' + each['classlevel'] + '\n')
                f.write('learnnum:' + each['learnnum'] + '\n\n')

if __name__ == '__main__':
    # Crawl listing pages 1 through 20, collecting one info dict per item.
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    all_links = jikespider.changepage(url, 20)

    for link in all_links:
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print(u'正在处理页面:' + link)
        html = jikespider.getsource(link)
        everyclass = jikespider.geteveryclass(html)
        for each in everyclass:
            info = jikespider.getinfo(each)
            classinfo.append(info)

    # Records are written in one pass only after every page was processed.
    jikespider.saveinfo(classinfo)

爬取结果如下:

 

分类:

技术点:

相关文章: