Python爬虫：爬虫demo

# -*- coding: utf-8 -*-

import urllib
import urllib2
import re

def getDetailUrl(name):
    """Extract the link target and link text from an HTML anchor.

    Args:
        name: HTML fragment expected to contain an anchor of the form
            ``<a href="..." target="_blank">...</a>`` (matched
            case-insensitively).

    Returns:
        A 2-tuple ``(href, text)`` taken from the first matching anchor.

    Raises:
        ValueError: if ``name`` contains no such anchor.
    """
    pattern = re.compile(r'<a href="(.*?)" target="_blank">(.*?)</a>', re.I)
    match = pattern.search(name)
    if match is None:
        # Fail with a message that shows the offending fragment instead of
        # the opaque "'NoneType' object has no attribute 'groups'".
        raise ValueError("no detail link found in: %r" % (name,))
    return match.groups()
    
# Parse one page of listing data.
def getlist(page):
    """Parse one listing page into a list of row dictionaries.

    The function locates the ``project_table`` table in ``page``, removes
    HTML comments from it, reads the column names from the ``<th>`` cells
    of the header row and then groups the ``<td>`` cells into rows of that
    width.

    Args:
        page: HTML source of one listing page.

    Returns:
        A list with one dict per complete table row, mapping each header
        text to the raw inner HTML of the corresponding cell.  For the
        '企业名称' column the cell is passed to ``getDetailUrl``; the link
        text is stored under '企业名称' and the link target, prefixed with
        the site root, under 'url'.  An empty list is returned when the
        table, its header row or its header cells cannot be found.

    Raises:
        Propagates whatever ``getDetailUrl`` raises when a '企业名称' cell
        does not contain the expected anchor.
    """
    flags = re.I | re.S

    # Locate the content table.
    table_match = re.search(
        r'<table class="project_table" cellpadding="0" cellspacing="0">'
        r'.*?</table>',
        page, flags)
    if table_match is None:
        return []

    # Strip HTML comments.  re.S lets '.' cross newlines so that comments
    # spanning several lines (e.g. commented-out rows) are removed as well.
    table = re.compile(r'<!--.*?-->', flags).sub("", table_match.group())

    # Header row: everything from the first <tr up to the </tr> that follows
    # the last </th>.
    header_match = re.search(r'<tr.*</th>.*?</tr>', table, flags)
    if header_match is None:
        return []

    # \b keeps "<thead>" from being mistaken for a "<th ...>" cell.
    head = re.findall(r'<th\b[^>]*>(.*?)</th>', header_match.group(), flags)
    if not head:
        # Without column names no row can be built (and a zero-width row
        # would never advance through the cells).
        return []

    # Table body: every <td> cell in document order.
    cells = re.findall(r'<td.*?>(.*?)</td>', table, flags)

    res = []
    width = len(head)
    # Consume the cells one full row at a time; a trailing partial row is
    # ignored.
    for start in range(0, len(cells) - width + 1, width):
        td = {}
        for key, cell in zip(head, cells[start:start + width]):
            if key == '企业名称':
                url_name = getDetailUrl(cell)
                td['企业名称'] = url_name[1]
                td['url'] = 'http://www.jnfdc.gov.cn/kfqy/' + url_name[0]
            else:
                # Only non-link columns keep the raw cell; storing it for
                # the name column too would clobber the extracted name.
                td[key] = cell
        res.append(td)

    return res

# Fetch the list of developer companies, one listing page at a time.
rooturl = "http://www.jnfdc.gov.cn/kfqy/"

values = {"entname":"","levelno":"-1"}  # levelno=-1: no restriction on qualification level
data = urllib.urlencode(values)

# NOTE(review): starting at 21 skips the earlier pages and makes the
# pageNum == 0 branch below unreachable -- presumably a debugging leftover;
# confirm the intended start page.
pageNum = 21

entlist = []
while True:
    # Page 0 is "index.shtml"; every other page is "index_<n>.shtml".
    if pageNum == 0:
        param1 = ""
    else:
        param1 = "_" + str(pageNum)
    url = rooturl + "index" + param1 + ".shtml"
    pageNum += 1
    geturl = url + "?" + data
    request = urllib2.Request(geturl)
    response = urllib2.urlopen(request)
    try:
        page = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    res = getlist(page)
    # A page without rows marks the end of the listing.
    if not res:
        break
    # Reuse the rows parsed above instead of parsing the page a second time.
    entlist += res

# Write the company records to the database.
import MySQLdb
ip = 'localhost'
username = 'root'
password = '***'
dbname = 'test'

conn = MySQLdb.connect(ip, username, password, dbname, charset='utf8')
cursor = conn.cursor()
try:
    # Debug output: show the second scraped record when there is one.
    if len(entlist) > 1:
        print(entlist[1])
    # The bulk insert is currently disabled.  When re-enabling it, wrap it
    # so that a failure prints the traceback and calls conn.rollback():
    #     sql = "insert into fdc_ent_info value (%(序号)s, %(企业名称)s, %(法人代表)s, %(资质编号)s, %(资质等级)s, %(url)s)"
    #     cursor.executemany(sql, entlist)
    #     conn.commit()
finally:
    # Always release the cursor and the connection; previously the only
    # cleanup lived in the commented-out code, so both were leaked.
    cursor.close()
    conn.close()

# Dump every record to a text file, one "key:value, key:value" line each.
print('file...')
# The with-block guarantees the file is flushed and closed; the original
# "f.close" lacked the call parentheses and therefore never closed it.
with open("d:\\entinfo.txt", 'w') as f:
    for e in entlist:
        le = ", ".join(key + ":" + value for key, value in e.items())
        f.write(le + '\n')
print('done')