Python之爬取网页的一个例子
import time,random
import urllib2,urllib,socket,re
from bs4 import BeautifulSoup
import cx_Oracle
conn = cx_Oracle.connect(\'xxx/xxx\')
try:
cursor = conn.cursor()
cursor.execute(\'create table tb_user(id varchar2(50), name varchar2(50),password varchar(50))\')
except:
print "wwwwwwww"
x = 0
my_dh = 0
def crawl(url):
headers = { \'User-Agent\': \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36\', }
req = urllib2.Request(url, headers=headers)
page = urllib2.urlopen(req, timeout=60)
contents = page.read()
soup = BeautifulSoup(contents, \'html.parser\')
my_title = soup.select(".des h2 a")
file = open(\'E:\Python\\text.txt\', \'a\')
# for i,z in zip(my_title,my_title2):
# b = i.text.strip()
# d = z.text.strip()
# # w = c.text.strip()
# n = b+\'\'+d
# print n
for phone in my_title:
time.sleep(random.random()*5)
url2 = phone[\'href\']
html = urllib2.urlopen(url2).read()
soup2 = BeautifulSoup(html, \'html.parser\')
my_dh = soup2.select(\'.phone-num\')
if len(my_dh)>0:
my_dh1=my_dh[0].text
else:
my_dh1= \'null\'
#continue
my_man = soup2.select(\'.c_000\')
if len(my_man)>0:
my_man1 = soup2.select(\'.c_000\')[0].text
my_bt = soup2.select(\'.c_333.f20\')[0].text
my_money = soup2.select(\'.c_ff552e\')[0].text
massage = url2 +\' \'+ my_man1+\' \'+my_dh1+\' \'+my_bt + my_money
print massage
param = {\'id\': url2, \'n\': my_man1, \'p\': my_dh1}
cursor.execute(\'insert into tb_user values(:id,:n,:p)\', param)
conn.commit()
print param
file.write(massage.encode(\'utf-8\') + \'\n\')
else:
continue
# Crawl listing pages pn2 .. pn100.  The original wrote
# "for page in range(1, 100): page += 1", which mutates the loop variable
# and therefore never fetched pn1; the equivalent range is written out
# directly so the actual page span is obvious.
for page in range(2, 101):
    url = 'http://cc.58.com/chuzu/pn{}'.format(page)
    crawl(url)
脚本能够将部分网页数据提取出来写入 TXT 文档:写入 Oracle 数据库时是每提取一条记录便立即插入并提交;而写入文本文档时,由于文件写缓冲,通常积累一定数量的数据后才会真正落盘。
代码中还有一些小问题需要改进:例如裸 except 吞掉所有异常、文件句柄始终未关闭,以及使用了 Python 2 专有的 print 语句和 urllib2 模块。