cwmizlp

Python之爬取网页的一个例子

import time,random
import urllib2,urllib,socket,re
from bs4 import BeautifulSoup
import cx_Oracle



# Open the Oracle connection and the shared cursor used by crawl().
# The cursor is created outside the try block so that a connection-level
# failure surfaces immediately instead of leaving `cursor` undefined.
conn = cx_Oracle.connect('xxx/xxx')
cursor = conn.cursor()
try:
    # Best effort: on a re-run the table usually exists already, in which
    # case Oracle rejects the statement and we simply carry on.
    # NOTE(review): id is varchar2(50) but crawl() stores the detail-page URL
    # in it -- confirm the URLs fit, or widen the column.
    cursor.execute('create table tb_user(id varchar2(50), name varchar2(50),password varchar(50))')
except cx_Oracle.DatabaseError as exc:
    # Only database errors are tolerated here; anything else (including
    # KeyboardInterrupt) now propagates instead of being swallowed.
    print("create table tb_user skipped: %s" % exc)
x = 0
my_dh = 0

def _fetch_html(target):
    """Download *target* and return the raw response body."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    }
    request = urllib2.Request(target, headers=headers)
    # The timeout keeps a stalled server from hanging the whole crawl.
    response = urllib2.urlopen(request, timeout=60)
    try:
        return response.read()
    finally:
        response.close()


def _first_text(soup, selector, default='null'):
    """Return the text of the first element matching *selector*, else *default*."""
    matches = soup.select(selector)
    return matches[0].text if matches else default


def crawl(url):
    """Scrape one listing page and persist every listing that has a contact.

    Every link matched by ``.des h2 a`` on the page at *url* is followed.
    From each detail page the function reads ``.phone-num``, ``.c_000``,
    ``.c_333.f20`` and ``.c_ff552e`` (presumably phone number, contact name,
    title and price -- verify against the site markup). Listings without a
    ``.c_000`` element are skipped; any other missing field is recorded as
    the string ``'null'``.

    For each kept listing a row is inserted into ``tb_user`` through the
    module-level ``cursor`` and committed on ``conn``, and one UTF-8 line is
    appended to ``E:\\Python\\text.txt``.

    Args:
        url: Address of the listing page to scrape.

    Raises:
        urllib2.URLError: If the listing page or a detail page cannot be
            fetched (a timeout may also surface as ``socket.timeout``).
    """
    soup = BeautifulSoup(_fetch_html(url), 'html.parser')
    my_title = soup.select(".des h2 a")

    # Raw string: same path as before without relying on the invalid "\P"
    # escape sequence.
    out = open(r'E:\Python\text.txt', 'a')
    try:
        for phone in my_title:
            url2 = phone.get('href')
            if not url2:
                # An anchor without a target cannot be followed.
                continue

            # Random pause (0-5 s) between detail requests to stay polite.
            time.sleep(random.random() * 5)
            soup2 = BeautifulSoup(_fetch_html(url2), 'html.parser')

            my_man = soup2.select('.c_000')
            if not my_man:
                # No contact element on the page: skip this listing.
                continue

            my_man1 = my_man[0].text
            my_dh1 = _first_text(soup2, '.phone-num')
            my_bt = _first_text(soup2, '.c_333.f20')
            my_money = _first_text(soup2, '.c_ff552e')
            massage = url2 + ' ' + my_man1 + ' ' + my_dh1 + ' ' + my_bt + my_money

            print(massage)
            param = {'id': url2, 'n': my_man1, 'p': my_dh1}
            cursor.execute('insert into tb_user values(:id,:n,:p)', param)
            conn.commit()
            print(param)
            out.write(massage.encode('utf-8') + '\n')
    finally:
        # Closing flushes buffered lines and releases the handle, even when
        # a request or insert fails part-way through the page.
        out.close()


# Crawl listing pages 1 through 100. The previous loop incremented `page`
# inside a range(1, 100) loop, which shifted the crawl to pages 2..100 and
# never fetched page 1.
try:
    for page in range(1, 101):
        url = 'http://cc.58.com/chuzu/pn{}'.format(page)
        crawl(url)
finally:
    # Release the database resources once crawling ends or fails.
    cursor.close()
    conn.close()

 

这段代码(Python 2)能够将部分网页数据提取出来并保存为 TXT 文档。写入数据库时,每提取一条信息便立即提交(commit)到 Oracle 数据库;而写入文档时,由于文件写入带有缓冲,要累积一定数量的数据后才会真正写入磁盘。

其中还有一些小毛病需要改善。

分类:

技术点:

相关文章:

  • 2021-05-18
  • 2021-10-09
  • 2022-12-23
  • 2021-10-30
  • 2021-11-09
  • 2022-01-05
  • 2022-12-23
  • 2020-06-14
猜你喜欢
  • 2021-09-15
  • 2021-11-11
  • 2021-12-30
  • 2021-12-22
  • 2021-12-22
  • 2022-12-23
  • 2022-12-23
相关资源
相似解决方案