Crawler code: some notes for the record

A fairly complete and reliable crawler: it scrapes 58.com rental listings, loads them into an Oracle database, removes duplicate rows, writes the results out as a spreadsheet, and produces a JSON string of the data. The listing below targets Python 2 (note the print statements) and depends on requests, BeautifulSoup (bs4), retrying, xlwt, and cx_Oracle.

# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import socket
import random
from retrying import retry
import xlwt
import cx_Oracle
import json

try:
    conn = cx_Oracle.connect('xxx/xxx')
    cursor = conn.cursor()
    # Create the target table; this raises if the table already exists.
    cursor.execute('create table tb_user(url varchar2(250), name varchar2(250), '
                   'introduce varchar2(250), address varchar2(250))')
except cx_Oracle.DatabaseError:
    print "The table already exists, but please continue"

# Pool of HTTP proxies; one is picked at random for every request.
ippool = ['118.180.49.24:8080',
          '27.184.130.29:8888',
          '113.140.43.136:80',
          '60.169.19.66:9000',
          '60.21.206.165:9999']

# A bare @retry retries forever on any exception; cap the attempts instead.
@retry(stop_max_attempt_number=3)
def crawl(url):
    rip = random.choice(ippool)
    print rip
    s = requests.session()
    proxies = {
        'http': 'http://' + rip,
        'https': 'http://' + rip,
    }
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'}
    # Set the encoding on the response (not the session) before reading .text.
    resp = s.get(url, proxies=proxies, timeout=15, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    my_title = soup.select('.des h2 a')  # links to the individual listings

    f = open('F:\\yjh2\\xx.txt', 'a')  # plain-text log of every record

    for phone in my_title:
        url2 = phone['href']
        rip2 = random.choice(ippool)
        proxies = {
            'http': 'http://' + rip2,
            'https': 'http://' + rip2,
        }
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
        # Fetch the detail page through a fresh random proxy.
        resp2 = s.get(url2, proxies=proxies, headers=headers, timeout=15)
        resp2.encoding = 'utf-8'
        soup2 = BeautifulSoup(resp2.text, 'html.parser')
        my_dh = soup2.select('.phone-num')  # phone number, if published
        if len(my_dh) > 0:
            my_dh1 = my_dh[0].text
        else:
            my_dh1 = 'null'

        try:
            my_man_list = soup2.select('.c_000')
            if len(my_man_list) > 0:
                my_man = my_man_list[0].text                  # contact name
                my_bt = soup2.select('.c_333.f20')[0].text    # listing title
                my_money = soup2.select('.c_ff552e')[0].text  # price
                my_dq = soup2.select('.f14 span a')[1].text   # district

                message = url2 + ';' + my_man + ':' + my_dh1 + ';' + my_bt + my_money + ';' + my_dq
                param = {'id': url2, 'n': my_man + ':' + my_dh1, 'p': my_bt + my_money, 'm': my_dq}
                print message
                cursor.execute('insert into tb_user values(:id, :n, :p, :m)', param)
                conn.commit()
                # Remove duplicate urls, keeping only the row with the smallest rowid.
                cursor.execute('delete from tb_user '
                               'where url in (select url from tb_user group by url having count(url) > 1) '
                               'and rowid not in (select min(rowid) from tb_user group by url having count(url) > 1)')
                conn.commit()
                # Re-export the whole table to Excel and to a JSON string.
                jsonData = []
                cursor.execute('select * from tb_user')
                i = 0
                wbk = xlwt.Workbook()
                sheet = wbk.add_sheet('foobar', cell_overwrite_ok=True)
                for row in cursor:
                    result = {}
                    result['url'] = row[0]
                    result['name'] = row[1]
                    result['jieshao'] = row[2]
                    result['diqu'] = row[3]
                    jsonData.append(result)
                    sheet.write(i, 0, row[0])
                    sheet.write(i, 1, row[1].decode('utf-8'))
                    sheet.write(i, 2, row[2].decode('utf-8'))
                    sheet.write(i, 3, row[3].decode('utf-8'))
                    i = i + 1

                wbk.save('58.xls')
                jsondatar = json.dumps(jsonData, ensure_ascii=False, indent=4)
                # jsonData (a list) supports list operations; jsondatar (a string) does not.

                f.write(message.encode('utf-8') + '\n')
            else:
                print 'empty!'
                continue
        except (IndexError, socket.error):
            # A missing page element or a network error: skip this listing.
            print '!'

    f.close()



# Crawl listing pages 2 through 30.
for page in range(2, 31):
    url = 'http://cc.58.com/chuzu/pn{}'.format(page)
    crawl(url)

print "Download complete"

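On the comment near the end of the listing: jsonData is a Python list of dicts, so it supports list operations such as append and indexing; jsondatar, the result of json.dumps, is just a string. A tiny illustration (the sample dicts are made up for the example):

import json

jsonData = [{'url': 'http://example.com/1', 'name': 'a'}]
jsonData.append({'url': 'http://example.com/2', 'name': 'b'})  # fine: it is a list
jsondatar = json.dumps(jsonData, ensure_ascii=False, indent=4)  # now a plain string
print type(jsonData)   # <type 'list'>: supports append, indexing, iteration
print type(jsondatar)  # a string: no list operations any more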