爬取完成后会自动生成 CSV 电子表格文件,其中包含房价、押付方式、房源链接等信息。
运行环境:Python 2.7
依赖安装:
pip install lxml
pip install cssselect
#coding:utf-8
"""Scrape rental listings for one city from 58.com into a CSV file.

The script prompts for a city abbreviation (e.g. ``bj``), downloads that
city's rental index page, then fetches every listing linked from it in its
own thread and appends the extracted rows to ``<city>_houses.csv``.

Written for Python 2.7 (``urllib2``, ``raw_input``, ``reload``); requires
the third-party ``lxml`` and ``cssselect`` packages.
"""
import csv
import sys
import threading
import time
import urllib2

import lxml.html
from lxml.cssselect import CSSSelector

# Python 2 only: make implicit unicode -> str conversions use UTF-8 so the
# csv writer can emit the non-ASCII text scraped from the pages.
reload(sys)
sys.setdefaultencoding('utf8')

print("请输入要爬取得城市简称例如bj(北京):")
# Strip stray whitespace: the value is interpolated into a host name and a
# file name below.
CITY = str(raw_input(">>>")).strip()

# All worker threads append to the same CSV file; this lock keeps the rows
# of one listing from interleaving with the rows of another.
_CSV_LOCK = threading.Lock()

# Selector template for one cell of the listing's description table:
# (1-based <li> index, 1-based <span> index).
_DETAIL_CSS = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)'


def download(url, user_agent='Google', num_retries=2):
    """Fetch *url* and return the response body, or None on failure.

    url         -- address to request.
    user_agent  -- value sent in the ``User-agent`` header.
    num_retries -- how many more times to retry after an HTTP 5xx error.

    Only ``urllib2.URLError`` (which includes ``HTTPError``) is handled;
    errors without a 5xx ``code`` are not retried.
    """
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            response.close()
    except urllib2.URLError as e:
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # Pass user_agent through explicitly so that the decremented
            # counter really lands in num_retries and the retries terminate.
            return download(url, user_agent, num_retries - 1)
        return None


def _iter_house_details(tree):
    """Yield the text of each description cell, stopping at the first gap.

    Cells are visited row by row (7 rows x 2 spans); spaces are removed from
    each cell's text. Iteration ends as soon as a cell is missing.
    """
    for row in range(1, 8):
        for col in range(1, 3):
            matches = CSSSelector(_DETAIL_CSS % (row, col))(tree)
            if not matches:
                return
            yield matches[0].text_content().replace(' ', '')


def get_data(url):
    """Download one listing page and append its data to the city's CSV file.

    Prints the listing title and the "price|payment" summary, then writes,
    for every description cell found, a group of four rows (title + url,
    price, payment terms, detail). Returns None; a page that cannot be
    downloaded or lacks the expected elements is skipped.
    """
    html_text_detail = download(url)
    if not html_text_detail:
        # Download failed (or returned nothing): there is nothing to parse.
        return

    tree = lxml.html.fromstring(html_text_detail)

    titles = CSSSelector('div.main-wrap > div.house-title > h1')(tree)
    if not titles:
        return
    title = titles[0].text_content()
    print(title)

    prices = CSSSelector('div.house-pay-way > span:nth-child(1)')(tree)
    pay_ways = CSSSelector('div.house-pay-way > span:nth-child(2)')(tree)
    if not prices or not pay_ways:
        return
    price = prices[0].text_content()
    pay_way = pay_ways[0].text_content()
    print('%s|%s' % (price, pay_way))

    rows = []
    for detail in _iter_house_details(tree):
        rows.extend([
            ('标题 : ', title, '#', url),
            ('价格: ', price, '#'),
            ('压付: ', pay_way, '#'),
            ('详情: ', detail, '#'),
        ])
    if not rows:
        return

    # Write all rows of this listing in one go while holding the lock, so
    # concurrent workers cannot interleave their output.
    with _CSV_LOCK:
        with open('%s_houses.csv' % CITY, 'ab+') as csvfile:
            writer = csv.writer(csvfile, lineterminator='\n')
            writer.writerows(rows)


def get_url(html):
    """Return the unique listing links found in an index page, in page order.

    html -- markup of the index page; None or empty yields an empty list.
    """
    if not html:
        return []
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('div.mainbox > div.main > div.content > '
                      'div.listBox > ul.listUl > li > div.des > h2 > a')
    url_list = []
    seen = set()
    for anchor in sel(tree):
        href = anchor.get('href')
        # Skip anchors without a target and links that were already collected.
        if href and href not in seen:
            seen.add(href)
            url_list.append(href)
    return url_list


if __name__ == '__main__':
    url_index = 'http://%s.58.com/chuzu/' % CITY
    html_text_list = download(url_index)
    if html_text_list is None:
        sys.stderr.write('failed to download %s\n' % url_index)
    url_list = get_url(html_text_list)

    workers = []
    for url_detail in url_list:
        thr = threading.Thread(target=get_data, args=(url_detail,))
        thr.start()
        workers.append(thr)

        time.sleep(0.001)

    # Wait for every listing to be processed before the script exits.
    for thr in workers:
        thr.join()