cutesnow

爬取完会自动生成csv电子表格文件,含有房价、押付、链接等信息

环境
py2.7
pip install lxml
pip install cssselect
 
 1 #coding:utf-8
 2 import csv
 3 import urllib2
 4 import lxml.html
 5 import time
 6 import sys
 7 from lxml.cssselect import CSSSelector
 8 import threading
 9 reload(sys)
10 sys.setdefaultencoding(\'utf8\')
11 
12 print "请输入要爬取得城市简称例如bj(北京):"
13 CITY=str(raw_input(">>>"))
def download(url, user_agent='Google', num_retries=2):
    """Fetch ``url`` and return the response body, or None on failure.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: Remaining number of retries allowed when the server
            answers with a 5xx status code.

    Returns:
        The body returned by ``read()`` on the response, or None when the
        request raised ``urllib2.URLError`` and was not (successfully)
        retried.
    """
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        html = None
        # Only server-side errors (5xx) are retried; errors without an HTTP
        # status code, and 4xx responses, give up immediately.
        is_server_error = hasattr(e, 'code') and 500 <= e.code < 600
        if num_retries > 0 and is_server_error:
            # Forward user_agent explicitly so the decremented count lands in
            # num_retries rather than in the user_agent slot; otherwise the
            # retry budget would never shrink.
            return download(url, user_agent, num_retries - 1)
    return html
26 
27 
def get_data(url):
    """Scrape one listing detail page and append its fields to the city CSV.

    The page title and the two payment fields are printed, then for each of
    the 7 x 2 description cells a group of four rows (title + url, price,
    payment terms, detail cell) is appended to ``<CITY>_houses.csv``.

    Args:
        url: Address of the listing detail page.

    Returns:
        None. A page that cannot be downloaded, or that lacks one of the
        expected elements, is skipped silently (best effort); rows written
        before the missing element was hit are kept.
    """
    html_text_detail = download(url)
    if html_text_detail is None:
        # Download failed: there is nothing to parse.
        return

    try:
        tree = lxml.html.fromstring(html_text_detail)

        # These values do not depend on the loop below, so select them once.
        title = CSSSelector(
            'div.main-wrap > div.house-title > h1')(tree)[0].text_content()
        print(title)
        price = CSSSelector(
            'div.house-pay-way > span:nth-child(1)')(tree)[0].text_content()
        pay_terms = CSSSelector(
            'div.house-pay-way > span:nth-child(2)')(tree)[0].text_content()
        print('%s|%s' % (price, pay_terms))

        common_rows = [
            ('标题 : ', title, '#', url),
            ('价格: ', price, '#'),
            ('压付: ', pay_terms, '#'),
        ]

        # Open the CSV once per listing instead of once per detail cell.
        with open('%s_houses.csv' % CITY, 'ab+') as csvfile:
            writer = csv.writer(csvfile, lineterminator='\n')
            for i in range(7):
                for j in range(2):
                    css = ('div.house-desc-item > ul.f14 > li:nth-child(%s)'
                           ' > span:nth-child(%s)' % (i + 1, j + 1))
                    detail = CSSSelector(css)(tree)[0].text_content()
                    detail_row = ('详情: ', detail.replace(' ', ''), '#')
                    writer.writerows(common_rows + [detail_row])
    except (TypeError, IndexError):
        # IndexError: an expected element is missing from the page.
        # TypeError: the document could not be handled as text.
        # Either way the rest of this listing is skipped on purpose.
        return
56 
def get_url(html):
    """Return the unique listing-detail links found on an index page.

    Args:
        html: Markup of the listing index page. None or an empty string
            (e.g. after a failed download) yields an empty result.

    Returns:
        A list of ``href`` values in page order with duplicates removed.
        Anchors without an ``href`` attribute are skipped.
    """
    if not html:
        return []

    tree = lxml.html.fromstring(html)
    sel = CSSSelector('div.mainbox > div.main > div.content > div.listBox > ul.listUl > li > div.des > h2 > a')

    seen = set()  # O(1) duplicate check; url_list keeps the page order
    url_list = []
    for anchor in sel(tree):
        href = anchor.get('href')
        if href is None or href in seen:
            continue
        seen.add(href)
        url_list.append(href)
    return url_list
65 
66 
if __name__ == '__main__':
    # Crawl the rental index page of the chosen city, then scrape every
    # listing it links to, one thread per listing.
    url_index = 'http://%s.58.com/chuzu/' % CITY
    html_text_list = download(url_index)
    if html_text_list is None:
        # Without the index page there are no listing links to follow.
        sys.exit('failed to download %s' % url_index)
    url_list = get_url(html_text_list)

    workers = []
    for url_detail in url_list:
        thr = threading.Thread(target=get_data, args=(url_detail,))
        thr.start()
        workers.append(thr)

        # Tiny pause between thread launches.
        time.sleep(0.001)

    # Wait explicitly for every scraper thread before the script ends.
    for thr in workers:
        thr.join()
py58.py

 

分类:

技术点:

相关文章: