# 啦啦,网站屏蔽太严重,很难完成。
# (Note: the site blocks scrapers very aggressively, so this crawl is hard to complete.)
# Crawl listing links from the Xiaozhu short-term rental site (Nanjing area).
# The original note mentions 13 pages; this first version only fetches
# search-result page 2 and prints the links it finds.
import time

import requests
from bs4 import BeautifulSoup

# Request header: the User-Agent makes the request look like a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


def judgment_sex(class_name):
    """Map the CSS class list of the host's gender icon to a gender label.

    Args:
        class_name: Value of the element's ``class`` attribute as returned by
            BeautifulSoup (a list of class names).

    Returns:
        '女' (female) when the class list is exactly ``['member_icol']``,
        otherwise '男' (male).
    """
    if class_name == ['member_icol']:
        return '女'
    return '男'


def get_links(url):
    """Fetch one search-result page and print the href of every listing link.

    Args:
        url: URL of a search-result page.
    """
    # A timeout keeps the script from hanging forever on an unresponsive server.
    web_data = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(web_data.text, 'lxml')
    for link in soup.select('#page_list > ul > li > a'):
        print(link.get('href'))


if __name__ == '__main__':
    urls = 'http://nj.xiaozhu.com/search-duanzufang-p2-0/'
    get_links(urls)
# Crawl listing details from the Xiaozhu short-term rental site (Nanjing area).
# NOTE(review): the original note says 13 pages, but the entry point below
# iterates search-result pages 1 through 14 -- confirm which is intended.
import time

import requests
from bs4 import BeautifulSoup

# Request header: this version sends a mobile (Nokia/Symbian) User-Agent
# instead of the desktop Chrome string.
headers = {
    'User-Agent': 'Nokia6600/1.0 (3.42.1) SymbianOS/7.0s Series60/2.0 Profile/MIDP-2.0 Configuration/CLDC-1.0'
}


def judgment_sex(class_name):
    """Map the CSS class list of the host's gender icon to a gender label.

    Args:
        class_name: Value of the element's ``class`` attribute as returned by
            BeautifulSoup (a list of class names).

    Returns:
        '女' (female) when the class list is exactly ``['member_icol']``,
        otherwise '男' (male).
    """
    if class_name == ['member_icol']:
        return '女'
    return '男'


def get_links(url):
    """Fetch one search-result page and scrape every listing it links to.

    Args:
        url: URL of a search-result page.
    """
    # A timeout keeps the script from hanging forever on an unresponsive server.
    web_data = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(web_data.text, 'lxml')
    for link in soup.select('#page_list > ul > li > a'):
        href = link.get('href')
        if not href:
            # An anchor without an href cannot be fetched; skip it rather than
            # passing None to requests.
            continue
        # Pause between detail-page requests (presumably to reduce the chance
        # of being blocked by the site).
        time.sleep(10)
        get_info(href)


def get_info(url):
    """Fetch one listing page and print its extracted fields.

    Prints the response object, a debug separator line, and then one dict per
    matched group of elements with the keys 'tittle', 'addresse', 'price',
    'img', 'name' and 'sex'.

    Args:
        url: URL of a listing detail page.
    """
    wb_data = requests.get(url, headers=headers, timeout=30)
    print(wb_data)
    print('666666666666666666666666666666')
    soup = BeautifulSoup(wb_data.text, 'lxml')

    tittles = soup.select(' div.pho_info > h4')
    addresses = soup.select('span.pr5')
    prices = soup.select('#pricePart > div.day_l > span')
    imgs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')

    # zip stops at the shortest list, so nothing is printed when any selector
    # matches no element on the page.
    for tittle, addresse, price, img, name, sex in zip(
            tittles, addresses, prices, imgs, names, sexs):
        data = {
            'tittle': tittle.get_text().strip(),
            'addresse': addresse.get_text().strip(),
            'price': price.get_text(),
            'img': img.get("src"),
            'name': name.get_text(),
            'sex': judgment_sex(sex.get("class")),
        }
        print(data)


if __name__ == '__main__':
    urls = ['http://nj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number)
            for number in range(1, 15)]
    for single_url in urls:
        get_links(single_url)