csv存储使用html_save(s)函数
图片存储使用pic_save(url,name)函数
爬取时首先爬取首页所有英文名详细内容的链接并存入列表,然后将列表中的链接依次爬取,并调用存储函数存储价值数据。
"""Scraper for boys' names on babynology.com.

Workflow (per the header notes): collect every name's detail-page link
from the index page (getName_link), then visit each link (hh), extract
the name / gender / numerology text, append it to Name.csv via
html_save(), and download the name's picture via pic_save().
"""
import sys
import io
import re                      # kept from original (currently unused)
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # Windows console: print Chinese/odd chars safely
import requests
from bs4 import BeautifulSoup
from urllib import request     # kept from original (currently unused)

# Default folder for downloaded pictures (original hard-coded value).
PIC_ROOT = "C://Users//L//Desktop//ba//"


def html_save(s):
    """Append one line of text *s* to Name.csv (gb18030-encoded)."""
    with open('Name.csv', 'a', encoding='gb18030') as f:
        f.write(s + '\n')


def pic_save(url, name, root=PIC_ROOT):
    """Download the image at *url* and save it as <root><name>.jpg.

    *root* defaults to the original hard-coded path, so existing calls
    behave identically; callers may now override the destination.
    """
    path = root + name + '.jpg'
    r = requests.get(url)
    r.raise_for_status()  # don't save an HTTP error page as a .jpg
    # 'with' closes the file; the original's explicit f.close() was redundant.
    with open(path, 'wb') as f:
        f.write(r.content)


def getName_link():
    """Return the list of detail-page URLs scraped from the boys' index page."""
    lst = []
    url = 'http://www.babynology.com/baby-boy-names.html'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for div in soup.find_all('div', {'class': 'babynology_textevidence babynology_bg_grey babynology_shadow babynology_radius left overflow_scroll'}):
        for strong in div.find_all('strong'):
            # Only the href is needed downstream; the anchor text was unused.
            j = strong.find_all('a')[0].get('href').replace('\n', '')
            lst.append(j)
    return lst


def _squash(text):
    """Remove spaces and newlines from *text*.

    NOTE(review): the original chained two space-replaces; the second was
    probably a non-breaking space lost in transcription — TODO confirm.
    """
    return text.replace(' ', '').replace('\n', '')


def hh(lst):
    """Visit each detail URL in *lst*; print and persist name/gender/numerology.

    Side effects: appends to Name.csv (html_save) and writes one .jpg per
    name (pic_save).
    """
    for url in lst:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        name = soup.find('h2', {'class': 'txtclrm name-head2'}).text
        print("Name:", name)

        gender = soup.find('h5', {'style': 'color:#000;'}).text
        print("Gender:", gender)

        # The numerology <h5> embeds a <span> and a <script>; their text
        # must be cut out of the combined .text to leave only the prose.
        h5 = soup.find('h5', {'style': 'color:#000; text-align:justify;'})
        numerology = _squash(h5.text)
        span_text = _squash(h5.find('span').text)
        script_text = _squash(h5.find('script').text)
        # BUG FIX: the original used .strip(script_text).strip(span_text),
        # but str.strip() removes a *character set*, not a substring —
        # it could eat arbitrary leading/trailing characters. Remove the
        # exact substrings instead (same order: script first, then span).
        n = numerology.replace(script_text, '').replace(span_text, '')
        print(name, 'Numerology:', n)

        n = name + ' Numerology:' + n
        # NOTE(review): the original chained two comma-replaces; the second
        # was likely a fullwidth comma lost in transcription — TODO confirm.
        n = n.replace(',', ' ')
        print(n)

        # (A duplicate, unused requests.get(url) was removed here.)
        pic = soup.find('img', {'style': 'margin-left:-10px; margin-top:-5px;'}).get('src')

        html_save('Name:' + name)
        html_save('Gender:' + gender)
        html_save(n)
        pic_save(pic, name)
        print('---------------------------------------------------------------------------')


if __name__ == '__main__':
    # Guard so importing this module no longer triggers the full crawl.
    hh(getName_link())