huanghuangwei

csv存储使用html_save(s)函数

图片存储使用pic_save(url,name)函数

爬取时,先从首页抓取所有英文名详情页的链接并存入列表,然后依次爬取列表中的每个链接,并调用存储函数保存其中有价值的数据。

import io
import os
import re
import sys
from urllib import request

import requests
from bs4 import BeautifulSoup

# Re-wrap stdout so that everything print() emits is encoded as gb18030.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

 9 def html_save(s):
10     with open(\'Name.csv\',\'a\',encoding=\'gb18030\')as f:
11         f.write(s+\'\n\')
def pic_save(url, name, root="C://Users//L//Desktop//ba//"):
    """Download the image at ``url`` and store it as ``<root>/<name>.jpg``.

    Args:
        url: Address of the image to download.
        name: Base file name without extension. Surrounding whitespace is
            stripped and characters that Windows does not allow in file
            names (plus tabs and line breaks) are replaced by ``_``.
        root: Directory the picture is written to; it is created when it
            does not exist yet. Defaults to the directory this script has
            always used.

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an HTTP error status.
        OSError: If the directory or file cannot be written.
    """
    # Scraped headings can carry line breaks or path separators that would
    # make open() fail or write outside ``root``.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', name.strip())
    path = os.path.join(root, safe_name + '.jpg')

    r = requests.get(url, timeout=30)
    # Without this check an HTML error page would be saved as a ".jpg".
    r.raise_for_status()

    os.makedirs(root, exist_ok=True)
    # The with-block closes the file; no explicit close() is needed.
    with open(path, 'wb') as f:
        f.write(r.content)

def getName_link():
    """Collect the detail-page links listed on the boy-names index page.

    The index page is fetched and, inside every listing container ``div``,
    the ``href`` of the first ``<a>`` of each ``<strong>`` element is
    collected (with newlines removed).

    Returns:
        list: The collected href strings in page order. ``<strong>``
        elements without a link, or links without an ``href``, are skipped.

    Raises:
        requests.RequestException: If the index page cannot be fetched,
            the request times out, or the server returns an error status.
    """
    lst = []
    url = 'http://www.babynology.com/baby-boy-names.html'
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    container_attrs = {'class': 'babynology_textevidence babynology_bg_grey babynology_shadow babynology_radius left overflow_scroll'}
    for div in soup.find_all('div', container_attrs):
        for strong in div.find_all('strong'):
            anchor = strong.find('a')
            if anchor is None:
                # Bold text that is not a link: nothing to crawl.
                continue
            href = anchor.get('href')
            if not href:
                continue
            lst.append(href.replace('\n', ''))
    return lst

def _squash(text):
    """Drop newlines and the multi-space runs left over from HTML indentation."""
    return text.replace('   ', '').replace('\n', '').replace('    ', '')


def _child_text(tag, child_name):
    """Return the squashed text of the first ``child_name`` inside ``tag``, or ''."""
    child = tag.find(child_name)
    return _squash(child.text) if child is not None else ''


def hh(lst):
    """Crawl every name page in ``lst`` and store its data.

    For each URL the page is fetched and parsed; the name, the gender and
    the numerology text are printed and appended to the CSV file through
    ``html_save``, and the page's picture is downloaded through
    ``pic_save`` using the name as file name.

    Args:
        lst: Iterable of page URLs (as returned by ``getName_link``).

    Pages that lack the expected name, gender or numerology element are
    reported on stdout and skipped instead of aborting the whole crawl.

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    for url in lst:
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser')

        name_tag = soup.find('h2', {'class': 'txtclrm name-head2'})
        gender_tag = soup.find('h5', {'style': 'color:#000;'})
        detail_tag = soup.find('h5', {'style': 'color:#000; text-align:justify;'})
        if name_tag is None or gender_tag is None or detail_tag is None:
            print('Skipped, expected elements not found:', url)
            continue

        name = name_tag.text
        print("Name:", name)
        gender = gender_tag.text
        print("Gender:", gender)

        # The numerology text is the <h5> text minus the text of its
        # <script> and <span> children. str.strip(x) would treat x as a
        # *set of characters* and eat real text from both ends, so the
        # child texts are removed as substrings instead.
        numerology = _squash(detail_tag.text)
        for unwanted in (_child_text(detail_tag, 'script'),
                         _child_text(detail_tag, 'span')):
            if unwanted:
                numerology = numerology.replace(unwanted, '', 1)
        numerology = numerology.strip()
        print(name, 'Numerology:', numerology)

        n = name + ' Numerology:' + numerology
        # NOTE(review): the listing showed replace('', ' '), which inserts a
        # space between every character. Assumed the lost literal was a
        # non-breaking space -- confirm against the original script.
        # Commas are blanked so the line stays a single CSV field.
        n = n.replace('\xa0', ' ').replace(',', ' ')
        print(n)

        html_save('Name:' + name)
        html_save('Gender:' + gender)
        html_save(n)

        # The picture is taken from the already parsed page; no second
        # request for the same URL is needed.
        pic_tag = soup.find('img', {'style': 'margin-left:-10px; margin-top:-5px;'})
        pic = pic_tag.get('src') if pic_tag is not None else None
        if pic:
            pic_save(pic, name)
        print('---------------------------------------------------------------------------')
# Script entry point: collect the name links, then crawl and store each page.
hh(getName_link())

分类:

技术点:

相关文章:

  • 2021-09-12
  • 2021-11-16
  • 2022-12-23
  • 2022-12-23
  • 2021-10-20
  • 2021-11-09
  • 2022-12-23
  • 2022-12-23
猜你喜欢
  • 2021-11-09
  • 2021-11-21
  • 2022-12-23
  • 2021-11-01
  • 2022-12-23
  • 2022-01-14
相关资源
相似解决方案