homeways

Python BeautifulSoup：获取页面多个子节点中各个节点的内容

页面的 HTML 格式如下：

<tr bgcolor="#7bb5de">
<td style="border-bottom: 1px solid #C9D8AD" width="118" align="center" bgcolor="#D9E6FF">
<p align="center">
lyl5577d92</p></td>
<td style="border-bottom: 1px solid #C9D8AD" width="96" align="center" bgcolor="#D9E6FF">
<p align="center">李永利</p></td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="105" bgcolor="#D9E6FF">
<div align="center"><font color="#FF0000">lyl5577d</font></div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="153" bgcolor="#D9E6FF">
<div align="center">469680008</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="218" bgcolor="#D9E6FF">
<div align="center">2016-05-21 15:24:27.0</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="171" bgcolor="#D9E6FF">
<div align="center">0</div>
</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="119" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="111" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="87" bgcolor="#D9E6FF">0</td>
<td align="center" style="border-bottom: 1px solid #C9D8AD" width="128" bgcolor="#D9E6FF">0</td>
</tr>

 

 1 import httplib
 2 from BeautifulSoup import BeautifulSoup
 3 
 4 
 5 def main():
 6     f = open(\'result\',\'a\')
 7 
 8     headers = {\'Content-Type\':\'application/x-www-form-urlencoded\',
 9         \'Accept\': \'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\',
10         \'Accept-Language\': \'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3\',
11         \'Accept-Encoding\': \'gzip, deflate\',
12         \'Referer\': \'http://xxx.xxx.com/admin/userlist\',
13         \'Cookie\': \'JSESSIONID=9F6F2D03D2C11400B3D6731E90D73117\',
14         \'User-Agent\': \'User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:46.0) Gecko/20100101 Firefox/46.0\',
15     }
16           
17     conn = httplib.HTTPConnection(\'*.*.*.*\', timeout=50)
18 
19     for p in range(1,1287):
20         print p
21         conn.request(method=\'GET\',
22                         url="/admin/userlist?toPage=%s&sessionID=" % str(p),
23                         headers=headers)
24         resp = conn.getresponse()
25         html_doc = resp.read()
26         mainSoup = BeautifulSoup(html_doc)
27         for s in mainSoup.findAll(\'tr\', attrs={\'bgcolor\':\'#7bb5de\'}):
28             if \'style\' not in str(s):
29                 continue
30             for d in s.findAll(\'td\'):
31                 print d.getText(),
32                 f.write("%s " % d.getText().encode(\'utf-8\'))   #f.write("%s " % d.getText())==> UnicodeEncodeError: \'ascii\' codec can\'t encode characters in position 0-2: ordinal not in range(128)
33             f.write("%s\n" % d.getText().encode(\'utf-8\'))
34             print
35     f.close()
36     conn.close()
37 
38 
39 if __name__ == \'__main__\':
40     main()
41     

分类:

技术点:

相关文章: