你可以使用html.parser.unescape():
# Unescape HTML entities in the response body, parse it with BeautifulSoup,
# then search for the text node containing the target sentence.
import html.parser as hp
import re
# presumably `response` is the HTTP response fetched earlier — confirm against the caller
data = response.text
# NOTE(review): `unescape` is reached through html.parser here; the documented
# standard-library entry point is html.unescape — confirm this alias is still exposed.
soup = BeautifulSoup(hp.unescape(data), "lxml")
# The pattern below separates the words with literal single spaces only.
p = soup.find(text=re.compile("Pursuant to the requirements of Section 13"))
print(p)
你的文本匹配失败的原因是其中还有一个换行符,以下正则表达式可以匹配:
# Raw string keeps \s as a regex escape instead of an (invalid) string escape;
# \s+ matches any run of whitespace between the words, line breaks included.
p = soup.find(text=re.compile(r"Pursuant\s+to\s+the\s+requirements\s+of\s+Section\s+13"))
运行它:
In [15]: url = "https://www.sec.gov/Archives/edgar/data/1800/000110465907013496/a07-1583_110k.htm"
In [16]: response = requests.get(url, headers={
....: "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
....: })
In [17]: data = response.text
In [18]: soup = BeautifulSoup(hp.unescape(data), "lxml")
In [19]: p = soup.find(text=re.compile("Pursuant\s+to\s+the\s+requirements\s+of\s+Section\s+13"))
In [20]: print(p)
Pursuant to the
requirements of Section 13 or 15(d) of the Securities Exchange Act of
1934, Abbott Laboratories has duly caused this report to be signed on its
behalf by the undersigned, thereunto duly authorized.
可以看到 Pursuant to the 之后有一个换行符,因此您原来的文本无法匹配。使用 \s+ 可以匹配任意空白字符(包括换行符)。实际上也不需要调用 unescape 进行反转义,不做这一步文本匹配同样可以正常工作。
要获取该段文字之后的第二张表:
# Step past the first <table> that follows the matched text node,
# then keep the next <table> after that one.
first_table = p.find_next("table")
table = first_table.find_next("table")
得到的结果如下:
<table border="0" cellpadding="0" cellspacing="0" style="border-collapse:collapse;">
<tr style="page-break-inside:avoid;">
<td style="padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="font-size:10.0pt;margin:0pt 0pt .0001pt;"><!-- SET mrlNoTableShading -->/s/ MILES D.
WHITE</p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="top" width="288">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">/s/ ROXANNE S.
AUSTIN</font></p>
</td>
</tr>
<tr style="page-break-inside:avoid;">
<td style="border:none;border-top:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="font-size:10.0pt;margin:0pt 0pt .0001pt;"><!-- SET mrlHTMLTableFull --><!-- SET mrlNoTableShading -->Miles D. White<br/>
Chairman of the Board, Chief Executive<br/>
Officer and Director of Abbott Laboratories<br/>
(principal executive officer)</p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="border:none;border-top:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="top" width="288">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">Roxanne S. Austin<br/>
Director of Abbott Laboratories</font></p>
</td>
</tr>
<tr style="page-break-inside:avoid;">
<td style="border:none;border-bottom:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="margin:24.0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">/s/ RICHARD A. GONZALEZ</font></p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:24.0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="border:none;border-bottom:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="top" width="288">
<p style="margin:24.0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">/s/ WILLIAM M. DALEY</font></p>
</td>
</tr>
<tr style="page-break-inside:avoid;">
<td style="border:none;padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">Richard A. Gonzalez<br/>
President and Chief Operating Officer<br/>
and Director of Abbott Laboratories</font></p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="border:none;padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="top" width="288">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">William M. Daley<br/>
Director of Abbott Laboratories</font></p>
</td>
</tr>
<tr style="page-break-inside:avoid;">
<td style="border:none;border-bottom:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="margin:24.0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">/s/ THOMAS C. FREYMAN</font></p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="border:none;border-bottom:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="bottom" width="288">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">/s/ W.
JAMES FARRELL</font></p>
</td>
</tr>
<tr style="page-break-inside:avoid;">
<td style="border:none;padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">Thomas C. Freyman<br/>
Executive Vice President, Finance<br/>
and Chief Financial Officer<br/>
(principal financial officer)</font></p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="border:none;padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="top" width="288">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">W. James Farrell<br/>
Director of Abbott Laboratories</font></p>
</td>
</tr>
<tr style="page-break-inside:avoid;">
<td style="border:none;border-bottom:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="margin:24.0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">/s/ GREG W. LINDER</font></p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="border:none;border-bottom:solid windowtext 1.0pt;padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="bottom" width="288">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">/s/ H.
LAURANCE FULLER</font></p>
</td>
</tr>
<tr style="page-break-inside:avoid;">
<td style="border:none;padding:0pt .7pt 0pt 0pt;width:210.0pt;" valign="top" width="280">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">Greg W. Linder<br/>
Vice President and Controller<br/>
(principal accounting officer)</font></p>
</td>
<td style="padding:0pt .7pt 0pt 0pt;width:30.0pt;" valign="top" width="40">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:1.0pt;"> </font></p>
</td>
<td style="border:none;padding:0pt .7pt 0pt 0pt;width:215.7pt;" valign="top" width="288">
<p style="margin:0pt 0pt .0001pt;"><font face="Times New Roman" size="2" style="font-size:10.0pt;">H. Laurance Fuller<br/>
Director of Abbott Laboratories</font></p>
</td>
</tr>
</table>