BeautifulSoup学习笔记

from BeautifulSoup import BeautifulSoup
import re

# Sample markup from the Beautiful Soup 3 quick start. The two <p> tags carry
# the id/align attributes that the attribute-based lookups further down rely
# on (align="center" -> id "firstpara", align matching ^b -> id "secondpara").
doc = ['<html><head><title>Page title</title></head>',
       '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
       '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
       '</html>']
# The fragments are concatenated into one string before parsing; note that the
# <p> and <body> tags are never explicitly closed in the input.
soup = BeautifulSoup(''.join(doc))
# Single-argument print(...) behaves the same as the Python 2 print statement.
print(soup.prettify())

运行结果为：（原文此处为运行结果截图，图片已丢失）

# The first top-level node of the parsed document is the <html> tag; keep a
# reference so it is not looked up again on every line.
html_tag = soup.contents[0]
print(html_tag.name)
# html
# Its first child is the <head> tag.
print(html_tag.contents[0].name)
# head

# Walk the direct children of <html> and print each one's tag name.
# Iterating .contents directly replaces the index-based range(len(...)) loop.
for child in html_tag.contents:
    print(child.name)

BeautifulSoup学习笔记

# Attribute-style navigation: descend html -> head -> title by tag name.
titleTag = soup.html.head.title
titleTag
# <title>Page title</title>

# .string is the text held by the tag (shown here as a unicode string).
titleTag.string
# u'Page title'

# Calling the soup with a tag name returns the matching tags; there are two <p>.
len(soup('p'))
# 2

# Keyword arguments filter on attribute values.
# NOTE: the outputs shown for the align/id lookups assume the first <p> is
# <p id="firstpara" align="center"> and the second has id="secondpara" with an
# align value starting with "b".
soup.findAll('p', align="center")
# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]

# find() gives a single tag rather than a list.
soup.find('p', align="center")
# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>

# Subscripting a tag with an attribute name reads that attribute.
soup('p', align="center")[0]['id']
# u'firstpara'

# An attribute filter may also be a compiled regular expression.
soup.find('p', align=re.compile('^b.*'))['id']
# u'secondpara'

# First <p>, then its <b> child, then that tag's text.
soup.find('p').b.string
# u'one'

# Second <p> (index 1) via the call shortcut.
soup('p')[1].b.string
# u'two'