tszr

吴裕雄--天生自然PYTHON爬虫:使用BeautifulSoup解析中国旅游网页数据

 

 

import requests
from bs4 import BeautifulSoup

# Fetch the China tourism site homepage and print the FIRST news headline anchor.
url = "http://www.cntour.cn/"
# timeout prevents the script from hanging forever on a dead connection
strhtml = requests.get(url, timeout=10)
strhtml.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(strhtml.text, "lxml")
# Selector copied from the browser dev tools (Copy -> Copy selector);
# li:nth-child(1) restricts the match to the first item in the news list.
data = soup.select("#main > div > div.mtop.firstMod.clearfix > div.centerBox > ul.newsList > li:nth-child(1) > a")
print(data)

 

 

import requests
from bs4 import BeautifulSoup

# Fetch the China tourism site homepage and print ALL news headline anchors.
url = "http://www.cntour.cn/"
# timeout prevents the script from hanging forever on a dead connection
strhtml = requests.get(url, timeout=10)
strhtml.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(strhtml.text, "lxml")
# Selector copied from the browser dev tools; the ":nth-child(1)" suffix was
# removed so the selector matches every <li> in the news list, not just the first.
data = soup.select("#main > div > div.mtop.firstMod.clearfix > div.centerBox > ul.newsList > li > a")
print(data)

 

 

# Clean and organize the scraped data: print one {title, link} dict per headline.
import requests
from bs4 import BeautifulSoup

url = "http://www.cntour.cn/"
# timeout prevents the script from hanging forever on a dead connection
strhtml = requests.get(url, timeout=10)
strhtml.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(strhtml.text, "lxml")
# Selector copied from the browser dev tools; ":nth-child(1)" removed so it
# matches every news-list item, not just the first.
data = soup.select("#main > div > div.mtop.firstMod.clearfix > div.centerBox > ul.newsList > li > a")
for item in data:
    result = {
        "title": item.get_text(),
        "link": item.get("href"),  # may be None if the anchor has no href attribute
    }
    print(result)

 

 

# Clean and organize the scraped data, additionally extracting the numeric ID
# embedded in each article's URL.
import re
import requests
from bs4 import BeautifulSoup

url = "http://www.cntour.cn/"
# timeout prevents the script from hanging forever on a dead connection
strhtml = requests.get(url, timeout=10)
strhtml.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(strhtml.text, "lxml")
# Selector copied from the browser dev tools; ":nth-child(1)" removed so it
# matches every news-list item, not just the first.
data = soup.select("#main > div > div.mtop.firstMod.clearfix > div.centerBox > ul.newsList > li > a")
for item in data:
    # item.get("href") returns None when the attribute is missing; default to ""
    # so re.findall does not raise TypeError on a non-string argument.
    link = item.get("href") or ""
    result = {
        "title": item.get_text(),
        "link": link,
        # raw string for the regex: "\d" in a plain string is an invalid escape
        # (SyntaxWarning on modern Python); r"\d+" grabs every digit run in the URL.
        "ID": re.findall(r"\d+", link),
    }
    print(result)

发表于 2020-01-12 22:53  吴裕雄  阅读(327)  评论(0)  编辑  收藏  举报
 

分类:

技术点:

相关文章:

  • 2021-12-15
  • 2021-10-19
  • 2022-01-15
  • 2021-11-27
  • 2021-08-30
  • 2021-12-15
  • 2021-12-01
  • 2021-12-02
猜你喜欢
  • 2021-10-18
  • 2021-11-21
  • 2021-12-13
  • 2022-01-24
  • 2022-12-23
  • 2021-11-12
  • 2021-10-09
相关资源
相似解决方案