RankFan

爬虫案例:中国大学排名(软科)

该网站的排名数据是动态加载的,直接抓取页面 HTML 只能得到前 30 条记录;如果想要获得全部数据,必须找到页面背后对应的 JSON 接口(见下方链接)。

# ref: https://blog.csdn.net/qq_42103091/article/details/118002291
# https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021
import requests
from bs4 import BeautifulSoup
import re
import bs4

# ref: https://blog.csdn.net/weixin_44578172/article/details/109340255

# Request headers sent with every fetch. The User-Agent is assembled with
# implicit string concatenation: a backslash continuation inside the literal
# would splice the next line's indentation into the header value.
para = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.159 Safari/537.36'
    ),
}

def getUrlText(url):
    """Fetch ``url`` and return the response body as text.

    The module-level ``para`` dict is sent as HTTP request headers, so the
    browser User-Agent actually reaches the server (passing it via ``params``
    would append it to the query string instead).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (connection error, timeout, or non-2xx status). A failure message
        is printed in that case.
    """
    try:
        response = requests.get(url, headers=para, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "fetch failed";
        # programming errors and KeyboardInterrupt still propagate.
        print('爬取失败')
        return None
    # Decode with the encoding detected from the body rather than the one
    # declared in the response headers.
    response.encoding = response.apparent_encoding
    return response.text

def fillUnivList(html, ulist):
    """Parse the ranking table in ``html`` and append one entry per row.

    Every tag that is a direct child of the first ``<tbody>`` is treated as
    a table row. For each row a list of five stripped strings is appended:
    the text of cell 0, the text of the row's first ``<a>``, and the text of
    cells 2, 3 and 4 (printed elsewhere in this file under the headers
    rank / school / province / type / score).

    Args:
        html: HTML source of the ranking page; ``None`` or an empty string
            is accepted and yields no rows.
        ulist: List that receives the parsed rows; it is mutated in place.

    Returns:
        The same ``ulist`` object, for convenience.
    """
    if not html:
        # Nothing was downloaded; leave the accumulator untouched.
        return ulist

    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # Page layout changed or the table was not part of the markup.
        return ulist

    for tr in tbody.children:
        # .children also yields whitespace text nodes between the rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        anchors = tr('a')
        if len(tds) < 5 or not anchors:
            # Skip malformed rows instead of raising IndexError.
            continue
        # get_text() is used throughout: .string is None for a tag with
        # nested markup, which would make .strip() fail.
        ulist.append([
            tds[0].get_text().strip(),
            anchors[0].get_text().strip(),
            tds[2].get_text().strip(),
            tds[3].get_text().strip(),
            tds[4].get_text().strip(),
        ])
    return ulist

def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; each row provides at least five printable
            fields (indices 0-4).
        num: Maximum number of rows to print. When it exceeds
            ``len(ulist)`` only the available rows are printed; values
            below zero print no rows.
    """
    string_format = "{0:^10}\t{1:{5}^10}\t{2:{5}^10}\t{3:^10}\t{4:^10}\t{5:^10}"
    # chr(12288) is the full-width (CJK) space. It is used as the fill
    # character for columns 1 and 2 so Chinese text stays aligned; the format
    # string also emits it once more as a trailing sixth column.
    fill = chr(12288)
    print(string_format.format("排名", "学校", "省市", "类型", "综合得分", fill))
    # Slicing clamps to the rows actually available, so a short list no
    # longer raises IndexError.
    for uni in ulist[:max(num, 0)]:
        print(string_format.format(uni[0], uni[1], uni[2], uni[3], uni[4], fill))

def main():
    """Download the 2021 ranking page and print up to the first 20 rows."""
    url = 'https://www.shanghairanking.cn/rankings/bcur/2021'
    html = getUrlText(url)
    if html is None:
        # The fetch failed and the failure was already reported; there is
        # nothing to parse.
        return
    uinfo = fillUnivList(html, [])
    # Never request more rows than were actually parsed.
    printUnivList(uinfo, num=min(20, len(uinfo)))

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

分类:

技术点:

相关文章:

  • 2022-12-23
  • 2021-05-18
  • 2021-12-11
  • 2021-12-10
  • 2022-12-23
  • 2021-08-20
  • 2021-12-23
猜你喜欢
  • 2021-12-30
  • 2021-12-02
  • 2021-09-16
  • 2022-01-12
  • 2021-06-30
相关资源
相似解决方案