爬虫2_2019年549所中国大学排名

最新推荐文章于 2020-12-19 12:10:10 发布

年纪轻轻keep_coding

最新推荐文章于 2020-12-19 12:10:10 发布

阅读量290

点赞数

CC 4.0 BY-SA版权

分类专栏：爬虫

本文链接：https://blog.csdn.net/qq_35629706/article/details/104837167

爬虫专栏收录该内容

3 篇文章

订阅专栏

"""
# - 实战
# - 教程:https://www.bilibili.com/video/av9784617?p=32
"""
import requests
from bs4 import BeautifulSoup
import bs4


def request_url(url, *params):
    """Send a GET request to ``url`` and return the response on HTTP 200.

    The final request URL, the status code and the encoding reported by the
    server are printed, and the response encoding is then switched to the
    one detected from the body (``apparent_encoding``).

    :param url: address to request
    :param params: optional query-string data, either a single dict such as
        ``request_url(url, {'wd': 'keyword'})`` or any number of
        ``(key, value)`` pairs
    :return: the ``requests.Response`` when the status code is 200, ``None``
        for any other status code, or the string ``"返回异常"`` when the
        request raises ``requests.RequestException`` (kept for backward
        compatibility with existing callers)
    """
    # ``*params`` always arrives as a tuple. A lone dict has to be unwrapped,
    # otherwise requests iterates the tuple and tries to read the dict itself
    # as a (key, value) pair.
    if len(params) == 1 and isinstance(params[0], dict):
        query = params[0]
    else:
        query = params

    # Explicit User-Agent instead of the library default; the original note
    # says this is meant to avoid 403 responses.
    headers = {'user-agent': 'my-app/0.0.1'}
    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(url, headers=headers, params=query, timeout=10)
    except requests.RequestException:
        return "返回异常"

    print(response.request.url)
    print('访问状态:', response.status_code)
    print('编码方式:', response.encoding)
    # Prefer the encoding guessed from the content over the header value.
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        return response
    return None


def fillUnivList(ulist, html):
    """Extract table rows from ``html`` and append them to ``ulist``.

    The first ``<tbody>`` element of the document is located and every tag
    directly beneath it is treated as a row. For each row the ``.string`` of
    its first four ``<td>`` cells is appended to ``ulist`` as a four-item
    list. The original note labels the cells rank / school / score; the exact
    column meaning depends on the page layout. A cell's ``.string`` is
    ``None`` when the cell does not contain exactly one text node.

    Rows with fewer than four ``<td>`` cells are skipped, and a document
    without a ``<tbody>`` leaves ``ulist`` unchanged.

    :param ulist: list that receives the parsed rows; it is mutated in place
    :param html: HTML text of the ranking page, e.g. ``response.text``
    :return: the same ``ulist`` object that was passed in
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # No table body in the page: nothing to parse.
        return ulist

    for tr in tbody.children:
        # ``children`` also yields whitespace text nodes; keep only tags.
        if not isinstance(tr, bs4.element.Tag):
            continue
        td_list = tr.find_all('td')
        if len(td_list) < 4:
            # Not a full data row (for example a header or spacer row).
            continue
        ulist.append([td.string for td in td_list[:4]])

    return ulist


def PrintUnivList(list2):
    """Print the ranking rows as a tab-separated, centre-aligned table.

    A header line is printed first, followed by one line per row. Only the
    first four items of each row are used. A ``None`` item is printed as an
    empty cell instead of raising ``TypeError`` inside ``str.format``.

    :param list2: iterable of rows, each indexable at positions 0 to 3
    :return: ``None``; the table is written to standard output
    """
    # chr(12288) is the full-width (ideographic) space. It is used as the
    # fill character so padded CJK text lines up with CJK-width cells.
    pad = chr(12288)
    tplt = '{0:^5}\t{1:{4}^10}\t{2:{5}^6}\t{3:^6}'
    print(tplt.format('排名', '大学', '位置', '评分', pad, pad))
    for per_univ in list2:
        cells = ['' if per_univ[i] is None else per_univ[i] for i in range(4)]
        print(tplt.format(*cells, pad, pad))


if __name__ == "__main__":
    # Fetch the 2019 Chinese university ranking page, parse it and print it.
    list1 = []
    # Direct address of the ranking page, without the proxy-mirror prefix
    # that had been prepended to it.
    url1 = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
    response = request_url(url=url1)
    # request_url returns None or an error string when the fetch fails, so
    # stop here instead of failing on ``response.text`` below.
    if not isinstance(response, requests.Response):
        raise SystemExit('获取排名页面失败: {}'.format(url1))
    ulist = fillUnivList(ulist=list1, html=response.text)
    PrintUnivList(list2=ulist)