"""
# - Hands-on practice: scrape the 2019 Chinese university ranking and print it as a table
# - Tutorial: https://www.bilibili.com/video/av9784617?p=32
"""
import requests
from bs4 import BeautifulSoup
import bs4
def request_url(url, *params, timeout=10):
    """
    Send a GET request and return the response when the server answers 200.

    The final request URL, the status code and the encoding reported by the
    response are printed as progress output. The response encoding is then
    replaced by the encoding detected from the body (``apparent_encoding``).

    :param url: address to fetch.
    :param params: optional query parameters. Each positional argument is
        either a dict (e.g. ``{'wd': 'search text'}`` for a Baidu search) or
        a single ``(key, value)`` pair; they are combined, in order, into one
        query string.
    :param timeout: seconds to wait for the server before giving up.
    :return: the ``requests.Response`` when the status code is 200; ``None``
        for any other status code; the string ``"返回异常"`` when requests
        raises a ``RequestException`` (connection failure, timeout, ...).
    """
    # ``*params`` always arrives as a tuple, and a tuple wrapping a dict is
    # not a form requests can encode as ``params``. Flatten every argument
    # into plain (key, value) pairs before handing them over.
    query = []
    for item in params:
        if isinstance(item, dict):
            query.extend(item.items())
        else:
            query.append(item)
    # Custom User-Agent instead of the requests default; the intent noted by
    # the original author is to avoid being rejected with 403.
    headers = {'user-agent': 'my-app/0.0.1'}
    try:
        # Only the network call can raise RequestException, so keep the
        # guarded region to this single statement.
        response = requests.get(url, headers=headers, params=query, timeout=timeout)
    except requests.RequestException:
        return "返回异常"
    print(response.request.url)
    print('访问状态:', response.status_code)
    # Printed before the override below, so this shows the encoding as
    # initially reported by requests.
    print('编码方式:', response.encoding)
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        return response
    return None
def fillUnivList(ulist, html):
    """
    Append one four-item entry per table row found in the page.

    Only the direct children of the first ``<tbody>`` are visited. For every
    row the ``.string`` of its first four ``<td>`` cells is collected
    (presumably rank, university, location and score, matching the header
    printed by ``PrintUnivList`` -- confirm against the page layout).

    :param ulist: list that receives the rows; it is modified in place.
    :param html: HTML text of the ranking page (``response.text``).
    :return: the same ``ulist`` object, with the extracted rows appended.
        It is returned unchanged when the page has no ``<tbody>``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # No table body in the document: nothing to extract.
        return ulist
    for tr in tbody.children:
        # ``children`` also yields whitespace/text nodes between the rows;
        # only real tags can hold cells.
        if not isinstance(tr, bs4.element.Tag):
            continue
        td_list = tr.find_all('td')
        if len(td_list) < 4:
            # Header or malformed row without the four expected cells.
            continue
        # ``.string`` is None when a cell holds more than one child node.
        ulist.append([td.string for td in td_list[:4]])
    return ulist
def PrintUnivList(list2):
    """
    Print the ranking as a tab-separated table with centred columns.

    A header line is printed first, then one line per entry of ``list2``.

    :param list2: iterable of rows; items 0-3 of each row are printed under
        the four header columns. A ``None`` item is shown as an empty cell.
    :return: None; the table is written to standard output.
    """
    # chr(12288) is U+3000 (ideographic space). It is the fill character for
    # the two middle columns so that their padding is made of full-width
    # characters like the CJK text they hold.
    fill = chr(12288)
    tplt = '{0:^5}\t{1:{4}^10}\t{2:{5}^6}\t{3:^6}'
    print(tplt.format('排名', '大学', '位置', '评分', fill, fill))
    for per_univ in list2:
        # None does not support the '^' alignment spec and would raise
        # TypeError, so render missing values as empty cells instead.
        cells = ['' if per_univ[i] is None else per_univ[i] for i in range(4)]
        print(tplt.format(cells[0], cells[1], cells[2], cells[3], fill, fill))
if __name__ == "__main__":
    # Chinese university ranking, 2019 edition.
    list1 = []
    # Fetch the ranking page directly rather than through the link-rewriting
    # mirror host that had been prepended to this URL.
    url1 = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
    response = request_url(url=url1)
    # request_url returns None (non-200 status) or an error string when the
    # request fails; neither has a ``.text`` attribute, so stop here.
    if not isinstance(response, requests.Response):
        raise SystemExit('请求失败: ' + url1)
    ulist = fillUnivList(ulist=list1, html=response.text)
    PrintUnivList(list2=ulist)
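# 爬虫2_2019年549所中国大学排名 (Crawler 2: 2019 ranking of 549 Chinese universities)
# 最新推荐文章于 2020-12-19 12:10:10 发布 (blog page footer: "latest recommended article published 2020-12-19 12:10:10")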