逻辑思路是什么?
1. 获取页面
2. 处理页面,提取信息
3. 格式输出
先走面向过程编程:
1. 要定义3个函数,对应以上三个过程
2. 在 `if __name__ == '__main__':` 入口中传入参数,并依次执行以上三个过程
#!/usr/bin/python3
import bs4
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Download ``url`` and return the decoded page text.

    Returns an empty string when the request fails (connection error,
    timeout, invalid URL, or a non-success HTTP status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only swallow request failures; let KeyboardInterrupt etc. propagate.
        return ""


def fillUnivList(ulist, html):
    """Parse the ranking table in ``html`` and append its rows to ``ulist``.

    For every ``<tr>`` tag directly under the first ``<tbody>``, appends
    ``[cell 0, cell 1, cell 3]`` (the values later printed under the
    rank / school name / total score headers). ``ulist`` is mutated in
    place; nothing is returned.

    If ``html`` has no ``<tbody>`` (for example the empty string returned by
    a failed download), ``ulist`` is left unchanged. Rows with fewer than
    four ``<td>`` cells are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Each row is a sequence whose first three items are printed centred in
    10-character columns. Fewer rows are printed when ``ulist`` is shorter
    than ``num``.
    """
    tplt = "{0:^10} {1:{3}^10} {2:^10}"
    # U+3000 (full-width space) is the fill character for the name column.
    pad = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", pad))
    # max(..., 0) keeps "print nothing" for a negative num, as range() did.
    for u in ulist[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], pad))


if __name__ == '__main__':
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)  # print the first 20 universities
如何走向面向对象?
1. 输入: url + 想要获得的信息条数
2. 输出: 格式化信息
3. 获取页面和处理页面应作为私有方法,不应该对外暴露
#!/usr/bin/python3
import requests
import bs4
from bs4 import BeautifulSoup


class SchoolMessage(object):
    """Scrape a university ranking page and print its first rows.

    Args:
        url: Address of the ranking page to download.
        number: Maximum number of ranking rows to print.
    """

    def __init__(self, url, number):
        self.url = url
        self.number = number

    def __get_html(self):
        """Download ``self.url``; return the page text, or None on failure."""
        try:
            r = requests.get(self.url, timeout=30)
            r.raise_for_status()
            # Decode with the encoding guessed from the body content.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only swallow request failures; let KeyboardInterrupt etc. propagate.
            return None

    def __get_message(self):
        """Extract ranking rows from the downloaded page.

        Returns:
            A list of ``[cell 0, cell 1, cell 3]`` lists, one per ``<tr>``
            tag under the first ``<tbody>``, or None when the download
            failed or the page contains no ``<tbody>``.
        """
        html = self.__get_html()
        if html is None:
            return None
        soup = BeautifulSoup(html, 'html.parser')
        tbody = soup.find('tbody')
        if tbody is None:
            return None
        info = []
        for row in tbody.children:
            # .children also yields whitespace text nodes; keep real tags only.
            if not isinstance(row, bs4.element.Tag):
                continue
            tds = row('td')
            if len(tds) < 4:
                continue
            # NOTE(review): this previously read cell 2 for the column printed
            # as "总分"; the procedural scraper in this file reads cell 3 for
            # the same URL, so cell 3 is used here too - confirm against the
            # live page layout.
            info.append([tds[0].string, tds[1].string, tds[3].string])
        return info

    def get_message(self):
        """Print a header and at most ``self.number`` ranking rows.

        Prints a failure message instead when the page could not be
        downloaded or parsed.
        """
        info = self.__get_message()
        if info is None:
            print('爬取失败')
            return
        temp = "{0:^10} {1:{3}^10} {2:^10}"
        # U+3000 (full-width space) is the fill character for the name column.
        pad = chr(12288)
        print(temp.format("排名", "学校名称", "总分", pad))
        # Slice rather than index so a short result list cannot raise
        # IndexError; max(..., 0) keeps "print nothing" for a negative count.
        for u in info[:max(self.number, 0)]:
            print(temp.format(u[0], u[1], u[2], pad))


if __name__ == '__main__':
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    school_1 = SchoolMessage(url, 10)
    school_1.get_message()
所需要的环境:
python 3.5
requests 库
beautifulsoup4 库 (导入名为 bs4)