在学习了嵩天老师的《Python网络爬虫与信息提取课程》之后,我自己尝试按照老师的步骤做了一个小demo。
目标:爬取“有趣网址之家”首页中列出的有趣网站(网站名称及其链接)。
url = "https://youquhome.com/"
代码如下:
import requests
from bs4 import BeautifulSoup


def getUrlText(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Decode with the encoding detected from the content instead of the
    # one guessed from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text


def getUrlList(ulist, html):
    """Collect the links found inside ``<h1 class="entry-title">`` headings.

    Args:
        ulist: List that receives one ``[href, name]`` pair per link; it is
            extended in place.
        html: HTML source of the page to scan.

    Returns:
        The same ``ulist`` object, for convenience.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Search each heading directly rather than serialising the result list
    # back to a string and parsing it a second time.
    for heading in soup.find_all('h1', class_="entry-title"):
        for link in heading.find_all('a'):
            # ``.string`` is None when the anchor has nested markup, so
            # fall back to the concatenated text of the anchor.
            name = link.string or link.get_text(strip=True)
            ulist.append([link.get('href'), name])
    return ulist


def printFunUrl(ulist):
    """Print the collected sites as a two-column table (URL, site name).

    Args:
        ulist: Sequence of ``[href, name]`` pairs as built by
            ``getUrlList``. A ``None`` href or name is printed as an empty
            cell instead of raising ``TypeError``.
    """
    tplt = "{0:^10} {1:{2}^10}"
    # chr(12288) is the full-width (ideographic) space, used as the fill
    # character for the site-name column.
    fill = chr(12288)
    print(tplt.format("网址", "网站名", fill))
    for entry in ulist:
        href = "" if entry[0] is None else entry[0]
        name = "" if entry[1] is None else entry[1]
        print(tplt.format(href, name, fill))


def main():
    """Download the home page, extract the site links and print them."""
    ulist = []
    url = "https://youquhome.com/"
    html = getUrlText(url)
    getUrlList(ulist, html)
    printFunUrl(ulist)


# Only crawl when executed as a script, so importing this module does not
# trigger a network request.
if __name__ == "__main__":
    main()