爬虫大作业

import requests
from bs4 import BeautifulSoup

def catchSoup(url, timeout=30):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The response body is always decoded as UTF-8 (overriding whatever
    encoding ``requests`` guessed) and parsed with the built-in
    ``html.parser`` backend.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up.  Without a timeout a stalled connection would block the
            whole crawl indefinitely.

    Returns:
        The parsed ``BeautifulSoup`` document.
    """
    res = requests.get(url, timeout=timeout)
    # Force UTF-8 so ``res.text`` is decoded consistently for every page.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')

def kindSearch(soup):
    """Collect the category entries listed in the ``<li>`` items of a page.

    Every ``<li>`` whose text is not ``'首页'`` contributes one
    ``[text, href]`` pair, where *href* comes from the first ``<a>`` found
    inside the item.  Items that contain no link, or whose first link has
    no ``href`` attribute, are skipped instead of raising.

    Args:
        soup: Parsed page (anything offering ``select`` the way
            BeautifulSoup does).

    Returns:
        A list of ``[text, href]`` lists in document order.
    """
    herbKind = []
    for item in soup.select('li'):
        if item.text == '首页':
            continue
        anchors = item.select('a')
        if not anchors:
            # A list item without a link cannot be followed.
            continue
        href = anchors[0].attrs.get('href')
        if not href:
            continue
        herbKind.append([item.text, href])
    return herbKind

def _stripSuffix(text, suffix):
    """Return *text* without one trailing occurrence of *suffix*."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def nameSearch(soup):
    """Extract the cleaned-up entry names from the ``<h3>`` headings of a page.

    For each heading the text before the first ``'_'`` is kept, a trailing
    ``'图片'`` is removed, leading non-breaking spaces are dropped, and
    finally a trailing ``'读书'`` is removed.

    Suffixes are removed as whole strings.  ``str.rstrip`` treats its
    argument as a *set of characters*, which would also eat the end of
    names that legitimately finish with one of those characters
    (e.g. ``'冰片图片'`` would become ``'冰'``).

    Args:
        soup: Parsed page (anything offering ``select`` the way
            BeautifulSoup does).

    Returns:
        A list of name strings, one per ``<h3>``, in document order.
    """
    herbName = []
    for heading in soup.select('h3'):
        pername = heading.text.split('_')[0]
        # '\xa0' is the non-breaking space that pads the heading text.
        pername = _stripSuffix(pername, '图片').lstrip('\xa0')
        pername = _stripSuffix(pername, '读书')
        herbName.append(pername)
    return herbName

def perPage(soup):
    """List the pagination links of a page, minus the first and last one.

    All ``<a>`` elements inside every ``.post.pagebar`` element are
    gathered as ``[text, href]`` pairs; the first and the last gathered
    link are then discarded.
    # NOTE(review): presumably those two are the current-page and
    # "next"/"last" links - confirm against the target site.

    Slicing is used for the trimming so that a page with no pagination
    bar (or with fewer than three links) yields an empty list instead of
    raising ``IndexError``, and so that the *last* entry is dropped even
    when an identical entry appears earlier in the list.

    Args:
        soup: Parsed page (anything offering ``select`` the way
            BeautifulSoup does).

    Returns:
        A list of ``[text, href]`` lists in document order.
    """
    kindPage = []
    for bar in soup.select('.post.pagebar'):
        for link in bar.select('a'):
            kindPage.append([link.text, link.attrs['href']])
    return kindPage[1:-1]
def herbDetail(kind):
    """Crawl every page of one category and return the names found.

    The category list is read from the fixed index page; entry *kind* of
    that list supplies the category name and the address of its first
    page.  The first page is downloaded once and used both for its own
    names and for discovering the remaining pages.

    Progress and the final result are printed to stdout.

    Args:
        kind: Zero-based index into the list returned by ``kindSearch``.

    Returns:
        A list with one element per crawled page, each element being the
        list of names returned by ``nameSearch`` for that page.

    Raises:
        IndexError: If *kind* is outside the range of available categories.
    """
    soup = catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml')
    # Parse the category list once instead of once per field.
    kindName, adds = kindSearch(soup)[kind]
    print("正在爬取 " + str(kind) + '.' + kindName)

    # Fetch the first page a single time; it feeds both the name
    # extraction and the pagination lookup.
    firstPage = catchSoup(adds)
    totalRecord = [nameSearch(firstPage)]
    for add in perPage(firstPage):
        pageAdd = add[1]
        totalRecord.append(nameSearch(catchSoup(pageAdd)))
    print(totalRecord)
    return totalRecord

if __name__ == "__main__":
    # Crawl up to the first 20 categories of the index page and append a
    # one-line summary (directory + per-category entries) to herbDetail.txt.
    totalKind = kindSearch(catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml'))
    totalRecord = []
    parts = []

    # Never index past the categories that were actually found.
    for kind in range(min(20, len(totalKind))):
        totalRecord = herbDetail(kind)
        if kind == 0:
            # The first pass writes the directory of all categories.
            # NOTE(review): as in the original script, the records crawled
            # for category 0 are not written to the file - confirm intended.
            parts.append('目录： ')
            for number, entry in enumerate(totalKind, 1):
                parts.append(str(number) + '.' + entry[0] + ' ')
            continue

        parts.append(' ' + str(totalKind[kind][0]) + ': ')
        # One entry per crawled page: the page number and its first name.
        for number, names in enumerate(totalRecord, 1):
            if names:  # a page without any heading has nothing to report
                parts.append(str(number) + '.' + names[0] + ' ')

    detailContent = ''.join(parts)

    # 'a+' keeps earlier runs' output; ``with`` closes the file on errors too.
    with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
        f.write(detailContent)

相关阅读:
一个Mini的ASP.NET Core框架的实现
DDD领域驱动设计理论篇
你准备好了在云中工作吗？
FFM原理及公式推导
IOS学习：常用第三方库（GDataXMLNode：xml解析库）
网络数据的XML解析
iOS开发之html解析
iOS解析HTML
iOS设计模式——MVC（Model-View-Controller）
iOS设计模式——Category

原文地址：https://www.cnblogs.com/yh5788lz/p/8970978.html