• 爬虫大作业


    import requests
    from bs4 import BeautifulSoup


    def catchSoup(url, timeout=10):
        """Download *url* and return its HTML parsed as a BeautifulSoup tree.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` may block forever on a stalled
                connection, which would hang the whole crawl.

        Returns:
            A ``BeautifulSoup`` object built with the ``html.parser`` backend.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        res = requests.get(url, timeout=timeout)
        # Force UTF-8 decoding instead of relying on the encoding requests
        # guesses from the response headers.
        res.encoding = 'utf-8'
        return BeautifulSoup(res.text, 'html.parser')

    def kindSearch(soup):
        """Collect the category entries listed in the page's ``<li>`` items.

        Every ``<li>`` whose text is not '首页' contributes one entry made of
        the item's text and the ``href`` of the first ``<a>`` inside it.

        Args:
            soup: Parsed page exposing ``select(css)``; each selected item
                must expose ``text`` and ``select``.

        Returns:
            A list of ``[text, href]`` two-element lists, in document order.
        """
        herbKind = []
        for item in soup.select('li'):
            if item.text == '首页':
                continue
            anchors = item.select('a')
            # A list item without a usable link cannot be crawled; skip it
            # instead of letting an IndexError/KeyError abort the whole run.
            if not anchors:
                continue
            href = anchors[0].attrs.get('href')
            if href is None:
                continue
            herbKind.append([item.text, href])
        return herbKind


    def nameSearch(soup):
        """Extract the cleaned entry names from the page's ``<h3>`` headings.

        For each heading, the text before the first '_' is taken, a trailing
        '图片' and then a trailing '读书' are removed, and leading
        non-breaking spaces are stripped.

        Args:
            soup: Parsed page exposing ``select(css)``; each selected heading
                must expose ``text``.

        Returns:
            A list of name strings, one per ``<h3>``, in document order.
        """
        herbName = []
        for heading in soup.select('h3'):
            name = heading.text.split('_')[0]
            # str.rstrip() treats its argument as a set of characters, so
            # rstrip('图片') would also eat a name that merely ends in '片'.
            # Remove the exact suffixes instead.
            for suffix in ('图片', '读书'):
                if name.endswith(suffix):
                    name = name[:-len(suffix)]
            # Strip leading non-breaking spaces. The unescaped 'xa0' would
            # strip the letters x, a and 0 from the start of the name.
            herbName.append(name.lstrip('\xa0'))
        return herbName

    def perPage(soup):
        """Return the pagination links of a category page.

        All ``<a>`` elements inside elements matching ``.post.pagebar`` are
        collected as ``[text, href]`` pairs, then the first and the last
        collected link are dropped (presumably navigation controls rather
        than page links -- confirm against the site's markup).

        Args:
            soup: Parsed page exposing ``select(css)``.

        Returns:
            A list of ``[text, href]`` lists. Empty when the page has fewer
            than three pagination links.

        Raises:
            KeyError: If a pagination ``<a>`` has no ``href`` attribute.
        """
        kindPage = []
        for bar in soup.select('.post.pagebar'):
            for link in bar.select('a'):
                kindPage.append([link.text, link.attrs['href']])
        # Trim by position. list.remove(kindPage[-1]) deletes the *first*
        # entry equal to the last one, which is the wrong element when a link
        # is repeated, and indexing raises IndexError on an empty list.
        return kindPage[1:-1]
    def herbDetail(kind, indexUrl='http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml'):
        """Crawl every page of one category and collect the names on each.

        Args:
            kind: Zero-based index into the list returned by ``kindSearch``
                for the index page.
            indexUrl: Address of the index page listing the categories.

        Returns:
            A list with one element per crawled page (the category's first
            page, then each page returned by ``perPage``); each element is
            the list of names ``nameSearch`` found on that page.

        Raises:
            IndexError: If *kind* is not a valid category index.
        """
        from urllib.parse import urljoin

        # Parse the index page once; the entry holds both the name and link.
        entry = kindSearch(catchSoup(indexUrl))[kind]
        kindName = entry[0]
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        adds = urljoin(indexUrl, entry[1])
        print("正在爬取 " + str(kind) + '.' + kindName)

        # Download the category's first page once and reuse it for both the
        # names and the pagination links.
        firstPage = catchSoup(adds)
        totalRecord = [nameSearch(firstPage)]
        for add in perPage(firstPage):
            pageAdd = urljoin(adds, add[1])
            totalRecord.append(nameSearch(catchSoup(pageAdd)))
        print(totalRecord)
        return totalRecord


    if __name__ == "__main__":
        # Script entry point: crawl up to 20 categories and append a table of
        # contents plus the per-category records to herbDetail.txt.
        totalKind = kindSearch(catchSoup('http://cul.news.sina.com.cn/topline/2018-04-24/doc-ifzqvvsa2785251.shtml'))

        # Build the output from pieces and join once at the end.
        pieces = ['目录: ']
        for number, entry in enumerate(totalKind, 1):
            pieces.append(str(number) + '.' + entry[0] + ' ')

        # Crawl at most 20 categories, but never more than the index page
        # lists; a fixed bound of 20 raises IndexError on a shorter list.
        # Category 0 is included: previously it was downloaded and then
        # discarded, so its records never reached the file.
        for kind in range(min(20, len(totalKind))):
            totalRecord = herbDetail(kind)
            pieces.append(' ' + str(totalKind[kind][0]) + ': ')
            # enumerate numbers pages by position; list.index() would repeat
            # the number of the first equal page when two pages match.
            for number, pageNames in enumerate(totalRecord, 1):
                # NOTE(review): only the first name of each page is written,
                # as in the original -- confirm whether every name on the
                # page was meant to be recorded.
                if pageNames:
                    pieces.append(str(number) + '.' + pageNames[0] + ' ')

        detailContent = ''.join(pieces)
        # The context manager closes the file even if the write fails.
        with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
            f.write(detailContent)

  • 相关阅读:
    一个Mini的ASP.NET Core框架的实现
    DDD领域驱动设计理论篇
    你准备好了在云中工作吗?
    FFM原理及公式推导
    IOS学习:常用第三方库(GDataXMLNode:xml解析库)
    网络数据的XML解析
    iOS开发之html解析
    iOS解析HTML
    iOS设计模式——MVC(Model-View-Controller)
    iOS设计模式——Category
  • 原文地址:https://www.cnblogs.com/yh5788lz/p/8970978.html
Copyright © 2020-2023  润新知