• 爬虫大作业


    import requests
    from bs4 import BeautifulSoup
    
    
    def catchSoup(url, timeout=10):
        """Download *url* and return it parsed as a BeautifulSoup tree.

        Args:
            url: Absolute URL of the page to fetch, e.g.
                'http://www.18ladys.com/post/buchong/'.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests.get`` may block forever.

        Returns:
            A ``BeautifulSoup`` object built with the 'html.parser' backend.

        Raises:
            requests.RequestException: on connection failure, timeout, or
                an HTTP error status.
        """
        res = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page as content.
        res.raise_for_status()
        # The response text is decoded as UTF-8 regardless of the headers.
        res.encoding = 'utf-8'
        return BeautifulSoup(res.text, 'html.parser')
    
    def kindSearch(soup):
        """Collect ``[label, href]`` pairs from the ``<li>`` items of a page.

        Every ``<li>`` element whose text is not '首页' contributes one
        entry made of its text and the ``href`` of its first ``<a>`` child.
        Items that contain no link, or whose link has no ``href``, are
        skipped rather than raising.

        Args:
            soup: Parsed page exposing ``select`` (a BeautifulSoup tree).

        Returns:
            A list of two-element lists ``[text, href]`` in document order.
        """
        herbKind = []
        for item in soup.select('li'):
            if item.text == '首页':
                continue
            anchors = item.select('a')
            if not anchors:
                # An <li> without a link cannot be followed; skip it.
                continue
            href = anchors[0].attrs.get('href')
            if href is None:
                continue
            herbKind.append([item.text, href])
        return herbKind
    
    
    def _stripSuffix(text, suffix):
        """Return *text* without a trailing *suffix*, if it ends with it."""
        if suffix and text.endswith(suffix):
            return text[:-len(suffix)]
        return text


    def nameSearch(soup):
        """Extract names from the ``<h3>`` headings of a page.

        For each heading the text before the first '_' is taken, a trailing
        '图片' is removed, leading non-breaking spaces are removed, and
        finally a trailing '的功效与作用' is removed.

        The suffixes are removed as whole strings. ``str.rstrip`` treats its
        argument as a set of characters, which would also eat the end of any
        name that happens to finish with one of those characters.

        Args:
            soup: Parsed page exposing ``select`` (a BeautifulSoup tree).

        Returns:
            A list of cleaned heading strings in document order.
        """
        herbName = []
        for heading in soup.select('h3'):
            pername = heading.text.split('_')[0]
            pername = _stripSuffix(pername, '图片')
            # NOTE(review): the published source shows 'xa0' here; this is
            # taken to be a '\xa0' (non-breaking space) escape that lost its
            # backslash -- confirm.
            pername = pername.lstrip('\xa0')
            pername = _stripSuffix(pername, '的功效与作用')
            herbName.append(pername)
        return herbName
    
    def perPage(soup):
        """Return the pagination links of a page, minus the first and last.

        All ``<a>`` elements inside elements matching '.post.pagebar' are
        collected as ``[text, href]`` pairs. The first and the last collected
        link are then dropped by position (presumably navigation controls
        rather than page numbers -- confirm against the site markup).

        Dropping by position instead of ``list.remove`` matters when the same
        link occurs twice: ``remove`` deletes the first equal element, not
        necessarily the one at the end. Slicing also makes a page with fewer
        than two links yield an empty list instead of raising.

        Args:
            soup: Parsed page exposing ``select`` (a BeautifulSoup tree).

        Returns:
            A list of two-element lists ``[text, href]``.
        """
        kindPage = [
            [detail.text, detail.attrs['href']]
            for bar in soup.select('.post.pagebar')
            for detail in bar.select('a')
        ]
        return kindPage[1:-1]
    def herbDetail(kind):
        """Scrape every page of one category and return the names found.

        The category list is read from the landing page; the entry at index
        *kind* supplies the category label and the URL of its first page.
        Names are collected from that first page and from each page returned
        by ``perPage`` for it.

        Args:
            kind: Zero-based index into the list returned by ``kindSearch``
                for the landing page.

        Returns:
            A list with one list of names per scraped page, the first page
            first.

        Raises:
            IndexError: if *kind* is outside the category list.
        """
        soup = catchSoup('http://www.18ladys.com/post/buchong/')
        # Parse the category list once and unpack both fields from it.
        kindName, adds = kindSearch(soup)[kind][:2]
        print("正在爬取 " + str(kind) + '.' + kindName)

        # Download the first page once; it provides both its own names and
        # the links to the remaining pages.
        firstPage = catchSoup(adds)
        totalRecord = [nameSearch(firstPage)]
        for add in perPage(firstPage):
            pageAdd = add[1]
            totalRecord.append(nameSearch(catchSoup(pageAdd)))
        print(totalRecord)
        return totalRecord
    
    
    if __name__=="__main__":
        # Entry point: scrape up to the first 20 categories and append a
        # plain-text summary to herbDetail.txt.
        totalKind = kindSearch(catchSoup('http://www.18ladys.com/post/buchong/'))

        # Output is gathered as pieces and joined once at the end.
        pieces = ['目录:\n']
        # Table of contents: every category, numbered from 1.
        for number, entry in enumerate(totalKind, 1):
            pieces.append(str(number) + '.' + entry[0] + ' ')

        # Never index past the categories that actually exist.
        for kind in range(min(20, len(totalKind))):
            totalRecord = herbDetail(kind)
            if kind == 0:
                # NOTE(review): category 0 is scraped but its records are
                # not written, as in the original flow -- confirm intent.
                continue
            pieces.append('\n' + str(totalKind[kind][0]) + ':\n')
            # NOTE(review): only the first name of each page is written,
            # numbered by page position, as in the original -- confirm.
            for number, page in enumerate(totalRecord, 1):
                if page:
                    pieces.append(str(number) + '.' + page[0] + ' ')

        detailContent = ''.join(pieces)
        # The write stays inside the guard so importing this module neither
        # touches the file nor references an undefined name.
        with open('herbDetail.txt', 'a+', encoding='utf-8') as f:
            f.write(detailContent)





  • 相关阅读:
    hdoj 1010-Tempter of the Bone
    leetcode 91. 解码方法
    leetcode 925. 长按键入
    leetcode 437. 路径总和 III
    leetcode 892. 三维形体的表面积
    二分查找
    HBASE 安装
    Linux 日常指令
    Linux Centos7 配置代理
    Linux ssh 免密
  • 原文地址:https://www.cnblogs.com/SOLARLKS/p/8970848.html
Copyright © 2020-2023  润新知