• Web crawler in Python: scraping Tianya job-hunting posts


    Fetch pages with urllib, parse them with BeautifulSoup, and write the results to Excel with xlwt3.

    import urllib.request
    import urllib.error
    from bs4 import BeautifulSoup
    import xlwt3

    wExcel = xlwt3.Workbook()
    sheet1 = wExcel.add_sheet('my', cell_overwrite_ok=True)
    num = 0  # next free row in the worksheet
    def getcontent(url):
        # Download the page and decode it, trying UTF-8 first, then GBK.
        opener = urllib.request.build_opener()
        try:
            content = opener.open(url).read()
        except urllib.error.URLError:
            print('request failed!')
            return None
        try:
            return content.decode('utf-8')
        except UnicodeDecodeError:
            try:
                return content.decode('gbk')
            except UnicodeDecodeError:
                print('decode failed!')
                return None
    def getdetail(url):
        # Return the body text of one post, or None if it cannot be fetched or parsed.
        con = getcontent(url)
        if con:
            soup = BeautifulSoup(con, 'html.parser')
            job = soup.find('div', 'bbs-content clearfix')
            if job:
                return job.get_text()
        return None
    
    def getonepage(url):
        # Scrape one listing page, writing one worksheet row per thread.
        global num
        content = getcontent(url)
        if not content:
            return
        soup = BeautifulSoup(content, 'html.parser')
        for tr in soup.find_all('tr', 'bg'):
            j = 0
            detailurl = 'http://bbs.tianya.cn' + tr.td.a['href']
            detailcon = getdetail(detailurl)
            # Each non-empty string in the row is one cell: title, author, replies...
            for item in tr.strings:
                item = item.strip()
                if item:
                    sheet1.write(num, j, item)
                    j = j + 1
            sheet1.write(num, j, detailcon)  # post body goes in the last column
            num = num + 1
    
    
    
    if __name__ == '__main__':
        mainpage = 'http://bbs.tianya.cn/list.jsp?item=763&sub=2'
        getonepage(mainpage)
        wExcel.save('res0.xls')
        # On the first page the "next page" link is the second <a> in div.links;
        # later pages gain a "previous page" link, so skip one more sibling there.
        i = 0
        soup = BeautifulSoup(getcontent(mainpage), 'html.parser')
        currentpage = soup.find('div', 'links').a.find_next_sibling('a')
        nextpage = 'http://bbs.tianya.cn' + currentpage['href']
        while i < 30:  # follow at most 30 more listing pages
            print(nextpage)
            getonepage(nextpage)
            print('one page finished!')
            con = getcontent(nextpage)
            if con:
                soup = BeautifulSoup(con, 'html.parser')
                currentpage = soup.find('div', 'links').a.find_next_sibling('a').find_next_sibling('a')
                nextpage = 'http://bbs.tianya.cn' + currentpage['href']
                i = i + 1
            else:
                break
        wExcel.save('res.xls')
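
    The sibling-walking pager lookup above is brittle: it silently grabs the wrong link if Tianya reorders the pager. A sketch of a sturdier alternative matches the link by its label instead; the helper name next_page_url and the label text '下一页' ("next page") are assumptions about the markup, not taken from the original.

    from bs4 import BeautifulSoup

    def next_page_url(html, base='http://bbs.tianya.cn'):
        # Return the absolute URL of the pager link labelled "下一页", or None.
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find('div', 'links')
        if links:
            for a in links.find_all('a'):
                # The label text is an assumption about Tianya's pager markup.
                if a.get_text(strip=True) == '下一页' and a.get('href'):
                    return base + a['href']
        return None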
    
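    The script also fires its requests back to back, which busy forums often throttle or block. A minimal sketch of a politer fetch, assuming a one-second delay is acceptable; the helper name polite_fetch and the User-Agent string are arbitrary examples, not from the original.

    import time
    import urllib.request

    def polite_fetch(url, delay=1.0):
        # Identify the client and pause after each request.
        req = urllib.request.Request(
            url, headers={'User-Agent': 'Mozilla/5.0 (study crawler)'})  # example UA
        data = urllib.request.urlopen(req, timeout=10).read()
        time.sleep(delay)  # be gentle with the server
        return data

    Swapping this into getcontent in place of opener.open(url).read() would slow the crawl to roughly one page per second.
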
  • Original post: https://www.cnblogs.com/freeopen/p/5483035.html