• The Road to Python: a Web Crawler Example

This example uses urllib and BeautifulSoup to crawl image galleries from http://www.meitulu.com/item/. It consists of two modules: urlController.py, the entry point that walks the item pages, and bsController.py, which parses each gallery page and saves its images under img/<item id>/.

    urlController.py

    import bsController
    from urllib import request, error
    
    class SpiderMain(object):
        def __init__(self):
            self.header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                   'Accept-Encoding': 'none',
                   'Accept-Language': 'en-US,en;q=0.8',
                   'Connection': 'keep-alive'}
            self.bsManage = bsController.bsManage()
    
        def getUrl(self, rootUrl):
            # Walk the item pages item/1.html through item/499.html
            for i in range(1, 500):
                url = rootUrl + '%s.html' % i
                # Attach the browser-like headers defined in __init__
                req = request.Request(url, headers=self.header)
                try:
                    with request.urlopen(req) as response:
                        html = response.read()
                    self.bsManage.getPageUrl(html, i)
                except error.URLError as e:
                    if hasattr(e, 'code'):
                        print('Error code:', e.code)
                    elif hasattr(e, 'reason'):
                        print('Reason:', e.reason)
    
    
    if __name__=='__main__':
        rootUrl = 'http://www.meitulu.com/item/'
        obj_root = SpiderMain()
        obj_root.getUrl(rootUrl)
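
A quick way to sanity-check the request setup before running the full loop over items 1 to 499 is to fetch a single item page with the same kind of headers. The sketch below is not part of the original code; the fetch helper and the test URL are illustrative assumptions.

    from urllib import request, error

    HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                             '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}

    def fetch(url):
        # Build the request with browser-like headers, as SpiderMain does
        req = request.Request(url, headers=HEADERS)
        try:
            # The context manager closes the connection when the block exits
            with request.urlopen(req) as response:
                return response.read()
        except error.URLError as e:
            print('failed to fetch %s: %s' % (url, e))
            return None

    if __name__ == '__main__':
        html = fetch('http://www.meitulu.com/item/1.html')  # one item page as a smoke test
        if html is not None:
            print('fetched %d bytes' % len(html))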
    

    bsController.py

    from bs4 import BeautifulSoup
    from urllib import request, error
    import os
    
    class bsManage:
        def __init__(self):
            self.pageUrl = 'http://www.meitulu.com/item/'
            self.header = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding': 'none',
                'Accept-Language': 'en-US,en;q=0.8',
                'Connection': 'keep-alive'}
    
        # html is the raw HTML of the fetched page
        # i is the item number in item/<i>_<x>.html
        def getPageUrl(self, html, i):
            soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
            # The second-to-last link in the pager points to the last sub-page
            lastUrl = soup.find_all('div', {'id': 'pages'})[0].find_all('a')[-2]['href']
            # Number of digits in the item id i
            if i < 10:
                digits = 1
            elif i < 100:
                digits = 2
            elif i < 1000:
                digits = 3
            else:
                digits = 4
            # lastUrl looks like http://www.meitulu.com/item/<i>_<page>.html:
            # skip the 28-character prefix, the item id and the underscore,
            # then strip the trailing '.html' to get the last page number
            lastPage = int(lastUrl[29 + digits:-5])
            # Create the folder for this item's images
            if not os.path.exists('img'):
                os.mkdir('img')
            path = 'img/%s' % i
            if not os.path.exists(path):
                os.mkdir(path)
            # Crawl the first page here because its URL format differs
            # from the sub-pages; collect the image links we want
            links = soup.find_all('img', class_='content_img')
            for link in links:
                # Use the tail of the image URL as the file name
                name = str(link['src'])[-21:]
                data = request.urlopen(link['src']).read()
                with open('img/%s/' % i + name, 'wb') as img:
                    img.write(data)
    
            # Each item has lastPage sub-pages in total
            for j in range(2, lastPage + 1):
                # Build the URL of the next sub-page and download its images
                url = self.pageUrl + '%s_%s.html' % (i, j)
                self.saveImgWithUrl(url, i)
            print('item %d has been crawled' % i)
    
        def saveImgWithUrl(self, url, i):
            # Attach the browser-like headers defined in __init__
            req = request.Request(url, headers=self.header)
            try:
                html = request.urlopen(req).read()
                soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
                # Collect the image links we want
                links = soup.find_all('img', class_='content_img')
                for link in links:
                    # Use the tail of the image URL as the file name
                    name = str(link['src'])[-21:]
                    data = request.urlopen(link['src']).read()
                    with open('img/%s/' % i + name, 'wb') as img:
                        img.write(data)
            except error.URLError as e:
                if hasattr(e, 'code'):
                    print('Error code:', e.code)
                elif hasattr(e, 'reason'):
                    print('Reason:', e.reason)
    
  • Original post: https://www.cnblogs.com/xj76149095/p/5851065.html