• 京东商品信息及其价格爬虫 (a JD.com product-information and price scraper)


    Requires Python 2.7
    # -*- coding:utf-8 -*-  
    
    #导入模块
    import urllib2,re,urllib
    from bs4 import BeautifulSoup
    import json,time
    import sys  
    reload(sys)  
    sys.setdefaultencoding('utf8') 
    
    fout = open(r'res.txt', "wb")
    tot = 0
    
    #定义抓取类
    class JD:
        #记录抓取产品个数
        prodNum = 1
        #初始化参数
        def __init__(self,baseurl,page):
            self.baseurl = baseurl
            self.page = page
            #拼装成url
            self.url = self.baseurl+'&'+'page='+str(self.page)
    
        def getHtml(self,url):
            try:
                #请求抓取对象
                request = urllib2.Request(url)
                #响应对象
                reponse = urllib2.urlopen(request)
                #读取源代码
                html = reponse.read()
                #返回源代码
            except:
                time.sleep(0.1)
                return self.getHtml(url)
            return html
    
    
        #获取总页数
        def getNum(self,html):
            #封装成BeautifulSoup对象
            soup = BeautifulSoup(html)
            #定位到总页数节点
            items = soup.find_all('span',class_='p-skip')
            #获取总页数
            for item in items:
                pagenum = item.find('em').find('b').string
            return pagenum
    
        #获取所有产品id列表
        def getIds(self,html):
            #生成匹配规则
            pattern =  re.compile('<a target="_blank" href="//item.jd.com/(.*?).html".*?>')
            #查询匹配对象
            items = re.findall(pattern,html)
            return items
    
        #根据产品id获取同款产品列表
        def getIdByItems(self,id):
            #拼装成url
            url = basePd+str(id)+'.html'
            #调用抓取函数返回源代码
            html = self.getHtml(url)
            # 封装成BeautifulSoup对象
            soup = BeautifulSoup(html)
            #查询匹配对象
            items = []
            items = soup.find('div',class_='dd clearfix')
            l = []
            #生成列表
    
            for item in items:
                pattern = re.compile('href="//item.jd.com/(.*?).html".*?>')
                id = re.findall(pattern,str(item))
                if id:
                    l += id
            return l
    
        #获取产品价格
        def getPrice(self,id):
            url = 'http://p.3.cn/prices/mgets?skuIds=J_'+str(id)
            jsonString = self.getHtml(url)
            jsonObject = json.loads(jsonString.decode())
            price_jd = jsonObject[0]['p']
            price_mk = jsonObject[0]['m']
            fout.write('jd price:'+str(price_jd)+'
    ')
            fout.write('market price:'+str(price_mk)+'
    ')
    
        #获取产品图片
        def getImg(self,html,subid):
            '''
            pattern = re.compile(r'<img id=.*?data-origin="(.*?)" alt=.*?', re.S)
            items = re.findall(pattern, html)
            for item in items:
                imgurl = 'http:%s' % (item)
                urllib.urlretrieve(imgurl, 'd:/temp/jdimg/%s.jpg' % (str(subid)))
                '''
    
        #获取内容
        def getContent(self,html,subid):
            soup = BeautifulSoup(html)
            title = soup.find('div',class_='sku-name')
            fout.write('
    -----------------'+ str(JD.prodNum) +'--------------------
    ')
            try:
                for t in title:
                    fout.write('name:'+t.string+'
    ')
            except:
                return
            time.sleep(1)
            #价格
            self.getPrice(subid)
            #编码
            items1 = soup.find_all('ul',class_='parameter1 p-parameter-list')
            #商品基本信息
            for item in items1:
                p = item.findAll('p')
                for i in p:
                    i.string=""
            # 商品基本信息
            items2 = soup.find_all('ul', class_='parameter2 p-parameter-list')
            for item in items2:
                p = item.findAll('li')
                if len(str(p[0].string))>0:
                    fout.write(str(p[0].string))
                fout.write('
    ')
                '''
                for i in p:
                    if len(str(i.string))>0:
                        fout.write(str(i.string))
                    fout.write('
    ')
                '''
            #规格与包装
            '''
            items3 = soup.find_all('div',class_='Ptable-item')
            for item in items3:
                contents1 = item.findAll('dt')
                contents2 = item.findAll('dd')
                for i in range(len(contents1)):
                    if len(str(contents1[i].string))>0 and len(str(contents2[i].string))>0:
                        fout.write(contents1[i].string)
                        if len(str(contents2[i].string))>0:
                            fout.write(str(contents2[i].string))
                        fout.write('
    ')
            '''
            JD.prodNum += 1
            print JD.prodNum
    
        #启动抓取程序
        def start(self):
            html = spider.getHtml(self.url)
            pageNum = self.getNum(html)
            print 'doing............'
            #time.sleep(3)
            print 'finish. all',pageNum,'pages'
            #time.sleep(1)
            print 'doing.........'
            #循环1--页数
            for page in range(1,int(pageNum)+1):
                url = self.baseurl+'&'+'page='+str(page)
                html = self.getHtml(url)
                ids = self.getIds(html)
                #循环2--产品列表
                for id in ids:
                    urlprod = basePd+str(id)+'.html'
                    htmlprod = self.getHtml(urlprod)
                    '''
                    subids = self.getIdByItems(id)
                    '''
                    self.getContent(htmlprod,id)
                    self.getImg(htmlprod,id)
                    '''
                    #循环3--产品组列表
                    for subid in subids:
                        urlsubprod = basePd+str(subid)+'.html'
                        subhtml = self.getHtml(urlsubprod)
                        time.sleep(1)
                        self.getContent(subhtml,subid)
                        self.getImg(subhtml,subid)
                    '''
    
    
    #产品列表base页
    basePd  = 'http://item.jd.com/'
    #抓取入口URL
    baseURL = 'http://list.jd.com/list.html?cat=9987,653,655'
    #生成爬虫抓取对象
    spider = JD(baseURL,1)
    
    #开始抓取
    spider.start()
  • 相关阅读:
    centos7上修改lv逻辑卷的大小
    centos6上调整lv逻辑卷
    nginx的日志配置
    修改Linux系统默认编辑器
    mysqldump命令的安装
    centos7上设置中文字符集
    nginx的80端口跳转到443
    ubuntu上安装docker和docker-compose
    javascript递归、循环、迭代、遍历和枚举概念
    Lattice 开发工具Diamond 相关版本下载地址
  • 原文地址:https://www.cnblogs.com/qscqesze/p/6820184.html
Copyright © 2020-2023  润新知