• Python爬虫(三)爬淘宝MM图片


    直接上代码:

    # python2
    # -*- coding: utf-8 -*-
    
    import urllib2
    import re
    import string
    import os
    import shutil
    
    def _read_url(request):
        """Return the full response body for *request* and always close the connection.

        *request* may be a URL string or a ``urllib2.Request`` object.
        """
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            response.close()


    def crawl_taobaoMM(baseUrl, start, end):
        """Scrape profile data and photos from the paginated listing at *baseUrl*.

        For every page number from *start* to *end* (both inclusive) the page
        ``baseUrl?page=<n>`` is fetched and decoded as GBK.  Each profile matched
        on the page yields five fields (image URL, name, age, city, occupation).
        The fields are appended as one UTF-8 line to ``mm.txt`` and the image is
        saved as ``mm_img/<5-digit counter>.jpg``.

        Args:
            baseUrl: Listing URL without the ``?page=`` query string.
            start: First page number to fetch.
            end: Last page number to fetch (inclusive).

        Raises:
            urllib2.URLError: If a page or an image cannot be fetched.
        """
        imgDir = 'mm_img'
        # Always start from an empty image directory.  The directory has to be
        # recreated after removing a stale one, otherwise saving the first
        # picture fails because its parent directory no longer exists.
        if os.path.exists(imgDir):
            shutil.rmtree(imgDir)
        os.makedirs(imgDir)

        fileName = 'mm.txt'

        # Loop invariants: request headers and the compiled profile pattern.
        userAgent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)'
                     ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
        headers = {'user-agent': userAgent}
        # Captured groups: image URL, name, age, city, occupation.
        searchPattern = (r'<div class="personal-info">.*?<img src="//(.*?)".*?<a class="lady-name".*?>(.*?)'
                         r'</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>.*?<em>(.*?)</em>')
        # re.S lets '.' span newlines, since one profile covers several HTML lines.
        searchObj = re.compile(searchPattern, re.S)

        picNumber = 0
        with open(fileName, 'a') as f:
            for i in range(start, end + 1):
                url = baseUrl + '?page=' + str(i)
                req = urllib2.Request(url, headers=headers)
                response = _read_url(req).decode('gbk')
                results = searchObj.findall(response)

                print('' + str(i) + '页...')
                for result in results:
                    message = '%s %s %s %s %s\n' % (result[0], result[1], result[2], result[3], result[4])
                    print(picNumber)
                    print(message)
                    f.write(message.encode('utf-8'))
                    # The pattern captures the image URL without its '//' scheme prefix.
                    pic = _read_url('https://' + result[0])
                    picName = os.path.join(imgDir, '%05d.jpg' % picNumber)
                    with open(picName, 'wb') as pf:
                        pf.write(pic)
                    picNumber += 1
    
    # Script entry point: crawl listing pages 1 through 10 (both inclusive).
    crawl_taobaoMM('https://mm.taobao.com/json/request_top_list.htm', 1, 10)

    爬下来的图片:

    参考资料:

    Python爬虫实战四之抓取淘宝MM照片

  • 相关阅读:
    技术人生:墨菲定律
    Ioc:Autofac Registration Concepts
    Ioc:autofac lifetime scope.
    Ioc:The basic pattern for integrating Autofac into your application
    Logstash filter 插件之 date
    配置 Elasticsearch 集群
    Linux 命名管道
    Linux 管道
    Golang 入门 : channel(通道)
    Golang 入门 : 竞争条件
  • 原文地址:https://www.cnblogs.com/gattaca/p/6930592.html
Copyright © 2020-2023  润新知