直接上代码:
# -*- coding: utf-8 -*-
# Originally written for Python 2; the constructs below also run on Python 3.
import contextlib
import io
import os
import re
import shutil
import string

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: same Request/urlopen API under a new name
    import urllib.request as urllib2

_USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)'
               ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')

# Captured groups, in order: image URL (without scheme), name, age, city,
# occupation.  Compiled once instead of once per page.
_PROFILE_PATTERN = re.compile(
    r'<div class="personal-info">.*?<img src="//(.*?)".*?<a class="lady-name".*?>(.*?)'
    r'</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>.*?<em>(.*?)</em>',
    re.S)


def _fetch(url, headers=None):
    """Return the raw body of *url* and close the connection afterwards."""
    request = urllib2.Request(url, headers=headers or {})
    # Python 2 responses are not context managers, hence contextlib.closing.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return response.read()


def crawl_taobaoMM(baseUrl, start, end, imgDir='mm_img', fileName='mm.txt'):
    """Crawl listing pages ``start``..``end`` (inclusive) of *baseUrl*.

    For every profile matched on a page, the five captured fields are printed
    and appended to *fileName* (UTF-8), and the profile image is downloaded
    into *imgDir* as ``00000.jpg``, ``00001.jpg``, ...

    Args:
        baseUrl: Listing URL; ``?page=<n>`` is appended for each page.
        start: First page number to request.
        end: Last page number to request (inclusive).
        imgDir: Directory for downloaded images.  It is emptied on every call.
        fileName: Text file the profile records are appended to.

    Raises:
        Whatever ``urlopen``, ``bytes.decode('gbk')`` or the file operations
        raise; errors are not swallowed.
    """
    # Start from an empty image directory.  The directory must be recreated
    # after rmtree, otherwise every image write on a re-run fails.
    if os.path.exists(imgDir):
        shutil.rmtree(imgDir)
    os.makedirs(imgDir)

    headers = {'user-agent': _USER_AGENT}
    picNumber = 0
    # io.open with an explicit encoding writes the same UTF-8 bytes on
    # Python 2 and Python 3.
    with io.open(fileName, 'a', encoding='utf-8') as f:
        for i in range(start, end + 1):
            url = baseUrl + '?page=' + str(i)
            response = _fetch(url, headers).decode('gbk')
            results = _PROFILE_PATTERN.findall(response)
            print('第' + str(i) + '页...')
            for result in results:
                message = u'%s %s %s %s %s ' % (result[0], result[1], result[2],
                                                result[3], result[4])
                print(picNumber)
                print(message)
                f.write(message)
                # The pattern captures the image URL without its scheme.
                pic = _fetch('https://' + result[0])
                picName = os.path.join(imgDir, str(picNumber).zfill(5) + '.jpg')
                with open(picName, 'wb') as pf:
                    pf.write(pic)
                picNumber += 1


if __name__ == '__main__':
    crawl_taobaoMM('https://mm.taobao.com/json/request_top_list.htm', 1, 10)
爬下来的图片:
参考资料: