• 爬取淘女郎模特个人信息数据和写真


    趁着国庆节有时间,帮人写了个爬取淘女郎模特动态加载的图片的爬虫,还有爬取模特们的个人信息数据,这个爬虫花了3天时间,因为图片是异步加载的所以爬取
    的复杂度有点大,最终我通过研究URL的变化,构造新的URL来进行持续性爬取,不过爬取速度真心慢(查看了cpu的利用率还有很多没有利用到),我准备把多线程加进去
    说实话不太好加,有点头大。

      1 # -*- coding: utf-8 -*-
      2 import requests,time,re
      3 import threadpool
      4 import json,os,redis
      5 import xlwt,xlrd,random
      6 import urllib.request
      7 from lxml import etree
      8 
      9 url = 'https://mm.taobao.com/search_tstar_model.htm?spm=5679.126488.640745.2.38a457c5mEWpPl&style=&place=city%3A%E5%B9%BF%E5%B7%9E'
     10 url1='https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
     11 
     12 
     13 class tn(object):
     14 
     15     headers = {
     16         'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
     17         'cookie': 'thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; linezing_session=dTr7U8KOxddDd4CJ4VFihDPH_1506516599348AOqB_1; v=0; _tb_token_=5b9375847e363; _m_h5_tk=4efc0ba8d72376fa1968a3f0a92f0eef_1506518851245; _m_h5_tk_enc=e10a67c79b8a47f97dd3134779acfdfe; uc3=sg2=URsQfTD%2BFY9mkKOl%2FNBXqNFPPUNKq8HjGx%2Bair7O99U%3D&nk2=UoCKEw%2B1myb2u1mo&id2=UoCJiFOLhjN6OQ%3D%3D&vt3=F8dBzWk7FANQZ7%2B830Y%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D; existShop=MTUwNjk0NjQwMQ%3D%3D; uss=VFcwj3YmzKmO6xkbkJFH%2FN%2FOd2CPNJzRBWBygIM3IKKXIgbm1DSeGb87; lgc=1132771621aa; tracknick=1132771621aa; cookie2=11aae79de97ae344158e4aa965c7003c; sg=a2d; cookie1=Aihx9FxoyUYIE7uEPgeqstl%2B5uvfGslyiCQ%2FpePYriI%3D; unb=1100473042; skt=11ea4b0360e50e08; t=b63e6968872da200706b694d67c62883; _cc_=UtASsssmfA%3D%3D; tg=0; _l_g_=Ug%3D%3D; _nk_=1132771621aa; cookie17=UoCJiFOLhjN6OQ%3D%3D; cna=1/K/EZz4HDECAXhVTdivCBle; mt=ci=45_1; isg=Anx8i770Cd1_Zz2Ede24lLr7TRqCdCGCcQXP_1b95GdKIR2rfoTZL77ZdX-i; JSESSIONID=F34B74BB5A7A0A1BF96E8B3F2C02DE87; uc1=cookie14=UoTcCfmfxB%2Fd7g%3D%3D&lng=zh_CN&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&existShop=false&cookie21=U%2BGCWk%2F7owY3i1vB1W2BgQ%3D%3D&tag=8&cookie15=UtASsssmOIJ0bQ%3D%3D&pas=0',
     18 
     19     }
     20 
     21     def getUrlinfo(self,page):
     22 
     23         datas=[]
     24         pageurl = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
     25 
     26         data = {
     27             'q':'',
     28             'viewFlag':'A',
     29             'sortType':'default',
     30             'searchStyle':'',
     31             'searchRegion':'city:',
     32             'searchFansNum':'',
     33             'currentPage':'%s'%(page),
     34             'pageSize':'100'
     35         }
     36         try:
     37             while True:
     38                 time.sleep(1)
     39                 reqs = requests.post(pageurl,data=data,headers=self.headers,timeout=5)
     40                 if reqs.status_code ==200:
     41                     break
     42                 else:
     43                     print('field')
     44         except Exception as e:
     45             print('error:',e)
     46         dictx = json.loads(str(reqs.text))
     47         t = dictx['data']['searchDOList']
     48         for i in t:
     49             r =  i['realName'],i['height'],i['weight'],i['city'],i['userId']
     50             #userid = i['userId']
     51             datas.append(r)
     52         return datas  #返回淘女郎信息数据
     53 
     54     def getImages(self,rs):
     55 
     56         a=0
     57         for id in rs:
     58             #print(id)
     59             os.mkdir(os.path.join('D:\SpiderProject\ZhiHu\taonvlang\img', str(id[0])))
     60             imagurl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20='+str(id[4])
     61 
     62             try:
     63 
     64                 html1 = requests.get(imagurl,headers=self.headers,timeout=5)
     65                 reqsones = str(html1.text)
     66                 #print(reqsones)
     67             except Exception as e:
     68                 print('error:',e)
     69             urls = etree.HTML(reqsones)
     70             imagesurl = urls.xpath('//a[@class="mm-first"]/@href')#获取淘女郎对应相册url
     71             #print(imagesurl)
     72             ad = 'album_id=d{11}|album_id=d{8}|album_id=d{9}'#获取album_id
     73             album_id = re.compile(ad)
     74             result = album_id.findall(str(imagesurl))
     75 
     76             for im in result:
     77                 pturl = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=%s&%s&top_pic_id=0&page=0'%(id[4],im)
     78                 time.sleep(1)
     79                 html5 = requests.get(pturl,headers=self.headers,timeout=5)#获取json图片URL
     80                 print('获取json图片URL')
     81                 jsons =json.loads(str(html5.text))
     82                 try:
     83                     pic = jsons['picList']
     84                 except KeyError as e:
     85                     print('Error:',e)
     86 
     87                 for ius in pic:
     88                         a+=1
     89                         iu = ius['picUrl']
     90                         imurl = 'http:'+ str(iu)
     91                         filename = os.path.join('D:\SpiderProject\ZhiHu\taonvlang\img\%s'%(id[0]),str(a)+'.jpg')
     92                         print('开始下载图片')
     93                         try: 
     94                             file = urllib.request.urlretrieve(str(imurl),filename)
     95                         except urllib.error.HTTPErro as e:
     96                             print('Error:',e)
     97 
     98     def getInfophone(self):
     99 
    100         userurl = 'https://mm.taobao.com/self/aiShow.htm?spm=719.7763510.1998643336.1.6WJFuT&userId=277949921'
    101         html = requests.get(userurl,headers=self.headers,timeout=5)
    102         html.encoding = 'GBK'
    103         print(html.encoding)
    104         selector = etree.HTML(str(html.text))
    105         phone = selector.xpath('//strong[@style="font-family: simhei;color: #000000;font-size: 24.0px;line-height: 1.5;"]|//span[@style="font-size: 24.0px;"]/text()')
    106 
    107     def saveInfo(self,p):
    108 
    109         a,b = 1,0
    110         workbook = xlwt.Workbook(encoding='ascii')
    111         worksheet = workbook.add_sheet('My Worksheet')
    112         worksheet.write(0,0, label='姓名')
    113         worksheet.write(0, 1, label='身高')
    114         worksheet.write(0, 2, label='体重')
    115         worksheet.write(0, 3, label='城市')
    116 
    117         while a<=30 or b<=5:
    118             for names in p:
    119                 n = names[0]
    120                 w = names[1]
    121                 h = names[2]
    122                 c = names[3]
    123                 a+=1
    124                 worksheet.write(3, 5, label=str(n))
    125         workbook.save('Excel_Workbook.xls')
    126 
    127 if __name__ =="__main__":
    128     t = tn()
    129     for ii in range(1):
    130             rs = t.getUrlinfo(ii)
    131             #print(rs)
    132             t.getImages(rs)
    133         #t.saveInfo(p)
    134     #t.getInfophone()
    
    
    




    下面是运行代码截图
     
  • 相关阅读:
    [CTSC2017]吉夫特(Lucas定理,DP)
    [CTSC2017]游戏(Bayes定理,线段树)
    [BZOJ3551][ONTAK2010]Peaks(加强版)(Kruskal重构树,主席树)
    [BZOJ4337][BJOI2015]树的同构(树的最小表示法)
    [BZOJ3786]星系探索(伪ETT)
    [CTSC2017]密钥
    PKUSC2018训练日程(4.18~5.30)
    [NOI2016]优秀的拆分
    [SDOI2008]Sandy的卡片
    [JSOI2007]字符加密
  • 原文地址:https://www.cnblogs.com/Huangsh2017Come-on/p/7624033.html
Copyright © 2020-2023  润新知