• python 爬取百度图片


    import requests
    from bs4 import BeautifulSoup
    import re
    import os
    import json
    from urllib import parse
    # Raw header text, one "Name: value" pair per line. It is handed to
    # DownBaiDuImg(...) at the bottom of the script, whose gen_headers()
    # turns it into a dict.
    # NOTE(review): fields such as Content-Length, ETag, Server and Age are
    # HTTP *response* header fields, so this block looks like it was copied
    # from a response rather than written for a request -- confirm before
    # sending these values with any outgoing request.
    headers='''
    Accept-Ranges: bytes
    Access-Control-Allow-Origin: *
    Age: 570820
    Cache-Control: max-age=2628000
    Connection: keep-alive
    Content-Length: 45163
    Content-Type: image/jpeg
    Date: Sat, 11 May 2019 06:17:00 GMT
    ETag: 3448023fd5dc275ff4088c50d1da7d5f
    Expires: Tue, 04 Jun 2019 01:43:20 GMT
    Last-Modified: Thu, 01 Jan 1970 00:00:00 GMT
    Ohc-Response-Time: 1 0 0 0 0 0
    Server: JSP3/2.0.14
    '''

    class DownBaiDuImg(object):
        """Download Baidu image-search results for a single keyword.

        Typical use: build an instance with a raw header block and a keyword,
        then call ``doing(offset)`` once per result page. Downloaded files are
        written to ``image_dir`` and named ``1.<ext>``, ``2.<ext>``, ... in the
        order they are attempted.
        """

        # Directory the images are written to (relative to the working dir).
        image_dir = '../images/'
        # Number of results requested per call to doing().
        page_size = 30

        # Request headers sent with the search-listing request.
        # NOTE(security): this block embeds what looks like a real logged-in
        # session cookie (BDUSS etc.). It is kept unchanged so behaviour does
        # not change, but credentials should not live in source -- load them
        # from the environment or a config file and rotate this one.
        listheader='''
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
    Accept-Encoding: gzip, deflate
    Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
    Cache-Control: max-age=0
    Connection: keep-alive
    Cookie: BDIMGISLOGIN=0; winWH=%5E6_1366x631; BDqhfp=%E6%AF%94%E5%9F%BA%E5%B0%BC%26%26-10-1undefined%26%260%26%261; BAIDUID=ED5602028E2013468035151C8C3C3A53:FG=1; BIDUPSID=ED5602028E2013468035151C8C3C3A53; PSTM=1552569672; BDSFRCVID=ZoFOJeC62GC4q3c9ZolNh5mNHGcamB3TH6aoUWSSBZNRGvSy07o7EG0PqU8g0Kub55HBogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0P3J; H_BDCLCKID_SF=tJAq_D0hfIP3fP36q45Mq4tHen6y0fRZ5mAqoq3nJPD5HITLhPvFM5LDX47x5-oL0J7naIQqaM5RVUOtWxTCQnK92H0f25b43bRTQxKy5KJvfJ_Gjf7IhP-UyN3LWh37bJblMKoaMp78jR093JO4y4Ldj4oxJp8eWJQ2QJ8BJI02MDJP; BDUSS=k5MTWt1V2RvRHRBMVBrUVFMeURRY243ZWRMNDEtMkg1Mm94VnNYcVp5cUh5cmxjQVFBQUFBJCQAAAAAAAAAAAEAAAA64oOWs8y36rPYMQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIc9klyHPZJcW; uploadTime=1557547291054; cleanHistoryStatus=0; BDRCVFR[Tp5-T0kH1pb]=mk3SLVN4HKm; delPer=0; PSINO=1; BDRCVFR[CCf63Vmik7b]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; H_PS_PSSID=1441_28939_28981_21126_28519_28775_28723_28963_28836_28585_26350_22157; indexPageSugList=%5B%22%E6%AF%94%E5%9F%BA%E5%B0%BC%22%2C%22%E7%BE%8E%E5%A5%B3%22%5D
    Host: image.baidu.com
    Upgrade-Insecure-Requests: 1
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36
    '''

        def __init__(self, header, kw):
            """Store the parsed headers and the URL-quoted search keyword.

            Args:
                header: Raw header text, one ``Name: value`` pair per line.
                kw: Search keyword; it is percent-encoded before being stored.
            """
            super().__init__()
            # The attribute name is misspelled ("heades"); it is kept as-is so
            # any external code reading it keeps working.
            self.heades = self.gen_headers(header)
            self.num = 0  # running counter used to name downloaded files
            self.kw = parse.quote(kw)

        def gen_headers(self, s):
            """Parse a block of ``Name: value`` lines into a dict.

            Blank lines and lines without a colon are ignored. Only the first
            colon separates name from value, so values may contain colons.
            Surrounding whitespace is stripped from both name and value.

            Args:
                s: Raw header text copied from a browser or proxy.

            Returns:
                dict mapping header names to header values.
            """
            headers = {}
            for line in s.splitlines():
                name, sep, value = line.strip().partition(':')
                if not sep or not name.strip():
                    continue
                headers[name.strip()] = value.strip()
            return headers

        def downimg(self, url, name):
            """Fetch ``url`` and save the body as ``image_dir + name``.

            Args:
                url: Address of the image to download.
                name: File name (with extension) to write inside ``image_dir``.

            Returns:
                True when the file was written, False when the request failed,
                returned an HTTP error status, or the file could not be saved.
            """
            try:
                response = requests.get(url, timeout=2)
                # Do not store an HTML error page under an image file name.
                response.raise_for_status()
                # The target directory may not exist on a fresh checkout.
                os.makedirs(self.image_dir, exist_ok=True)
                with open(os.path.join(self.image_dir, name), 'wb') as f:
                    f.write(response.content)
            except (requests.RequestException, OSError):
                # Best effort: one bad image must not abort the whole run.
                return False
            return True

        def doing(self, page):
            """Download every image listed on one page of search results.

            Args:
                page: Offset of the first result to request (0, 30, 60, ...).

            Prints one success/failure line per attempted download.
            """
            listheader = self.gen_headers(self.listheader)
            # NOTE(review): the endpoint is not officially documented; pn is
            # treated as the start offset and rn as the page size, which is how
            # the web client pages results. The previous code sent the offset
            # as rn with pn fixed at 60, so every call began at the same place.
            url = (
                'http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
                '&ct=201326592&is=&fp=result&queryWord=' + self.kw +
                '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd='
                '&latest=&copyright=&word=' + self.kw +
                '&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr='
                '&expermode=&force=&cg=girl&pn=' + str(page) +
                '&rn=' + str(self.page_size)
            )
            # A timeout keeps a stalled connection from hanging the script.
            response = requests.get(url, headers=listheader, timeout=10)
            entries = json.loads(response.text).get('data') or []
            for x in entries:
                thumb = x.get('thumbURL')
                if not thumb:
                    # Entries without a thumbnail carry nothing to download.
                    continue
                # hoverURL is not guaranteed to be present; fall back to thumb.
                h = parse.quote(x.get('hoverURL') or thumb)
                g = parse.quote(thumb)
                imgurl = (
                    'http://image.baidu.com/search/down?tn=download&ipn=dwnl'
                    '&word=download&ie=utf8&fr=result&url=' + h +
                    '&thumburl=' + g
                )
                # Take the extension from the image path itself rather than
                # from the whole download URL; default to .jpg when absent.
                exe = os.path.splitext(parse.urlparse(thumb).path)[-1] or '.jpg'
                self.num += 1
                filename = str(self.num) + exe
                if self.downimg(imgurl, filename):
                    print('下载成功')
                else:
                    print('下载失败')
    # Script driver: build one downloader for the keyword, then call doing()
    # five times with the arguments 0, 30, 60, 90 and 120.
    obj = DownBaiDuImg(headers, '绿色护眼壁纸大全')
    for x in range(5):
        obj.doing(30 * x)

  • 相关阅读:
    MyEclipse添加XML的xsd文件和dtd文件(自动补全xml节点代码)
    浅析Java中Map与HashMap,Hashtable,HashSet的区别(转载)
    jsp中:jsp声明与jsp脚本<%! int count=0;%> 与<% int count=0;%>
    JSP内置对象详细介绍(上)<转载>
    学习运用json
    win7与win7之间无法访问共享文件的问题解决(转)
    关于工作情绪化的问题
    Hadoop配置学习
    问题汇总
    mysql读写分离
  • 原文地址:https://www.cnblogs.com/chengfengchi/p/10849864.html
Copyright © 2020-2023  润新知