• Getting Started with Python Spiders


    Adding headers and data

    import urllib.request

    url = 'http://www.example.com'  # placeholder target URL
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    # response is an http.client.HTTPResponse object
    print(response.geturl())   # the final URL (after any redirects)
    print(response.info())     # the response headers
    print(response.getcode())  # the HTTP status code

    html = response.read()     # the body, as raw bytes
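
    read() returns raw bytes, not text. A small sketch, continuing from the snippet above, of decoding them to a string (the utf-8 fallback is an assumption about the target page):

    # the charset can often be read from the Content-Type response header
    charset = response.headers.get_content_charset() or 'utf-8'  # fallback is an assumption
    text = html.decode(charset)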

    urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
    1. The data parameter: the HTTP request will be a POST instead of a GET when the data parameter is provided. data should be a buffer in the standard application/x-www-form-urlencoded format. The urllib.parse.urlencode() function takes a mapping or sequence of 2-tuples and returns a string in this format; in Python 3 that string must then be converted to bytes, which is why .encode('utf-8') appears below.

    # example form fields for Youdao Translate's web endpoint
    data = {}
    data['type'] = 'AUTO'
    data['i'] = content            # the text to translate
    data['doctype'] = 'json'
    data['xmlVersion'] = 1.8
    data['keyfrom'] = 'fanyi.web'
    data['ue'] = 'UTF-8'
    data['action'] = 'FY_BY_CLICKBUTTON'
    data['typoResult'] = 'true'

    data = urllib.parse.urlencode(data).encode('utf-8')  # urlencode, then encode to bytes
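
    Putting it together, a minimal sketch of issuing the POST and parsing the JSON reply. The endpoint URL below is an assumption for illustration; the original excerpt does not show it.

    import json
    import urllib.parse
    import urllib.request

    # hypothetical endpoint; substitute the real one from the browser's network panel
    url = 'http://fanyi.youdao.com/translate'
    request = urllib.request.Request(url, data)        # passing data makes this a POST
    response = urllib.request.urlopen(request)
    result = json.loads(response.read().decode('utf-8'))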

    2. headers: a dictionary; the values can be copied straight from the browser's developer tools.

    header = {}
    # first way to add a header: pass a dict to the Request constructor
    header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
    request = urllib.request.Request(url, data, header)
    # alternatively, add it to an existing Request object
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')
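
    To confirm what will actually be sent, Request.get_header can be used; a small sketch (note that add_header capitalizes header names, so the lookup key becomes 'User-agent'):

    # header names are normalized by Request.add_header
    print(request.get_header('User-agent'))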

    Getting the Response Headers

    The code below checks the response headers to see whether the page was gzip-compressed, and decompresses it if so.

    import gzip

    # getheaders() returns the response headers as (name, value) tuples
    for name, value in response.getheaders():
        if name == 'Content-Encoding' and value == 'gzip':
            html = gzip.decompress(html)
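
    The request side can also advertise gzip support explicitly; a minimal end-to-end sketch (the URL is a placeholder):

    import gzip
    import urllib.request

    request = urllib.request.Request('http://www.example.com')  # placeholder URL
    request.add_header('Accept-Encoding', 'gzip')               # tell the server gzip is acceptable
    response = urllib.request.urlopen(request)
    html = response.read()
    if response.getheader('Content-Encoding') == 'gzip':
        html = gzip.decompress(html)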

    Using a proxy

    1. The argument is a dict of {'scheme': 'proxy_ip:port'}
    proxy_support = urllib.request.ProxyHandler({})
    2. Build and customize an opener
    opener = urllib.request.build_opener(proxy_support)
    3a. Install the opener globally, so plain urlopen() goes through it
    urllib.request.install_opener(opener)
    3b. Or call the opener directly (see the sketch below)
    opener.open(url)
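
    Step 3b in isolation, using the opener without installing it globally; the proxy address is a placeholder:

    import urllib.request

    # placeholder proxy for illustration
    proxy_support = urllib.request.ProxyHandler({'http': '127.0.0.1:8118'})
    opener = urllib.request.build_opener(proxy_support)
    # affects only this opener; the global urlopen() is untouched
    response = opener.open('http://www.example.com')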

    Code

    import random
    import urllib.request

    # iplist is a list of 'ip:port' proxy strings; choose one at random
    proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})

    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')]
    urllib.request.install_opener(opener)

    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)

    Scraping images from Zhihu

    import urllib.request
    import os
    import random
    
    # open a URL through a randomly chosen proxy and return the raw bytes
    def url_open(url):
        iplist = [
            '49.77.22.1:8118',
            '58.134.102.3:12696',
            '120.26.213.55:9999',
            # ... more proxies elided in the original
        ]
    
        proxy_support=urllib.request.ProxyHandler({'http':random.choice(iplist)})
    
        opener=urllib.request.build_opener(proxy_support)
        opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')]
    
        urllib.request.install_opener(opener)
    
        req=urllib.request.Request(url)
        response=urllib.request.urlopen(req)
        html=response.read()
        return html
    
    # scan the page for image URLs and return them as a list
    def get_imgs(url):
        html=url_open(url).decode('utf-8')
    
        img_address = []
        # scan for the data-original attribute, whose value is the image URL
        a = html.find('data-original')
        while a != -1:
            # the URL text starts 15 chars past the match: len('data-original="') == 15
            b = html.find('.jpg', a, a + 300)   # look for the .jpg suffix nearby
            if b != -1:
                img_address.append(html[a + 15:b + 4])
            else:
                b = a + 9                        # no .jpg found: step past this match
            a = html.find('data-original=', b)
    
    
        for i in img_address:
            print(i)
    
        return img_address
    
    # download each image and save it to the current directory
    def save_imgs(img_address):
        for i in img_address:
            filename = i.split('/')[-1]      # use the last path segment as the file name
            with open(filename, 'wb') as f:
                img = url_open(i)
                f.write(img)
    
    
    def zhihuPic(url, folder="zhihu"):
        # create the target folder if needed, then work inside it
        if not os.path.exists(folder):
            os.mkdir(folder)
        os.chdir(folder)
        img_address = get_imgs(url)
        save_imgs(img_address)

    if __name__=='__main__':
        zhihuPic("https://www.zhihu.com/question/22070147")
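
    The hand-rolled find() loop in get_imgs can also be expressed as a regular expression; a small sketch under the same assumption that the image URLs sit in data-original="..." attributes:

    import re

    def get_imgs_re(html):
        # capture http(s) URLs ending in .jpg from data-original attributes
        return re.findall(r'data-original="(https?://[^"]*?\.jpg)"', html)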
• Original post: https://www.cnblogs.com/fei-hsueh/p/6106282.html