• python爬虫基础


    Demo1:urllib使用

    #encoding:utf-8
    import urllib
    import urlparse
    def printlist(lines):
        """Print every element of an iterable, one per line."""
        for item in lines:
            print(item)
    def demo():
        """Fetch a page with urllib (Python 2) and dump the response metadata.

        Prints dir() of the mimetools.Message returned by info() so you can
        discover what methods/headers the object exposes.
        """
        resp = urllib.urlopen('http://blog.kamidox.com')
        meta = resp.info()
        # Introspect the message object to find the methods of the class.
        printlist(dir(meta))
    def progress(blk, blk_size, total_size):
        """urlretrieve reporthook: print how much of the download is done.

        blk        -- number of blocks transferred so far
        blk_size   -- size of one block in bytes
        total_size -- total size in bytes; <= 0 when the server sends no
                      Content-Length (urlretrieve passes -1 in that case)

        Fix: the original divided by total_size unconditionally, raising
        ZeroDivisionError whenever the total size is unknown/zero.
        """
        done = blk * blk_size
        if total_size > 0:
            print("%d/%d - %.02f%%" % (done, total_size, float(done) * 100 / total_size))
        else:
            # Total size unknown: report only the byte count.
            print("%d/? - ?%%" % done)
    def retrieve():
        """Download the page to index.html, reporting progress via progress()."""
        fname, headers = urllib.urlretrieve(
            'http://blog.kamidox.com', 'index.html', reporthook=progress)
    def urlencode():
        """Encode a dict as a query string and parse it back (Python 2 only)."""
        params = {'score': 100, 'name': 'pachongjichu', 'comment': 'very good'}
        encoded = urllib.urlencode(params)
        print(encoded)
        # Round-trip: parse_qs turns the query string back into a dict of lists.
        print(urlparse.parse_qs(encoded))
    if __name__ == '__main__':
        # Entry point: run the urlencode demo; swap in demo() or retrieve()
        # to exercise the other examples.
        urlencode()
    

    Demo2:抓取图片

    #encoding:utf-8
    import urllib

    # Demo 2: download an image with urllib (Python 2).
    resp = urllib.urlopen("http://placekitten.com/g/300/400")
    img_bytes = resp.read()
    # Images are binary files, so the output must be opened in 'wb' mode.
    with open('cat_300_400.jpg', 'wb') as f:
        f.write(img_bytes)
    print(resp.info())
    

    Demo3:有道词典翻译

    # encoding:utf-8
    # Demo 3: call the Youdao dictionary translate endpoint (Python 2).
    import urllib
    import json
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')
    # Works around: UnicodeDecodeError: 'ascii' codec can't decode byte 0xe7 in position 0
    # (Python 2 only; reload(sys) restores setdefaultencoding, which is removed at startup.)
    content=raw_input('请输入要翻译的内容:')# raw_input returns the raw input string
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=http://www.youdao.com/"
    data = {}# fields copied from the browser's "Form Data" panel for this request
    data['i']=content
    data['from']='AUTO'
    data['to']='AUTO'
    data['smartresult']='dict'
    data['client']='fanyideskweb'
    # NOTE(review): salt/sign were captured from a single browser request; the real
    # site computes them per request, so these hard-coded values may stop working.
    data['salt']='1497500950438'
    data['sign']='cd8af15baafdac91e90445ce25f3cea1'
    data['doctype']='json'
    data['version']='2.1'
    data['keyfrom']='fanyi.web'
    data['action']='FY_BY_CLICKBUTTON'
    data['typoResult']='true'
    data=urllib.urlencode(data).encode('utf-8')
    response=urllib.urlopen(url,data)
    html=response.read().decode('utf-8')
    #print(html)# the body returned here is JSON
    target=json.loads(html)
    # type(target) -> dict
    print("翻译结果:%s"%(target['translateResult'][0][0]['tgt']))

    Demo4:代码隐藏和延迟请求

     

    # encoding:utf-8
    # Demo 4: translate in a loop, disguising the client with a browser
    # User-Agent and throttling requests (Python 2).
    import urllib
    import urllib2
    import json
    import time
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')
    # Works around: UnicodeDecodeError: 'ascii' codec can't decode byte 0xe7 in position 0
    while True:
        # Fix: the original listing had lost all indentation inside the loop,
        # so it would not even parse.
        content=raw_input('请输入要翻译的内容(输入”q!“退出程序):')
        if content=='q!':
            break
        url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=http://www.youdao.com/"
        # Headers built before the request object: make us look like a browser.
        head={}
        head['User-Agent']='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'
        data = {}
        data['i']=content
        data['from']='AUTO'
        data['to']='AUTO'
        data['smartresult']='dict'
        data['client']='fanyideskweb'
        data['salt']='1497500950438'
        data['sign']='cd8af15baafdac91e90445ce25f3cea1'
        data['doctype']='json'
        data['version']='2.1'
        data['keyfrom']='fanyi.web'
        data['action']='FY_BY_CLICKBUTTON'
        data['typoResult']='true'
        data=urllib.urlencode(data).encode('utf-8')
        # Fix: urllib.urlopen()'s third positional argument is a *proxies* dict,
        # not headers, so the original never sent the User-Agent at all.
        # urllib2.Request(url, data, headers) actually attaches the header.
        request=urllib2.Request(url, data, head)
        response=urllib2.urlopen(request)
        html=response.read().decode('utf-8')
        target=json.loads(html)  # response body is JSON; loads -> dict
        print("翻译结果:%s"%(target['translateResult'][0][0]['tgt']))
        time.sleep(3)  # plan A against rate limiting: delay between requests

     也可以在创建 opener 对象后再设置 opener.addheaders=[('User-Agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36')](注意:头名是 'User-Agent' 而不是 'User_Agent',且 addheaders 是属性赋值,不是方法调用)

    不过python2.7好像不可以

    但是如果你连续的下载那么就不太行了,不像一个正常人下载那样,每个ip单位时间就会有个数值

    第一个方案,是延迟请求

    第二个方案,是代理

    1.参数是一个字典 {'类型': '代理ip:端口号'},例如 {'http': '119.6.144.73:81'}

    proxy_support=urllib2.ProxyHandler({})(Python 2 中 ProxyHandler 在 urllib2 模块里,与下面的示例代码一致)

    2.定制、创建一个opener

    opener=urllib2.build_opener(proxy_support)

    3.安装 opener

    urllib2.install_opener(opener)

    或者opener.open(url)

    # coding: utf-8
    # Demo 5 (plan B against rate limiting): route the request through a
    # randomly chosen HTTP proxy, with a browser User-Agent (Python 2).
    import urllib
    import urllib2
    import random
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    url='http://www.ip138.com/'
    # Candidate proxies, format 'ip:port'; one is picked at random per run.
    iplist=['183.203.208.166:8118','111.1.32.28:81','119.6.144.73:81']
    proxy_support=urllib2.ProxyHandler({'http': random.choice(iplist)})
    
    opener=urllib2.build_opener(proxy_support)
    # Fix: the header name is 'User-Agent' (hyphen); the original 'User_Agent'
    # is not a real header and is ignored by servers.
    opener.addheaders=[('User-Agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36')]
    urllib2.install_opener(opener)
    # Fix: must open the URL via urllib2 so the installed opener (proxy +
    # headers) is used; urllib.urlopen() bypasses urllib2's opener entirely.
    response=urllib2.urlopen(url)
    html=response.read()
    print(html)
    

      

  • 相关阅读:
    图片上传-下载-删除等图片管理的若干经验总结3-单一业务场景的完整解决方案
    图片上传-下载-删除等图片管理的若干经验总结2
    HDU 1195 Open the Lock
    HDU 1690 Bus System
    HDU 2647 Reward
    HDU 2680 Choose the best route
    HDU 1596 find the safest road
    POJ 1904 King's Quest
    CDOJ 889 Battle for Silver
    CDOJ 888 Absurdistan Roads
  • 原文地址:https://www.cnblogs.com/BetterThanEver_Victor/p/7016970.html
Copyright © 2020-2023  润新知