• Fetching HTML with Python (still has a few small issues)


    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Python 2 script: fetch the HTML for URLs stored in Redis hashes and write the page body back.
    import redis
    import urllib2
    import time
    import StringIO   # only used by the commented-out gzip branch below
    import gzip       # only used by the commented-out gzip branch below
    import httplib
    import cookielib

    httplib.HTTPConnection.debuglevel = 1   # print HTTP request/response headers for debugging
    files = ["12148", "12510", "15362", "11593", "11750"]   # Redis hash names, one per batch

    class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
        """Follow 301/302 redirects but keep the original status code on the response."""
        def http_error_301(self, req, fp, code, msg, headers):
            result = urllib2.HTTPRedirectHandler.http_error_301(
                self, req, fp, code, msg, headers)
            result.status = code
            return result

        def http_error_302(self, req, fp, code, msg, headers):
            result = urllib2.HTTPRedirectHandler.http_error_302(
                self, req, fp, code, msg, headers)
            result.status = code
            return result

    # One shared cookie jar so the session is kept across requests.
    ckjar = cookielib.MozillaCookieJar()
    ckproc = urllib2.HTTPCookieProcessor(ckjar)
    count = 0
    def fetch(k, r1):
        try:
            request = urllib2.Request(k)
            # The User-Agent header must be set, otherwise some sites reject the request.
            request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1')
            global ckproc
            global count
            # Reuse the shared cookie processor and record redirect status codes.
            opener = urllib2.build_opener(ckproc, SmartRedirectHandler())
            f = opener.open(request)
            #print f.status
            context = f.read()
            # Alternative: request gzip-compressed content and decompress it manually.
            #request.add_header("Accept-encoding", "gzip")
            #retval = urllib2.urlopen(request)
            #if retval.headers.has_key('content-encoding'):
            #    fileobj = StringIO.StringIO()
            #    fileobj.write(retval.read())
            #    fileobj.seek(0)
            #    gzip_file = gzip.GzipFile(fileobj=fileobj)
            #    context = gzip_file.read()
            #else:
            #    context = retval.read()
            # Pages are served as GB18030; re-encode them to UTF-8 before storing.
            html = context.decode("gb18030", "ignore").encode("utf-8")
            if len(html.strip()) > 0:
                r1.hset(file, k, html)   # `file` is the hash name set by the loop below
                count += 1
                print "save %s" % count
            time.sleep(2)
        except urllib2.HTTPError, e:
            print "error->" + k
            r1.rpush("errors", k)
            print str(e)
            print e.getcode()
            print "rework"
            # Note: this retries recursively with no limit, so a page that keeps failing
            # (e.g. a permanent 404) will recurse until Python's recursion limit is hit.
            fetch(k, r1)


       
    r1 = redis.Redis(db=1)
    count = 0
    for file in files:                      # note: shadows the built-in `file`
        dict = r1.hgetall(file)             # note: shadows the built-in `dict`
        for k, v in dict.iteritems():
            if v == "":                     # only fetch URLs whose value is still empty
                print k
                fetch(k, r1)
    print "done!"

  • Original post: https://www.cnblogs.com/lexus/p/2351832.html