• 爬虫二 cookie&正则


    一.cookie应用实例

    import urllib.request
    import urllib.parse
    
    # Enter the Renren user profile page with a captured cookie:
    #   1. log in to Renren with a real browser
    #   2. sniff the next request and copy the Cookie header it carried
    #   3. replay the request from code with that cookie attached
    #   4. if that fails, replay every request header (last resort)
    url = 'http://www.renren.com/971302264/profile'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Cookie': 'anonymid=jxczgs3yw3oby9; _de=9718742970B17AD7ABC87CAAA6A740CC;'
                  ' p=176166a1bb4a1d1a163443225f52e24e4; first_login_flag=1; ln_uact=18404904721; '
                  'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; '
                  't=21d77ab67402235d4282cf725f991aab4; societyguester=21d77ab67402235d4282cf725f991aab4; '
                  'id=971302264; xnsid=6d1019cd; ver=7.0; loginfrom=null; JSESSIONID=abcOB4RHNlyeq8Dv_7sUw; '
                  'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325729; '
                  'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325732; wp_fold=0'
    }

    # Build the request carrying the captured headers and fetch the page.
    profile_request = urllib.request.Request(url, headers=headers)
    profile_response = urllib.request.urlopen(profile_request)

    # Persist the raw response body for offline inspection.
    with open('ren.html', 'wb') as html_file:
        html_file.write(profile_response.read())

    二、编程登录人人网

    import urllib.request
    import urllib.parse
    import http.cookiejar
    
    # Programmatic Renren login:
    #   1. log in with a browser and capture the login request
    #   2. take the target URL and the POST form data it carried
    #   3. replay that request from code

    # An opener built around a CookieJar keeps the session cookie that the
    # login response sets, so later requests through the same opener are
    # authenticated automatically.
    cj = http.cookiejar.CookieJar()                        # cookie storage
    handler = urllib.request.HTTPCookieProcessor(cj)       # cookie handler
    opener = urllib.request.build_opener(handler)          # cookie-aware opener

    # FIX: the captured URL carried a stray trailing space; removed so the
    # query string is sent exactly as the site expects.
    post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019531649636'

    # Form fields captured from the browser login (password/rkey are the
    # site's client-side-hashed values, not plaintext).
    form_data = {'email':'18404904721',
                'icode':'',
                'origURL':'http://www.renren.com/home',
                'domain':'renren.com',
                'key_id':'1',
                'captcha_type':'web_login',
                'password':'641fd8bce69ff3a3acfb14fc094fefe9487f9b4f843d18063fcce22e0a468066',
                'rkey':'2c3ae276413c03a1eb5159d355895bd0',
                'f':'http%3A%2F%2Fwww.renren.com%2F971302264%2Fprofile'}

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',}

    # POST bodies must be URL-encoded bytes.
    form_data = urllib.parse.urlencode(form_data).encode()

    req = urllib.request.Request(url=post_url, headers=headers)

    # Send the login POST; the opener stores the returned session cookie.
    rep = opener.open(req, data=form_data)

    # Verify the login by fetching the profile page through the SAME opener,
    # which now carries the session cookie.
    get_url = 'http://www.renren.com/971302264/profile'

    req1 = urllib.request.Request(url=get_url, headers=headers)

    rep1 = opener.open(req1)

    with open('guanli.html', 'wb') as fp:
        fp.write(rep1.read())

    三、正则表达式提取内容

    import re
    
    # --- () sub-patterns / back-references --------------------------------
    # FIX: the blog scrape stripped the regex backslashes; restored below.
    # string = '<div><span>悟空</span></div>'
    # Match the string above; the closing tags mirror the opening ones,
    # so \1 / \2 back-reference the captured tag names.
    # pattern = re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
    # ret = pattern.search(string)
    # print(ret)

    # --- greedy vs. non-greedy --------------------------------------------
    # string = '<div>八戒</div></div></div>'
    # pattern1 = re.compile(r'<div>.*</div>')    # greedy: swallows every </div>
    # pattern2 = re.compile(r'<div>.*?</div>')   # lazy: stops at the first </div>
    # ret1 = pattern1.search(string)
    # ret2 = pattern2.search(string)
    # print(ret1)
    # print(ret2)

    # --- re.M: ^ matches at the start of every line -----------------------
    # FIX: the original literal was garbled by the scrape (stray quote and
    # page indentation inside the string); restored to a clean two-liner.
    string = 'beautiful\nbeach'
    pattern = re.compile(r'^bea', re.M)
    ret = pattern.findall(string)
    print(ret)   # ['bea', 'bea']

    # --- re.S: . also matches newlines ------------------------------------
    # string = ('<div>《沁园春-雪》'
    #           '北国风光,千里冰封,万里雪飘。'
    #           '望长城内外,惟余莽莽。'
    #           '大河上下,顿失滔滔。</div>')
    # pattern = re.compile(r'.*', re.S)
    # ret = pattern.search(string)
    # print(ret)

    # --- re.I: ignore case ------------------------------------------------
    # string = 'Life Is Short You Must Be Sexy'
    # pattern = re.compile(r'life is short you must be sexy', re.I)
    # ret = pattern.search(string)
    # print(ret)

    # --- substitution -----------------------------------------------------
    string = 'Life Is Short You Must Be Sexy'
    pattern = re.compile(r'Sexy')
    ret = re.sub(pattern, 'sao', string)     # module-level sub
    ret2 = pattern.sub('lang', string)       # compiled-pattern sub
    print(ret)
    print(ret2)

    def func(a):
        # Replacement callback: receives the Match, returns the matched
        # number reduced by 3 (as a string).
        ret = int(a.group())
        return str(ret - 3)

    string = '最佳身高为175cm'
    pattern = re.compile(r'\d+')   # FIX: scrape dropped the backslash in \d+
    ret2 = pattern.sub(func, string)
    print(ret2)   # 最佳身高为172cm

    四、正则例子-爬取糗图图片

    import urllib.request
    import urllib.parse
    import re
    import  os
    
    def create_request(url, page):
        """Build a GET Request for page *page* of the picture listing at *url*."""
        full_url = '{}{}/'.format(url, page)
        ua = ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')
        return urllib.request.Request(url=full_url, headers={'User-Agent': ua})
    
    def download_image(content):
        """Extract every thumbnail <img> URL from *content* (a qiushibaike
        listing page's HTML) and download the files into ./qiutu/.

        The src attributes are protocol-relative (//host/...), so 'https:'
        is prepended before fetching.
        """
        pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" .*?>.*?</div>', re.S)
        img_list = pattern.findall(content)

        # FIX: directory creation hoisted out of the loop; makedirs with
        # exist_ok avoids the racy exists()/mkdir() pair.
        dirname = 'qiutu'
        os.makedirs(dirname, exist_ok=True)

        for img_src in img_list:
            img_url = 'https:' + img_src            # complete the scheme
            img_name = img_url.split('/')[-1]       # file name = last path part
            filepath = os.path.join(dirname, img_name)
            urllib.request.urlretrieve(img_url, filepath)
    
    def main():
        """Ask the user for a page range and download the images of every
        page in it (both endpoints included)."""
        url = 'https://www.qiushibaike.com/pic/page/'

        start_page = int(input('起始页码:'))
        end_page = int(input('结束页码:'))

        # FIX: end_page + 1 so the final requested page is included —
        # range(start, end) silently skipped it (off-by-one), and the
        # sibling quote crawler already uses the inclusive form.
        for page in range(start_page, end_page + 1):
            print('第%s页开始下载...' % page)
            # Build the request for this page.
            req = create_request(url, page)

            # Send it and decode the HTML body.
            rep = urllib.request.urlopen(req).read().decode()

            # Parse the HTML and download the images it references.
            download_image(rep)
            print('第%s页结束下载...' % page)

    if __name__ == '__main__':
        main()

    五、正则例子-爬取语录

    import urllib.request
    import urllib.parse
    import re
    import  os
    
    def create_request(url, page=None):
        """Build a GET Request for *url*.

        When *page* is given, the listing-page suffix '<page>.html' is
        appended first; with page=None the URL is used as-is (detail pages).
        """
        if page is not None:   # FIX: identity test against None, not !=
            url = url + str(page) + '.html'
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

        return urllib.request.Request(url=url, headers=header)
    
    def get_content(href):
        """Fetch the quote page at *href* and return the text of its
        'neirong' (content) block with embedded <img ...> tags stripped.

        Returns '' when the page contains no content block (FIX: the
        original indexed content_list[0] unconditionally and raised
        IndexError on a layout change or error page).
        """
        request = create_request(href)
        content_html = urllib.request.urlopen(request).read().decode()
        pattern = re.compile(r'<div class="neirong">(.*?)</div>', re.S)
        content_list = pattern.findall(content_html)
        if not content_list:
            return ''
        # Drop inline images; keep only the text/markup of the quote.
        pat = re.compile(r'<img .*?>')
        return pat.sub('', content_list[0])
    
    def parse_html(content):
        """Pull every quote title + relative link from a listing page,
        fetch each quote's text, and append it to yulu.html wrapped in a
        minimal HTML document."""
        # FIX: the blog scrape stripped the backslashes — \d+ restored, and
        # the literal dot in '.html' is now escaped.
        pattern = re.compile(r'<h3><a href="/mingrenjingdianyulu/(\d+/\d+/\d+\.html)"><b>(.*?)</b></a></h3>', re.S)
        title_list = pattern.findall(content)

        for href_tail, title in title_list:

            # Absolute link to the quote detail page.
            href = 'http://www.yikexun.cn/mingrenjingdianyulu/' + href_tail

            # Fetch the quote body from the detail page.
            text = get_content(href)

            # FIX: parenthesised implicit concatenation — the original
            # relied on line-continuation backslashes the scrape dropped,
            # which made this a syntax error.
            string = ('<!DOCTYPE html>'
                      '<html lang="en">'
                      '<head>'
                      '  <meta charset="UTF-8">'
                      '  <title>Title</title>'
                      '</head>'
                      '<body>'
                      '  <h1>%s</h1>%s'
                      '</body>' % (title, text))

            # Append each quote document to one growing file.
            with open('yulu.html', 'a', encoding='utf8') as fp:
                fp.write(string)
    
    def main():
        """Entry point: ask for a page range (inclusive) and crawl the
        quote listing pages one by one."""
        base_url = 'http://www.yikexun.cn/mingrenjingdianyulu/list_10_'

        first = int(input('起始页码:'))
        last = int(input('结束页码:'))

        page = first
        while page <= last:
            print('第%s页开始下载...' % page)

            # Build and send the request for this listing page.
            request = create_request(base_url, page)
            html = urllib.request.urlopen(request).read().decode()

            # Extract the quotes and write them out.
            parse_html(html)
            print('第%s页结束下载...' % page)
            page += 1

    if __name__ == '__main__':
        main()
  • 相关阅读:
    蜕变过程中的思考
    Django template for 循环用法
    Django 发送html邮件
    Django F对象的使用
    在Django中使用Q()对象
    ubuntu中管理用户和用户组
    Django settings.py 的media路径设置
    Git版本控制 备忘录
    Git .gitignore文件的使用
    将git关联到pycharm
  • 原文地址:https://www.cnblogs.com/Finance-IT-gao/p/11099529.html
Copyright © 2020-2023  润新知