• Python: 爬取百度贴吧图片


    练习之代码片段,以做备忘:

    # encoding=utf8
    
    from __future__ import unicode_literals

    import os
    import re
    import threading
    import time
    import urllib, urllib2
    
    
    def get_html(url):
        """Fetch *url* and return the raw response body, or None on failure.

        Accepts either a unicode or an already-encoded byte-string URL.
        Only unicode is encoded here: calling .encode('utf-8') on Py2
        bytes does an implicit ASCII decode first and raises
        UnicodeDecodeError for non-ASCII URLs, so the old unconditional
        encode broke callers that pre-encoded the URL.
        """
        page = None
        try:
            if isinstance(url, unicode):
                url = url.encode('utf-8')
            page = urllib2.urlopen(url)
            return page.read()
        except Exception as ex:
            # Best-effort scraper: report and return None so callers can skip.
            print 'get url_%s html error, ex=%s' % (url, ex)
        finally:
            # Close the connection so repeated failures don't leak sockets.
            if page is not None:
                page.close()
    
    
    def get_images(url):
        """Return the de-duplicated list of .jpg image URLs on a topic page.

        Matches <img> tags carrying the "BDE_Image" class in either
        attribute order (class before src, and src before class).
        Returns [] when the page could not be fetched.
        """
        html = get_html(url)
        if not html:
            # get_html failed upstream; re.findall(None) would crash.
            return []
        # Dot escaped as \. — the old pattern accepted any char before "jpg".
        pattern = r'<img.+class="BDE_Image".+src="([^"]+\.jpg)"'
        img_list = re.findall(pattern, html)
        pattern = r'<img.+src="([^"]+\.jpg)".+class="BDE_Image"'
        img_list.extend(re.findall(pattern, html))
        # De-duplicate while preserving first-seen order.
        img_list = sorted(set(img_list), key=img_list.index)
        return img_list
    
    
    # Total number of pages of the topic at *url*.
    def get_page_count(url):
        """Return the topic's "total_page" value as a string, or 0 if absent.

        Callers wrap the result in int(), which accepts both forms.
        """
        html = get_html(url)
        # \d, not d: the original pattern had lost its backslash and could
        # only match the literal text "total_page":ddd..., i.e. never.
        pattern = r'"total_page":(\d+)'
        # `or ''` guards the None that get_html returns on fetch failure.
        m = re.search(pattern, html or '')
        return m.group(1) if m else 0
    
    
    # Extract the topic URLs listed on one forum index page.
    def get_page_urls(html):
        """Return absolute topic URLs ("https://tieba.baidu.com/p/<id>")
        for every topic link found in *html*; [] when there are none.
        """
        # \d, not d: capture the numeric topic id from href="/p/12345".
        pattern = r'<a href="/p/(\d+)"'
        id_list = re.findall(pattern, html)
        # List comprehension instead of map() so the result is a real list
        # (and therefore usefully truthy/falsy) on Python 2 and 3 alike.
        return ['https://tieba.baidu.com/p/%s' % page_id for page_id in id_list]
    
    
    # 下载指定页面之图片
    def download_page_images(page_url):
        html = get_html(page_url)
        title = re.search(r'(?<=<title>)(.*)(?=</title>)', html).group(1)
        print title
        page_no = re.search(r'(d+)', page_url).group(0)
        page_count = int(get_page_count(page_url))
        print 'page: %s, page_count: %d' % (page_no, page_count)
    
        for page_idx in range(1, page_count + 1):
            url = page_url + '?pn=%d' % page_idx
            img_list = get_images(url)
            if img_list:
                print 'page index: %d, image_count: %d' % (page_idx, len(img_list))
                if not os.path.exists('images'):
                    os.mkdir('images')
    
                img_folder = 'images\%s' % page_no
                if not os.path.exists(img_folder):
                    os.mkdir(img_folder)
                idx = 0
                for img_url in img_list:
                    img_filename = img_folder + '\%d_%d.jpg' % (page_idx, idx)
                    if not os.path.exists(img_filename):
                        urllib.urlretrieve(img_url, img_filename)
                    idx += 1
    
    
    def main():
        # 扒取最大页数
        max_pagecount = 30
        base_url = r'https://tieba.baidu.com/f?kw=图片&ie=utf-8?pn=%s'
    
        # 分页而扒
        for idx in range(1, max_pagecount):
            url = base_url % ((idx - 1) * 50)
            html = get_html(url)
            url_list = get_page_urls(html)
            for page_url in url_list:
                try:
                    download_page_images(page_url)
                    threading._sleep(2)
                except:
                    continue
    
    
    if __name__ == '__main__':
        main()
  • 相关阅读:
    Java构造和解析Json数据的两种方法详解一
    微信小程序-自定义组件
    微信小程序
    微信公众号
    微信小程序
    微信小程序
    微信小程序
    vue
    vue
    sass 和 css 互转网址
  • 原文地址:https://www.cnblogs.com/crwy/p/7444009.html
Copyright © 2020-2023  润新知