1.4.2 Python 网站地图爬虫（每天一更）

# -*- coding: utf-8 -*-
'''
Created on 2019年5月6日

@author: 薛卫卫
'''

import re
import urllib.error
import urllib.request

def download(url, user_agent="wswp", num_retries=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the resource to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many further attempts to make when the failure
            carries a 5xx status code.

    Returns:
        The response body as ``bytes``, or ``None`` if the download failed.
    """
    print("Downloading: " , url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the response even if read() raises,
        # so the underlying connection is never left open.
        with urllib.request.urlopen(request) as response:
            return response.read()
    except urllib.error.URLError as e:
        print('Download error:' , e.reason)
        # Only errors that expose a 5xx ``code`` are retried; errors without
        # a ``code`` attribute and non-5xx codes fall through to ``None``.
        if num_retries > 0 and 500 <= getattr(e, 'code', 0) < 600:
            return download(url, user_agent, num_retries - 1)
    return None

def crawl_sitemap(url):
    """Download a sitemap and then download every page it lists.

    Args:
        url: Address of the sitemap XML file.

    Returns:
        None. Each linked page is downloaded, but the scraping step is
        still a placeholder, so nothing is done with the page content.
    """
    # download the sitemap file
    sitemap = download(url)
    if sitemap is None:
        # download() signals failure by returning None; without a sitemap
        # there is nothing to crawl, so stop instead of failing on .decode().
        return
    # urlopen().read() yields bytes: decode the data rather than changing
    # the regular expression to a bytes pattern.
    sitemap = sitemap.decode('utf-8')
    # extract the sitemap links
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...
        
# Script entry: crawl the example site's sitemap as soon as the file is run.
crawl_sitemap(url="http://example.webscraping.com/sitemap.xml")

相关阅读:
java 字节流与字符流的区别
什么是缓冲区
java流输入输出
Apache安装配置
Maven学习
Redis
数据结构与算法
pig ERROR 2997: Encountered IOException. File or directory null does not exist.
hadoop学习路线（转）
86标准SQL与92标准SQL用法区别

原文地址：https://www.cnblogs.com/xww115/p/10828446.html