scrapy递归下载网站

# encoding: utf-8
import os
import re
import subprocess
import sys

import chardet
import scrapy
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.utils.url import urljoin_rfc

from mychardet import *

# print sys.getdefaultencoding()

# print sys.path

def get_default_to_codec():
    """Return the default converter used for encoding output text.

    The returned object is ``mytogb18030``. It is not defined in this file;
    presumably it is supplied by the ``from mychardet import *`` star import
    -- TODO confirm against ``mychardet``.
    """
    return mytogb18030

def getfirst(a):
    """Return the first element of *a*, or ``u''`` when there is none.

    Args:
        a: A sequence (for example the list returned by ``.extract()``)
            or ``None``.

    Returns:
        ``a[0]`` when *a* is non-empty, otherwise the empty unicode string.
    """
    # Identity test on purpose: ``a == None`` would call a custom ``__eq__``
    # and could misreport a real, non-empty container as missing.
    if a is None or len(a) == 0:
        return u''
    return a[0]

class Greasemonkey1Spider(scrapy.Spider):
    """Spider that follows every ``<a>`` link it finds, starting at ``start_urls``.

    ``parse`` is the callback for every response: it extracts the page title
    and body through ``parseContext`` and then yields one request per link
    on the page, each again using ``parse`` as its callback.
    """
    name = "test"
    allowed_domains = ["localhost"]
    start_urls = (
        'http://localhost/test',
    )

    def parseContext(self, response):
        """Extract the title text and the ``<body>`` markup of *response*.

        Args:
            response: A response object exposing ``.xpath()``.

        Returns:
            A ``(title, body)`` tuple. Each element is the first match of
            its XPath query, or an empty string when nothing matched.
        """
        # ``.xpath()`` yields a list-like selector collection, so no ``None``
        # guard is needed; ``getfirst`` supplies the empty-string fallback.
        title = getfirst(response.xpath('//title/text()').extract())
        body = getfirst(response.xpath('/html/body').extract())
        return title, body

    def parse(self, response):
        """Process one page and schedule a request for each link on it.

        Args:
            response: The downloaded page.

        Yields:
            A ``Request`` per ``<a>`` element that carries a non-empty
            ``href``, resolved against the URL of *response*.
        """
        baseurl = response.url
        # Single-argument form prints the same text on Python 2 and 3
        # (two spaces, exactly as ``print 'baseurl = ', baseurl`` produced).
        print('baseurl =  %s' % baseurl)
        self.parseContext(response)

        for anchor in response.xpath(r'//a'):
            href = getfirst(anchor.xpath(r'@href').extract())
            if not href:
                # An anchor without href would resolve to the current page
                # and only produce a redundant request for it.
                continue
            yield Request(urljoin_rfc(baseurl, href), callback=self.parse)

if __name__ == '__main__':
    # Run "scrapy crawl --nolog test" as a child process from this file's
    # directory and echo whatever it wrote, followed by its exit code.
    # NOTE(review): the path separators were missing from the original
    # literal ('E:Python27Scriptsscrapy.exe'); restored here -- confirm the
    # install location on the target machine.
    cmd = [r'E:\Python27\Scripts\scrapy.exe', 'crawl', '--nolog', 'test']
    # abspath first: dirname of a bare file name is '' and '' is not a
    # usable working directory.
    cwd = os.path.dirname(os.path.abspath(__file__))
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False, cwd=cwd)
    # communicate() waits for the child to exit and drains both pipes, so a
    # single call is enough; guarding it with poll() could skip the read
    # entirely when the child finishes quickly.
    out, err = p.communicate()
    if err:
        print(err)
    if out:
        print(out)
    # Kept inside the guard: at module level ``p`` would be undefined when
    # the file is imported rather than executed.
    print(p.returncode)

#     while not p.poll():
#         print p.stdout.read()
#         print p.stderr.read()

相关阅读:
CSS选择器
HTML2
html
http协议
python--Selectors模块/队列
Linux系统管理02----目录和文件管理
Linux系统管理01-----系统命令
02作业 linux第一章和第三章命令
01作业 Linux系统管理应用
01：计算机硬件组层与基本配置------02计算机系统硬件核心知识

原文地址：https://www.cnblogs.com/zhang-pengcheng/p/4287293.html