• Scrapy in practice (1) -- scraping auction data from Artron (雅昌艺术网)


    Step 1: create the Scrapy project:

      scrapy startproject Demo

    Step 2: generate a spider:

      

    scrapy genspider demo http://auction.artron.net/result/pmh-0-0-2-0-1/
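
    This drops a spider skeleton into Demo/spiders/demo.py. The sketch below is roughly what the default template produces (not the verbatim output; note that genspider normally expects a bare domain, so allowed_domains is shown as the domain only):

    # -*- coding: utf-8 -*-
    import scrapy


    class DemoSpider(scrapy.Spider):
        name = 'demo'
        allowed_domains = ['auction.artron.net']
        start_urls = ['http://auction.artron.net/result/pmh-0-0-2-0-1/']

        def parse(self, response):
            pass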

    Step 3: project structure:
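
    A typical layout of the project created by the two commands above (the exact tree may vary slightly with the Scrapy version):

    Demo/
    ├── scrapy.cfg          # deploy configuration
    └── Demo/               # project package
        ├── __init__.py
        ├── items.py        # item definitions
        ├── middlewares.py
        ├── pipelines.py    # item pipelines (MySQL insert, step 4.3)
        ├── settings.py
        └── spiders/
            ├── __init__.py
            └── demo.py     # the spider generated above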

      

    Step 4: the code of each file, in order:

      1. demo.py (the spider code)

          

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy import Request
    from Demo.items import *
    from bs4 import BeautifulSoup
    import time
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')
    import re
    import hashlib
    
    # md5 hash of the auction URL, used for de-duplication
    def md5(src):
        m = hashlib.md5()
        m.update(src)
        return m.hexdigest()


    # strip HTML comments, newlines and extra whitespace from a tag
    def replace(newline):
        newline = str(newline)
        newline = newline.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '').replace('amp;', '')
        re_comment = re.compile('<!--[^>]*-->')
        newlines = re_comment.sub('', newline)
        newlines = newlines.replace('<!--', '').replace('-->', '')
        return newlines


    class DemoSpider(scrapy.Spider):
        name = 'demo'
        allowed_domains = ['auction.artron.net']  # domain only, not a full URL
        start_urls = ['http://auction.artron.net/result/pmh-0-0-2-0-1/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-2/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-4/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-5/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-6/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-7/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-8/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-9/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-10/',
                      'http://auction.artron.net/result/pmh-0-0-2-0-3/']

        def parse(self, response):
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            # the auction sessions sit in the first <ul class="dataList">
            result_lists = soup.find_all('ul', attrs={"class": "dataList"})[0]
            result_lists_replace = replace(result_lists)
            result_lists_replace = result_lists_replace.decode('utf-8')
            result_list = re.findall('<ul><li class="name">(.*?)</span></li></ul></li>', result_lists_replace)

            for ii in result_list:
                item = DemoItem()
                auction_name_url = re.findall('<a alt="(.*?)" href="(.*?)" target="_blank" title', ii)[0]
                auction_name = auction_name_url[0]
                auction_url = "http://auction.artron.net" + auction_name_url[1]
                aucr_name_spider = re.findall('<li class="company"><a href=".*?" target="_blank">(.*?)</a>', ii)[0]
                session_address_time = re.findall('<li class="city">(.*?)</li><li class="time">(.*?)</li></ul>', ii)[0]
                session_address = session_address_time[0]
                item_auct_time = session_address_time[1]
                hashcode = md5(str(auction_url))
                create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))

                item['auction_name'] = auction_name
                item['auction_url'] = auction_url
                item['aucr_name_spider'] = aucr_name_spider
                item['session_address'] = session_address
                item['item_auct_time'] = item_auct_time
                item['hashcode'] = hashcode
                item['create_time'] = create_time
                print item
                yield item
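
    Once all the files below are in place, the crawl is launched from the project root with the standard command:

      scrapy crawl demo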

    2. items.py

       

    # -*- coding: utf-8 -*-
    
    import scrapy
    
    
    class DemoItem(scrapy.Item):
        auction_name = scrapy.Field()
        auction_url = scrapy.Field()
        aucr_name_spider = scrapy.Field()
        session_address = scrapy.Field()
        item_auct_time = scrapy.Field()
        hashcode = scrapy.Field()
        create_time = scrapy.Field()

    3.     pipelines.py 

        

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import json
    import MySQLdb
    
    
    
    def insert_data(dbName,data_dict):
    
        try:
    
            data_values = "(" + "%s," * (len(data_dict)) + ")"
            data_values = data_values.replace(',)', ')')
    
            dbField = data_dict.keys()
            dataTuple = tuple(data_dict.values())
            dbField = str(tuple(dbField)).replace("'",'')
            conn = MySQLdb.connect(host="10.10.10.77", user="xuchunlin", passwd="ed35sdef456", db="epai_spider_2018", charset="utf8")
            cursor = conn.cursor()
            sql = """ insert into %s %s values %s """ % (dbName,dbField,data_values)
            params = dataTuple
            cursor.execute(sql, params)
            conn.commit()
            cursor.close()
            conn.close()
    
            print "=====  插入成功  ====="
            return 1
    
        except Exception as e:
            print "********                 插入失败                 ********"
            print e
            return 0
    
    
    class DemoPipeline(object):
    
        def process_item(self, item, spider):
            dbName = "yachang_auction"
            data_dict = item
            insert_data(dbName, data_dict)
            return item
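
    For reference, this small illustration traces the SQL string that insert_data() assembles for the 7-field DemoItem above (same string operations, written out by hand; the actual field order follows data_dict.keys()):

    # illustration only -- reproduces the string-building done in insert_data()
    fields = ('auction_name', 'auction_url', 'aucr_name_spider',
              'session_address', 'item_auct_time', 'hashcode', 'create_time')
    data_values = ("(" + "%s," * len(fields) + ")").replace(',)', ')')
    dbField = str(tuple(fields)).replace("'", '')
    sql = """ insert into %s %s values %s """ % ("yachang_auction", dbField, data_values)
    print sql
    # -> insert into yachang_auction (auction_name, auction_url, aucr_name_spider,
    #    session_address, item_auct_time, hashcode, create_time) values (%s,%s,%s,%s,%s,%s,%s)
    # the item values are passed separately as params, so MySQLdb handles the escaping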

    4. settings.py

      

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for Demo project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'Demo'
    
    SPIDER_MODULES = ['Demo.spiders']
    NEWSPIDER_MODULE = 'Demo.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'Demo (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    
    DEFAULT_REQUEST_HEADERS = {
    
        "Host":"auction.artron.net",
        # "Connection":"keep-alive",
        # "Upgrade-Insecure-Requests":"1",
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36",
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Referer":"http://auction.artron.net/result/pmh-0-0-2-0-2/",
        "Accept-Encoding":"gzip, deflate",
        "Accept-Language":"zh-CN,zh;q=0.8",
        "Cookie":"td_cookie=2322469817; gr_user_id=84f865e6-466f-4386-acfb-e524e8452c87; 
    gr_session_id_276fdc71b3c353173f111df9361be1bb=ee1eb94e-b7a9-4521-8409-439ec1958b6c; gr_session_id_276fdc71b3c353173f111df9361be1bb_ee1eb94e-b7a9-4521-8409-
    439ec1958b6c=true; _at_pt_0_=2351147; _at_pt_1_=A%E8%AE%B8%E6%98%A5%E6%9E%97; _at_pt_2_=e642b85a3cf8319a81f48ef8cc403d3b;
    Hm_lvt_851619594aa1d1fb8c108cde832cc127=1533086287,1533100514,1533280555,1534225608; Hm_lpvt_851619594aa1d1fb8c108cde832cc127=1534298942
    ", } # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'Demo.middlewares.DemoSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'Demo.middlewares.MyCustomDownloaderMiddleware': 543, #} # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { 'Demo.pipelines.DemoPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    5. Database table for the scraped data:

      

    CREATE TABLE `yachang_auction` (
      `key_id` int(255) NOT NULL AUTO_INCREMENT,
      `auction_name` varchar(255) DEFAULT NULL,
      `auction_url` varchar(255) DEFAULT NULL,
      `aucr_name_spider` varchar(255) DEFAULT NULL,
      `session_address` varchar(255) DEFAULT NULL,
      `item_auct_time` varchar(255) DEFAULT NULL,
      `hashcode` varchar(255) DEFAULT NULL,
      `create_time` varchar(255) DEFAULT NULL,
      PRIMARY KEY (`key_id`),
      UNIQUE KEY `hashcode` (`hashcode`) USING BTREE
    ) ENGINE=InnoDB AUTO_INCREMENT=230 DEFAULT CHARSET=utf8;
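
    The UNIQUE KEY on hashcode is what turns the md5 of auction_url into a de-duplication key: a second insert with the same hashcode is rejected by MySQL, insert_data() catches the exception, prints the failure and returns 0, and the crawl simply moves on. A minimal sketch of that behaviour (host, user and password are placeholders):

    import MySQLdb

    conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="***",
                           db="epai_spider_2018", charset="utf8")  # placeholder credentials
    cur = conn.cursor()
    sql = ("insert into yachang_auction (auction_name, auction_url, hashcode, create_time) "
           "values (%s, %s, %s, %s)")
    row = ("test auction", "http://auction.artron.net/result/test/",
           "0123456789abcdef0123456789abcdef", "2018-08-15 10:00:00")
    cur.execute(sql, row)
    conn.commit()
    try:
        cur.execute(sql, row)              # same hashcode a second time
        conn.commit()
    except MySQLdb.IntegrityError as e:
        print e                            # (1062, "Duplicate entry ... for key 'hashcode'")
    conn.close()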

    6. Data preview

      

  • Original article: https://www.cnblogs.com/xuchunlin/p/7253951.html