一 新浪新闻爬取
1 爬取新浪新闻(全站爬取)
项目搭建与开启
scrapy startproject sina
cd sina
scrapy genspider mysina http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml
2 项目setting配置
# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False

# Enable the storage pipeline; the integer is the pipeline's order value.
ITEM_PIPELINES = {'sina.pipelines.SinaPipeline': 300}
3 启动文件start.py配置
import scrapy.cmdline
def main():
    """Start the ``mysina`` spider through Scrapy's command-line entry point.

    This is the programmatic equivalent of typing ``scrapy crawl mysina``.
    To export items, append ``-o <file>`` to the argument list; the formats
    noted for ``-o`` are json, jsonlines, jl, csv, xml, marshal and pickle.
    """
    command = ['scrapy', 'crawl', 'mysina']
    scrapy.cmdline.execute(command)


if __name__ == '__main__':
    main()
4 需求目标item配置
import scrapy
class SinaItem(scrapy.Item):
    """Fields collected for one Sina news article."""

    # Headline text shown on the index page.
    newsTitle = scrapy.Field()
    # Link to the article page.
    newsUrl = scrapy.Field()
    # Timestamp text shown next to the headline.
    newsTime = scrapy.Field()
    # Article body text joined into a single string.
    content = scrapy.Field()
5 爬虫逻辑文件配置mysina.py
import scrapy
import requests
from lxml import etree
from sina import items
from scrapy.spiders import CrawlSpider,Rule #CrawlSpiders:定义了一些规则跟进link
from scrapy.linkextractors import LinkExtractor #提取链接
class MysinaSpider(CrawlSpider):
    """Crawl Sina's rolling-news index pages and yield one ``SinaItem`` per headline.

    The page callback is called ``getParse`` instead of ``parse`` because
    ``CrawlSpider`` relies on its own ``parse`` method to apply ``rules``;
    overriding it would disable link following.

    Article bodies are downloaded with the blocking ``requests`` library in
    ``getContent``. That keeps this variant simple, but it stalls Scrapy's
    event loop for the duration of each download.
    """

    name = 'mysina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml']

    # Rule(link_extractor, callback=None, cb_kwargs=None, follow=None,
    #      process_links=None, process_request=identity)
    # LinkExtractor (partial): allow=(), deny=(), allow_domains=(),
    #      deny_domains=(), restrict_xpaths=()
    #   allow    -- regex(es) a URL must match to be extracted
    #   deny     -- regex(es) that exclude a URL
    #   callback -- name of the method that handles each matched page
    #   follow   -- when True, keep extracting links from matched pages
    #
    # The pattern needs ``\d`` (digits) and an escaped dot. A bare ``d+`` only
    # matches literal "d" characters, so no index page would ever be followed.
    rules = [
        Rule(
            LinkExtractor(allow=(r'index_(\d+)\.shtml',)),
            callback='getParse',
            follow=True,
        ),
    ]

    def getParse(self, response):
        """Yield a ``SinaItem`` for every headline row on an index page.

        Rows without a link are skipped, because there is no article to fetch.
        A missing title or timestamp is stored as an empty string instead of
        aborting the whole page with an ``IndexError``.
        """
        for news in response.xpath("//ul[@class='list_009']/li"):
            newsUrl = news.xpath('./a/@href').extract_first()
            if not newsUrl:
                continue

            item = items.SinaItem()
            item['newsTitle'] = news.xpath('./a/text()').extract_first(default='')
            item['newsUrl'] = newsUrl
            item['newsTime'] = news.xpath('./span/text()').extract_first(default='')
            item['content'] = self.getContent(newsUrl)
            yield item

    def getContent(self, url):
        """Download ``url`` and return the article text as one string.

        Returns an empty string when the download fails or the page has no
        parsable markup, so a single bad article does not stop the crawl.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
        }
        try:
            # A timeout prevents one unresponsive server from hanging the spider.
            raw = requests.get(url, headers=headers, timeout=10).content
        except requests.RequestException as exc:
            self.logger.warning('Could not fetch article %s: %s', url, exc)
            return ''

        # ``content`` is bytes; decode leniently because pages may contain
        # byte sequences that are not valid UTF-8.
        html = raw.decode('utf-8', 'ignore')
        if not html.strip():
            return ''
        mytree = etree.HTML(html)
        if mytree is None:
            return ''

        contentList = mytree.xpath("//div[@class='article']//text()")
        # strip() trims surrounding whitespace; replace() drops newlines left
        # inside a text node so the article becomes one continuous string.
        return ''.join(c.strip().replace('\n', '') for c in contentList)
方法二 :mysina.py也可采用scrapy创建请求
# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree
from sina import items
from scrapy.spiders import CrawlSpider,Rule #CrawlSpiders:定义了一些规则跟进link
from scrapy.linkextractors import LinkExtractor #提取链接
class MysinaSpider(CrawlSpider):
    """Crawl Sina's rolling-news index pages, fetching articles through Scrapy.

    Each headline row becomes a ``scrapy.Request`` for the article page. The
    headline data travels with the request in ``meta`` and is combined with
    the article body in ``getMataContent``.
    """

    name = 'mysina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml']

    # ``\d`` and the escaped dot are required: a bare ``d+`` matches only
    # literal "d" characters, so no pagination link would be followed.
    rules = [
        Rule(
            LinkExtractor(allow=(r'index_(\d+)\.shtml',)),
            callback='getParse',
            follow=True,
        ),
    ]

    def getParse(self, response):
        """Yield one article request per headline row on an index page.

        Rows without a link are skipped. A missing title or timestamp is
        passed on as an empty string instead of raising ``IndexError``.
        """
        for news in response.xpath("//ul[@class='list_009']/li"):
            href = news.xpath('./a/@href').extract_first()
            if not href:
                continue

            # Resolve against the page URL so a relative href is still valid.
            newsUrl = response.urljoin(href)
            # Pass the list-page fields to the article callback through meta.
            meta = {
                'newsTitle': news.xpath('./a/text()').extract_first(default=''),
                'newsUrl': newsUrl,
                'newsTime': news.xpath('./span/text()').extract_first(default=''),
            }
            yield scrapy.Request(newsUrl, callback=self.getMataContent, meta=meta)

    def getMataContent(self, response):
        """Build the final ``SinaItem`` from an article response.

        ``response.meta`` carries the title, URL and time set in ``getParse``.
        The article body is read from this response.
        """
        texts = response.xpath("//div[@class='article']//text()").extract()
        content = ''.join(text.strip() for text in texts)

        item = items.SinaItem()
        item['newsTitle'] = response.meta['newsTitle']
        item['newsUrl'] = response.meta['newsUrl']
        item['newsTime'] = response.meta['newsTime']
        item['content'] = content
        yield item
6 管道存储pipelines.py
import pymysql
class SinaPipeline(object):
    """Store every scraped news item as a row in the ``sina_news`` table."""

    def __init__(self):
        # Both are created in open_spider() and released in close_spider().
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection and cursor when the spider starts."""
        self.conn = pymysql.connect(
            host='111.230.169.xxx',
            user='root',
            password='xxx',
            database='sina',
            port=3306,
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``sina_news`` and return it unchanged.

        Values are passed as query parameters so the driver escapes them.
        Formatting them into the SQL text with ``%r`` breaks on content that
        contains quotes or backslashes and allows SQL injection from scraped
        pages.
        """
        sql = (
            'insert into sina_news(newsTitle,newsUrl,newsTime,content) '
            'VALUES (%s,%s,%s,%s)'
        )
        params = (
            item['newsTitle'],
            item['newsUrl'],
            item['newsTime'],
            item['content'],
        )
        self.cursor.execute(sql, params)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and connection, if they were ever opened."""
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
方法二 : pipelines.py 补充快速创建sql语句
import pymysql
class DemoPipeline(object):
    """Store items in ``sina_news``, deriving the column list from the item.

    The INSERT statement is generated from the item's own field names, so the
    table columns must be named exactly like the item fields.
    """

    def __init__(self):
        # Both are created in open_spider() and released in close_spider().
        self.conn = None
        self.cur = None

    def open_spider(self, spider):
        """Open the MySQL connection and cursor when the spider starts."""
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            db='fate',
            charset='utf8',
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``sina_news`` and return it unchanged.

        An item with no populated fields is returned untouched, because there
        is nothing to insert.
        """
        if not item:
            return item

        # zip(*pairs) transposes [(name, value), ...] into (names, values).
        cols, values = zip(*item.items())
        # The whole % operation is wrapped in one parenthesised expression;
        # a bare ``%`` at the end of a line is a syntax error.
        sql = "INSERT INTO `%s` (%s) VALUES (%s)" % (
            'sina_news',
            ','.join(cols),
            ','.join(['%s'] * len(values)),
        )
        # The driver fills each %s placeholder with the matching value.
        self.cur.execute(sql, values)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and connection, if they were ever opened."""
        if self.cur is not None:
            self.cur.close()
        if self.conn is not None:
            self.conn.close()
二 百科资料的爬取
1 百科资料爬取
项目搭建与开启
scrapy startproject baike
cd baike
scrapy genspider mybaike baike.baidu.com/item/Python/407313
2 项目setting配置
# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False

# Headers attached to every request; the User-Agent is a desktop Chrome string.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}

# Enable the storage pipeline; the integer is the pipeline's order value.
ITEM_PIPELINES = {'baike.pipelines.BaikePipeline': 300}
3 启动文件start.py配置
import scrapy.cmdline
def main():
    """Start the ``mybaike`` spider through Scrapy's command-line entry point.

    This is the programmatic equivalent of typing ``scrapy crawl mybaike``.
    To export items, append ``-o <file>`` to the argument list; the formats
    noted for ``-o`` are json, jsonlines, jl, csv, xml, marshal and pickle.
    """
    command = ['scrapy', 'crawl', 'mybaike']
    scrapy.cmdline.execute(command)


if __name__ == '__main__':
    main()
4 需求目标items配置
import scrapy
class BaikeItem(scrapy.Item):
    """Fields collected for one Baidu Baike entry page."""

    # Main entry title (the page's <h1> text).
    level1Title = scrapy.Field()
    # Subtitle (the page's <h2> text), or a placeholder when absent.
    level2Title = scrapy.Field()
    # Summary text of the entry joined into one string.
    content = scrapy.Field()
5 爬虫逻辑文件配置mybaike.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from baike.items import BaikeItem
class MybaikeSpider(CrawlSpider):
    """Follow ``item/...`` links on Baidu Baike and yield one item per entry page."""

    name = 'mybaike'
    allowed_domains = ['baike.baidu.com']
    start_urls = ['https://baike.baidu.com/item/Python/407313']

    # Every link whose URL contains "item/" is treated as an entry page and is
    # itself scanned for further links (follow=True).
    rules = [
        Rule(
            LinkExtractor(allow=(r'item/(.*)',)),
            callback='getParse',
            follow=True,
        ),
    ]

    def getParse(self, response):
        """Extract title, subtitle and summary from one entry page.

        Pages that lack the title node yield nothing. Indexing the empty
        result would raise ``IndexError`` and log a traceback for every such
        page reached by the broad link rule.
        """
        title_box = "//dd[@class='lemmaWgt-lemmaTitle-title']"

        level1Title = response.xpath(title_box + "/h1/text()").extract_first()
        if level1Title is None:
            return

        # Entries without a subtitle get the placeholder text instead.
        level2Title = response.xpath(title_box + "/h2/text()").extract_first(default='待编辑')

        summary_parts = response.xpath("//div[@class='lemma-summary']//text()").extract()
        content = ''.join(summary_parts)

        item = BaikeItem()
        item['level1Title'] = level1Title
        item['level2Title'] = level2Title
        item['content'] = content
        yield item
6 管道存储pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class BaikePipeline(object):
    """Store items in the ``baike`` table, deriving the column list from the item.

    The INSERT statement is generated from the item's own field names, so the
    table columns must be named exactly like the item fields.
    """

    def __init__(self):
        # Both are created in open_spider() and released in close_spider().
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection and cursor when the spider starts."""
        # NOTE(review): host and password are hard-coded here; move them to
        # project settings or environment variables before sharing this file.
        self.conn = pymysql.connect(
            host='111.230.169.107',
            user='root',
            password="20111673",
            database='baike',
            port=3306,
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``baike`` and return it unchanged.

        An item with no populated fields is returned untouched, because there
        is nothing to insert.
        """
        if not item:
            return item

        # zip(*pairs) transposes [(name, value), ...] into (names, values).
        cols, values = zip(*item.items())
        # The table name is wrapped in backticks. The whole % operation sits
        # in one parenthesised expression; a bare ``%`` at the end of a line
        # is a syntax error.
        sql = "INSERT INTO `%s`(%s) VALUES (%s)" % (
            'baike',
            ','.join(cols),
            ','.join(['%s'] * len(values)),
        )
        # The driver fills each %s placeholder with the matching value.
        self.cursor.execute(sql, values)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and connection, if they were ever opened."""
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
三 豆瓣电影的爬取
1 豆瓣电影排行榜
项目搭建与开启
scrapy startproject douban
cd douban
scrapy genspider mydouban movie.douban.com/top250
2 项目setting配置
# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False

# Headers attached to every request; the User-Agent is a desktop Chrome string.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
}

# Enable the storage pipeline; the integer is the pipeline's order value.
ITEM_PIPELINES = {'douban.pipelines.DoubanPipeline': 300}
3 启动文件start.py配置
import scrapy.cmdline
# Must equal the ``name`` attribute of the spider class in this project.
# The launcher previously asked for 'mybaike', a spider from another project,
# so the crawl could not start.
SPIDER_NAME = 'mydouban'


def main():
    """Start the Douban spider through Scrapy's command-line entry point.

    This is the programmatic equivalent of typing ``scrapy crawl mydouban``.
    To export items, append ``-o <file>`` to the argument list; the formats
    noted for ``-o`` are json, jsonlines, jl, csv, xml, marshal and pickle.
    """
    scrapy.cmdline.execute(['scrapy', 'crawl', SPIDER_NAME])


if __name__ == '__main__':
    main()
4 需求目标items配置
import scrapy
class DoubanItem(scrapy.Item):
    """Fields collected for one movie on the Douban Top 250 list."""

    # Movie title (read from the poster image's alt text).
    name = scrapy.Field()
    # First text line of the movie's description paragraph.
    movieInfo = scrapy.Field()
    # Rating score text.
    star = scrapy.Field()
    # Number captured in front of "人评价" in the rating line.
    quote = scrapy.Field()
5 爬虫逻辑文件配置mydouban.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from douban.items import DoubanItem
class MydoubanSpider(scrapy.Spider):
    """Walk the Douban Top 250 pages and yield one ``DoubanItem`` per movie."""

    name = 'mydouban'
    url = ['https://movie.douban.com/top250']
    # Approach 1: let Scrapy build the first request from start_urls.
    start_urls = ['https://movie.douban.com/top250']

    # Approach 2: build the first request by hand to send custom headers.
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    # }
    #
    # def start_requests(self):
    #     url = 'https://movie.douban.com/top250'
    #     yield Request(url, headers=self.headers)

    def parse(self, response):
        """Yield every movie on the page, then a request for the next page.

        A fresh item is created for each movie. Re-using one instance would
        hand the same mutable object to the pipeline repeatedly, so later
        movies could overwrite earlier ones before they are stored.
        """
        for movie in response.xpath('//ol[@class="grid_view"]/li'):
            name = movie.xpath(".//div[@class='pic']/a/img/@alt").extract_first()
            if not name:
                # Not a regular movie entry; nothing useful to store.
                continue

            info_node = movie.xpath(".//div[@class='info']/div[@class='bd']")

            item = DoubanItem()
            item['name'] = name
            item['movieInfo'] = info_node.xpath("./p/text()").extract_first(default='').strip()
            item['star'] = info_node.xpath("./div[@class='star']/span[2]/text()").extract_first()
            # ``\d+`` captures the count in front of "人评价". A bare ``d+``
            # never matches digits, so indexing the empty result raised
            # IndexError on the first movie.
            item['quote'] = movie.xpath('.//div[@class="star"]/span/text()').re_first(r'(\d+)人评价')
            yield item

        # Follow the "next page" link, if there is one.
        next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
        if next_href:
            # urljoin resolves the href against the current page URL.
            yield Request(response.urljoin(next_href), callback=self.parse)
6 管道存储pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class DoubanPipeline(object):
    """Store every scraped movie as a row in the ``Movie`` table.

    The table is emptied when the pipeline is created, so each crawl starts
    from a clean table.
    """

    def __init__(self):
        self.conn = pymysql.connect(
            host='111.230.169.107',
            port=3306,
            user='root',
            passwd='xxx',
            database='douban',
            charset='utf8',
        )
        self.cursor = self.conn.cursor()
        # Clear the rows left by the previous run.
        self.cursor.execute("truncate table Movie")
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert ``item`` into ``Movie`` and return it unchanged.

        A failed insert is rolled back and reported together with the
        database error. The item is still returned, so one bad row does not
        stop the crawl.
        """
        row = (item['name'], item['movieInfo'], item['star'], item['quote'])
        try:
            self.cursor.execute(
                "insert into Movie (name,movieInfo,star,quote) VALUES (%s,%s,%s,%s)",
                row,
            )
            self.conn.commit()
        except pymysql.Error as exc:
            # Discard the failed statement so the connection stays usable.
            self.conn.rollback()
            print("Error%s,%s,%s,%s: %s" % (row + (exc,)))
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()