We are going to use Scrapy to crawl company information from an enterprise-directory website and save it to a MySQL database; in total there are roughly 220,000 records.
Step 1: start by defining the fields to scrape in items.py.
import scrapy

class RepairSpiderItem(scrapy.Item):
    city_name = scrapy.Field()
    area_name = scrapy.Field()
    company_name = scrapy.Field()
    company_address = scrapy.Field()
    phone = scrapy.Field()
    mobile_phone = scrapy.Field()
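Since scrapy.Item subclasses behave like dictionaries, a quick check in a Python shell makes the field definitions concrete. The values below are made up purely for illustration:

from repair_spider.items import RepairSpiderItem

item = RepairSpiderItem()
item['company_name'] = '示例维修店'   # made-up value, purely for illustration
item['phone'] = '010-12345678'       # made-up value, purely for illustration
print(item['company_name'])          # field access works exactly like a dict
print(dict(item))                    # convert to a plain dict, e.g. for a JSON dump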
Step 2: define the spider's crawling logic.
from lxml import etree
from scrapy import Request

from repair_spider.items import RepairSpiderItem

# Both callbacks below live inside the spider class.

def start_requests(self):
    url = 'http://xiu.iqixiu.cn/'
    yield Request(url=url, callback=self.parse)

def parse(self, response):
    html1 = etree.HTML(response.text)
    tr_list = html1.xpath('/html/body/table/tbody/tr')  # one element per table row
    for tr in tr_list:  # walk the rows
        td_list = tr.xpath('./td[2]/font')  # pull out the <font> cells that wrap the links
        for td in td_list:
            href = td.xpath('./a/@href')[0]
            print('href:', href)
            # the hrefs contain literal braces, which must be percent-encoded
            href_url = 'http://xiu.iqixiu.cn/' + href.replace('{', '%7B').replace('}', '%7D')
            print('href_url:', href_url)
            yield Request(url=href_url, callback=self.parse_dail)

def parse_dail(self, response):
    html = etree.HTML(response.text)
    tr_list = html.xpath('//table/tbody/tr')
    for tr in tr_list[2:]:
        # grab the full text content of this row
        name_tr = tr.xpath('string()')
        # normalize the whitespace and split the row into a list, so fields such as
        # the address and phone number can be picked out by index; the characters
        # being stripped were garbled in the original post and are presumably the
        # non-breaking (\xa0) and full-width (\u3000) spaces the site uses
        name_list = name_tr.replace('\xa0', '').replace('\u3000', '').split(' ')
        print(name_list)
        # the last attribute on the cell holds the full company name; the cell
        # text can be truncated, so prefer the attribute value, quotes stripped
        print(tr.xpath('./td[4]/@*')[-1].replace("'", ''))
        # the header row tells us whether this page's table has a separate city column
        if '名称' in tr_list[1].xpath('string()').replace('\xa0', '').replace('\u3000', '').split(' ')[3]:
            city_name = tr_list[0].xpath('string()').split('市')[0] + '市'
            area_name = name_list[2]
            if tr.xpath('./td[3]/@*')[-1].replace("'", '') == 'nowrap':
                company_name = name_list[3]
            else:
                company_name = tr.xpath('./td[3]/@*')[-1].replace("'", '')
            company_address = name_list[4]
            phone = name_list[5]
            mobile_phone = name_list[6]
        else:
            city_name = name_list[2]
            area_name = name_list[3]
            company_name = tr.xpath('./td[4]/@*')[-1].replace("'", '')
            company_address = name_list[5]
            phone = name_list[6]
            mobile_phone = name_list[7]
        # populate the dict-like item and hand it off to the pipeline
        repair_item = RepairSpiderItem()
        repair_item['city_name'] = city_name
        repair_item['area_name'] = area_name
        repair_item['company_name'] = company_name
        repair_item['company_address'] = company_address
        repair_item['phone'] = phone
        repair_item['mobile_phone'] = mobile_phone
        yield repair_item
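As a side note, the same link extraction can be written with Scrapy's built-in selectors instead of lxml, which drops the extra dependency. One gotcha worth knowing: browsers insert <tbody> automatically, so if the raw HTML does not actually contain it, an XPath with /tbody/ matches nothing. A minimal sketch of the parse callback in that style, with XPaths mirroring the ones above:

def parse(self, response):
    # Scrapy's built-in selectors replace the etree.HTML step entirely
    for tr in response.xpath('//table//tr'):
        for href in tr.xpath('./td[2]/font/a/@href').getall():
            href_url = 'http://xiu.iqixiu.cn/' + href.replace('{', '%7B').replace('}', '%7D')
            yield Request(url=href_url, callback=self.parse_dail)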
Step 3: open up the storage pipeline in settings.py, i.e. uncomment the ITEM_PIPELINES block that is commented out by default (around lines 67-68 of the generated file).
ITEM_PIPELINES = {
    'repair_spider.pipelines.RepairSpiderPipeline': 300,
}
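With roughly 220,000 records to fetch, it is also worth thinking about politeness and resilience while you are in settings.py. These are all standard Scrapy settings; the values below are illustrative, not tuned for this site:

DOWNLOAD_DELAY = 0.5                 # pause between requests to the same domain
CONCURRENT_REQUESTS_PER_DOMAIN = 8   # cap parallel requests per domain
RETRY_TIMES = 3                      # retry transient failures a few times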
Step 4: define the storage logic in pipelines.py.
import pymysql

class RepairSpiderPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='193.112.180.37', user='root',
                                    password='***********', port=3306, db='*********')
        self.cursor = self.conn.cursor()
        # create the target table on first run if it does not exist yet
        self.cursor.execute(
            "create table IF NOT EXISTS klkl_Service_shop("
            "city_name char(50), area_name char(50), company_name char(100), "
            "company_address char(100), phone char(50), mobile_phone char(50));"
        )
        self.conn.commit()

    def process_item(self, item, spider):
        sql = ('insert into klkl_Service_shop'
               '(city_name,area_name,company_name,company_address,phone,mobile_phone) '
               'values (%s,%s,%s,%s,%s,%s)')
        try:
            self.conn.ping(reconnect=True)  # re-open the connection if MySQL dropped it
            self.cursor.execute(sql, (item['city_name'], item['area_name'],
                                      item['company_name'], item['company_address'],
                                      item['phone'], item['mobile_phone']))
            self.conn.commit()
            print('klkl_Service_shop insert succeeded')
        except Exception as e:
            self.conn.rollback()
            print('klkl_Service_shop insert failed:', e)
        return item
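Committing once per item works, but with ~220k rows the round trips add up. If throughput becomes a problem, one common variation is to buffer items and flush them in batches with executemany. The sketch below assumes the same table and credentials as above; the batch size of 500 is an arbitrary choice:

import pymysql

class BufferedRepairSpiderPipeline(object):
    """Hypothetical variant: batches inserts instead of committing per item."""

    BATCH_SIZE = 500  # arbitrary; tune to taste

    def __init__(self):
        self.conn = pymysql.connect(host='193.112.180.37', user='root',
                                    password='***********', port=3306, db='*********')
        self.cursor = self.conn.cursor()
        self.buffer = []

    def process_item(self, item, spider):
        self.buffer.append((item['city_name'], item['area_name'], item['company_name'],
                            item['company_address'], item['phone'], item['mobile_phone']))
        if len(self.buffer) >= self.BATCH_SIZE:
            self.flush()
        return item

    def flush(self):
        sql = ('insert into klkl_Service_shop'
               '(city_name,area_name,company_name,company_address,phone,mobile_phone) '
               'values (%s,%s,%s,%s,%s,%s)')
        self.conn.ping(reconnect=True)
        self.cursor.executemany(sql, self.buffer)  # one round trip per batch
        self.conn.commit()
        self.buffer = []

    def close_spider(self, spider):
        self.flush()          # write out whatever is left in the buffer
        self.cursor.close()
        self.conn.close()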