The Qichacha (企查查) website aggregates detailed information on registered companies. To make it easier to look up company information, I crawled the companies registered in Anhui Province from the site. The problems encountered and the techniques used are described below:
1. Problems encountered:
1> The PC version of Qichacha only shows the first 500 pages of results. To crawl as much of the data as possible, the crawl was split up by city, covering 16 cities in Anhui Province and about 80,000 company records in total;
2> If the crawl runs too fast, the site starts demanding manual (captcha) verification. To get past this, and to avoid having the account blocked, every request is sent through a randomly rotated proxy IP. Free proxies can be obtained from the "89 Free Proxy" site at http://www.89ip.cn/, which returns 30 proxy IPs at a time; if that is not enough, you can extract several batches and build a proxy pool (a minimal pool sketch follows this section). In my tests the free proxies from this site work noticeably better than the free ones from Xici Proxy (西刺代理) or Kuaidaili (快代理).
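The core code in section 3 below imports IP_LIST from a proxy_ip module that is not shown in this post. The following is only a minimal sketch of what such a module could look like, assuming the proxies are scraped from the http://www.89ip.cn/ front page; the parsing regex is a guess at that page's table layout and may need adjusting.

# proxy_ip.py -- hypothetical sketch; builds a list of requests-style proxy dicts
import re
import requests

IP_LIST = []

def build_ip_list():
    # One fetch returns roughly 30 free proxies; call again to grow the pool
    html = requests.get('http://www.89ip.cn/', timeout=5).text
    for ip, port in re.findall(r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>\s*(\d+)', html):
        proxy = 'http://{}:{}'.format(ip, port)
        IP_LIST.append({'http': proxy, 'https': proxy})

build_ip_list()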
2. Techniques used:
1> Request module: requests; to dodge anti-crawling measures, a random proxy is used for each request, and fake_useragent generates a random User-Agent;
2> Parsing: XPath (via lxml) and regular expressions;
3> Speed optimization: multithreading, plus saving the scraped rows in batches instead of one record at a time, to avoid frequent disk I/O (see the batching sketch right after this list);
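The batched write mentioned in item 3> can be as simple as collecting rows in a list and flushing them with a single writerows() call. This is only a sketch of the idea; note that save_data() in the core code below still appends one row at a time, opening the CSV file for each record.

import csv

def save_all(rows, path='./1111.csv'):
    # rows: a list of company records, each itself a list of fields;
    # one open() + writerows() instead of one open() per record
    with open(path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)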
3. The core code is as follows:
import requests
from lxml import etree
from queue import Queue
from threading import Thread
from fake_useragent import UserAgent
import csv
import os
import re
import random
import time
from ippools import ProxySpider
from proxy_ip import IP_LIST


class QichachaSpider:
    def __init__(self):
        self.url = 'https://www.qichacha.com/gongsi_area.html?prov={}&city={}&p={}'
        self.q = Queue()
        self.company_info = []
        self.headers = {
            'Host': 'www.qichacha.com',
            'Referer': 'https://www.qichacha.com/',
            'X-Requested-With': 'XMLHttpRequest'
        }

    # Random User-Agent
    def random_ua(self):
        ua = UserAgent()
        return ua.random

    # Random proxy IP
    def random_proxy(self):
        proxy_list = ProxySpider().get_training_ip('https://www.qichacha.com/')
        return proxy_list

    # Put all target URLs into the queue
    def put_url(self):
        self.headers['User-Agent'] = self.random_ua()
        url = 'https://www.qichacha.com/'
        html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//div[@class="areacom"]/div[2]/div[2]/a/@href')
        for r in r_list:
            link = r.split('_')[1:]
            for i in range(1, 501):
                url = self.url.format(link[0], link[1], i)
                print(url)
                self.q.put(url)

    # Scrape the list (first-level) pages
    def get_data(self):
        while True:
            if not self.q.empty():
                url = self.q.get()
                self.headers['User-Agent'] = self.random_ua()
                # proxies = self.random_proxy()
                proxies = random.choice(IP_LIST)
                try:
                    html = requests.get(url, headers=self.headers, proxies=proxies,
                                        timeout=3).content.decode('utf-8', 'ignore')
                    # html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
                    # time.sleep(random.uniform(0.5, 1.5))
                    parse_html = etree.HTML(html)
                    company_list = parse_html.xpath('//table[@class="m_srchList"]/tbody/tr')
                    if company_list:
                        for company in company_list:
                            try:
                                company_name = company.xpath('./td[2]/a/text()')[0].strip()
                                company_link = 'https://www.qichacha.com' + company.xpath('./td[2]/a/@href')[0].strip()
                                company_type, company_industry, company_business_scope = self.get_company_info(
                                    company_link)
                                company_person = company.xpath('./td[2]/p[1]/a/text()')[0].strip()
                                company_money = company.xpath('./td[2]/p[1]/span[1]/text()')[0].split(':')[-1].strip()
                                company_time = company.xpath('./td[2]/p[1]/span[2]/text()')[0].split(':')[-1].strip()
                                company_email = company.xpath('./td[2]/p[2]/text()')[0].split(':')[-1].strip()
                                company_phone = company.xpath('./td[2]/p[2]/span/text()')[0].split(':')[-1].strip()
                                company_address = company.xpath('./td[2]/p[3]/text()')[0].split(':')[-1].strip()
                                company_status = company.xpath('./td[3]/span/text()')[0].strip()
                                company_dict = {
                                    '公司名称': company_name,
                                    '公司链接': company_link,
                                    '公司类型': company_type,
                                    '所属行业': company_industry,
                                    '经营范围': company_business_scope,
                                    '公司法人': company_person,
                                    '注册资本': company_money,
                                    '注册时间': company_time,
                                    '邮箱': company_email,
                                    '电话': company_phone,
                                    '地址': company_address,
                                    '是否存续': company_status,
                                }
                                print(company_dict)
                                info_list = [company_name, company_link, company_type, company_industry,
                                             company_business_scope, company_person, company_money, company_time,
                                             company_email, company_phone, company_address, company_status]
                                self.save_data(info_list)
                            except Exception:
                                # Log the URL that failed to parse so it can be re-crawled later
                                with open('./bad.csv', 'a', encoding='utf-8', newline='') as f:
                                    writer = csv.writer(f)
                                    writer.writerow([url])
                                continue
                except Exception:
                    # Request failed (bad proxy, timeout, ...): put the URL back into the queue
                    self.q.put(url)
            else:
                break

    # Scrape the company detail (second-level) page
    def get_company_info(self, company_link):
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                            timeout=3).content.decode('utf-8', 'ignore')
        while True:
            if '企业类型' not in html:
                # Got a captcha / blocked page: retry with another random proxy
                html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                                    timeout=3).content.decode('utf-8', 'ignore')
            else:
                break
        try:
            company_type = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_industry = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_business_scope = re.findall(r'经营范围.*?"3">(.*?)</td>', html, re.S)[0].strip()
            return company_type, company_industry, company_business_scope
        except Exception:
            return '无', '无', '无'

    # Append one row of data to the CSV file
    def save_data(self, info):
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(info)

    def main(self):
        if os.path.exists('./1111.csv'):
            os.remove('./1111.csv')
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['公司名称', '公司链接', '公司类型', '所属行业', '经营范围', '公司法人',
                             '注册资本', '注册时间', '邮箱', '电话', '地址', '是否存续'])
        self.put_url()
        t_list = []
        for i in range(0, 10):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        for j in t_list:
            j.join()


if __name__ == "__main__":
    spider = QichachaSpider()
    spider.main()
To further improve the crawl speed (and, admittedly, to make the project look more impressive), the same crawl is implemented below with the Scrapy framework. The code is as follows:
1. items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QichachaItem(scrapy.Item):
    company_name = scrapy.Field()
    company_person = scrapy.Field()
    company_money = scrapy.Field()
    company_establish = scrapy.Field()
    company_email = scrapy.Field()
    company_phone = scrapy.Field()
    company_address = scrapy.Field()
    company_risk = scrapy.Field()
    company_status = scrapy.Field()
    company_type = scrapy.Field()
    company_trade = scrapy.Field()
    company_business_scope = scrapy.Field()
    company_link = scrapy.Field()
    company_city = scrapy.Field()
2. qichacha.py
# -*- coding: utf-8 -*-
import scrapy
import time
import random
import re
from ..items import QichachaItem


class QichachaSpider(scrapy.Spider):
    name = 'qichacha'
    allowed_domains = ['www.qichacha.com']
    base_url = 'https://www.qichacha.com/gongsi_area.html?prov=AH&city={}&p={}'
    city_code_list = [340100, 340200, 340300, 340400, 340500, 340600, 340700, 340800,
                      341000, 341100, 341200, 341300, 341500, 341600, 341700, 341800]
    city_name_list = ['合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市', '铜陵市', '安庆市',
                      '黄山市', '滁州市', '阜阳市', '宿州市', '六安市', '亳州市', '池州市', '宣城市']
    base_company_url = 'https://www.qichacha.com{}'

    def start_requests(self):
        for i in range(len(self.city_code_list)):
            for j in range(1, 501):
                item = QichachaItem()
                item['company_city'] = self.city_name_list[i]
                url = self.base_url.format(self.city_code_list[i], j)
                yield scrapy.Request(
                    url=url,
                    meta={'item': item},
                    callback=self.parse_page
                )
                time.sleep(random.randint(30, 60))

    def parse_page(self, response):
        city = response.meta['item']['company_city']
        company_list = response.xpath('//*[@id="searchlist"]/table/tbody/tr')
        for company in company_list:
            # Use a fresh item per row; sharing one item object across rows would
            # let the concurrent detail requests overwrite each other's fields.
            item = QichachaItem()
            item['company_city'] = city
            item['company_name'] = company.xpath('td[2]/a/text()').extract_first()
            item['company_link'] = self.base_company_url.format(company.xpath('td[2]/a/@href').extract_first())
            item['company_person'] = company.xpath('td[2]/p[1]/a/text()').extract_first()
            item['company_money'] = company.xpath('td[2]/p[1]/span[1]/text()').extract_first().split(':')[-1]
            item['company_establish'] = company.xpath('td[2]/p[1]/span[2]/text()').extract_first().split(':')[-1]
            item['company_email'] = company.xpath('td[2]/p[2]/text()').extract_first().split(':')[-1].strip()
            item['company_phone'] = company.xpath('td[2]/p[2]/span/text()').extract_first().split(':')[-1]
            item['company_address'] = company.xpath('td[2]/p[3]/text()').extract_first().split(':')[-1].strip()
            item['company_status'] = company.xpath('td[3]/span/text()').extract_first().split(':')[-1]
            yield scrapy.Request(
                url=item['company_link'],
                meta={'item': item},
                callback=self.parse_company
            )
            time.sleep(random.randint(10, 20))

    def parse_company(self, response):
        item = response.meta['item']
        html = response.text
        if re.findall(r'<h2>经营风险.*?<span>(.*?)</span>', html, re.S):
            item['company_risk'] = re.findall(r'<h2>经营风险.*?<span>(.*?)</span>', html, re.S)[0].strip()
        else:
            item['company_risk'] = '-'
        if re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S):
            item['company_type'] = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
        else:
            item['company_type'] = '-'
        if re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S):
            item['company_trade'] = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
        else:
            item['company_trade'] = '-'
        if re.findall(r'经营范围</td> <td class="" colspan="3">(.*?)</td>', html, re.S):
            item['company_business_scope'] = re.findall(r'经营范围</td> <td class="" colspan="3">(.*?)</td>',
                                                        html, re.S)[0].strip()
        else:
            item['company_business_scope'] = '-'
        yield item
        time.sleep(random.uniform(0.5, 1))
3. pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
from .settings import *


class QichachaPipeline(object):
    def process_item(self, item, spider):
        print([item['company_name'], item['company_person'], item['company_money'], item['company_establish'],
               item['company_email'], item['company_phone'], item['company_address'], item['company_risk'],
               item['company_status'], item['company_type'], item['company_trade'], item['company_link'],
               item['company_city'], item['company_business_scope']])
        return item


class MysqlPipeline(object):
    def open_spider(self, spider):
        self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER,
                                  password=MYSQL_PWD, database=MYSQL_DB, charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        ins = 'insert into qichachatab values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        info_list = [item['company_name'], item['company_city'], item['company_person'], item['company_money'],
                     item['company_establish'], item['company_email'], item['company_phone'], item['company_address'],
                     item['company_risk'], item['company_status'], item['company_type'], item['company_trade'],
                     item['company_link'], item['company_business_scope']]
        self.cursor.execute(ins, info_list)
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
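MysqlPipeline assumes a qichachadb database with a 14-column qichachatab table whose column order matches the INSERT statement above; the schema itself is not shown in this post. A minimal sketch for creating it (column names and types are my own assumption) could look like this:

import pymysql

# Connection values taken from settings.py below
db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', charset='utf8')
cursor = db.cursor()
cursor.execute('create database if not exists qichachadb charset utf8')
cursor.execute('use qichachadb')
cursor.execute("""
    create table if not exists qichachatab(
        company_name varchar(200),
        company_city varchar(50),
        company_person varchar(100),
        company_money varchar(50),
        company_establish varchar(50),
        company_email varchar(100),
        company_phone varchar(50),
        company_address varchar(300),
        company_risk varchar(50),
        company_status varchar(50),
        company_type varchar(100),
        company_trade varchar(100),
        company_link varchar(300),
        company_business_scope text
    ) charset=utf8
""")
db.commit()
cursor.close()
db.close()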
4. middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

import redis
from scrapy import signals
from fake_useragent import UserAgent

from .settings import *
from .proxies import ProxypoolSpider


class QichachaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class QichachaDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RandomUserAgentDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Attach a random User-Agent to every outgoing request
        request.headers['User-Agent'] = UserAgent().random


class RandomProxyDownloaderMiddleware(object):
    def __init__(self):
        self.db = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PWD)

    def process_request(self, request, spider):
        # Pick a proxy scored 90-100 from the Redis pool; refill the pool first if it is empty
        proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        if len(proxy_list) == 0:
            proxy_spider = ProxypoolSpider()
            proxy_spider.get_proxy()
            proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        proxy = random.choice(proxy_list)[0].decode('utf-8')
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        print(response.status, ": ", request.url)
        if response.status != 200:
            # Penalize the proxy and retry the request with another one
            self.db.zincrby(REDIS_PROXY_KEY, -1, request.meta['proxy'])
            del request.meta['proxy']
            return request
        return response

    def process_exception(self, request, exception, spider):
        # Download failed: lower the current proxy's score and retry
        cur_proxy = request.meta['proxy']
        print('Proxy request failed, lowering its score')
        self.db.zincrby(REDIS_PROXY_KEY, -1, cur_proxy)
        del request.meta['proxy']
        return request
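RandomProxyDownloaderMiddleware imports ProxypoolSpider from a proxies.py module that is not included in this post. Below is a minimal sketch of what that module might look like, assuming get_proxy() scrapes http://www.89ip.cn/ and seeds each proxy into the Redis sorted set REDIS_PROXY_KEY with an initial score of 100 (the score band the middleware filters on); the class and method names match what the middleware imports, everything else is an assumption.

# proxies.py -- hypothetical sketch of the Redis-backed proxy pool
import re

import redis
import requests

from .settings import REDIS_HOST, REDIS_PORT, REDIS_PWD, REDIS_DB, REDIS_PROXY_KEY


class ProxypoolSpider(object):
    def __init__(self):
        self.db = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PWD)

    def get_proxy(self):
        # Grab a page of free proxies and store them as "http://ip:port"
        # in the sorted set with an initial score of 100
        html = requests.get('http://www.89ip.cn/', timeout=5).text
        for ip, port in re.findall(r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>\s*(\d+)', html):
            self.db.zadd(REDIS_PROXY_KEY, {'http://{}:{}'.format(ip, port): 100})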
5. settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for Qichacha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

BOT_NAME = 'Qichacha'

SPIDER_MODULES = ['Qichacha.spiders']
NEWSPIDER_MODULE = 'Qichacha.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY must be a number of seconds, so assign the random value directly
DOWNLOAD_DELAY = random.uniform(10, 20)
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Cookie': 'UM_distinctid=16bb7adb9252d8-09b3389bed6ae2-3a65420e-1fa400-16bb7adb92636e; zg_did=%7B%22did%22%3A%20%2216bb7adbb84740-04a7e287a3fa12-3a65420e-1fa400-16bb7adbb85669%22%7D; _uab_collina=156215474498922246746771; zg_63e87cf22c3e4816a30bfbae9ded4af2=%7B%22sid%22%3A%201562193465906%2C%22updated%22%3A%201562193465917%2C%22info%22%3A%201562193465914%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%7D; QCCSESSID=lnr0huo5t5s058h9tmlso56nu1; acw_tc=65e21c2915648350628141700eeaf85114e84964375db3a9f1b718d751; CNZZDATA1254842228=845561946-1562153840-https%253A%252F%252Fwww.baidu.com%252F%7C1565064428; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1565048078,1565048449,1565048590,1565067806; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201565067805845%2C%22updated%22%3A%201565069298085%2C%22info%22%3A%201564658796236%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22sp0.baidu.com%22%2C%22cuid%22%3A%20%22d48f6830513b318400fcc23636a23a7f%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1565069298',
    'Referer': 'https://www.qichacha.com/',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'Qichacha.middlewares.QichachaSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'Qichacha.middlewares.QichachaDownloaderMiddleware': 543,
    # 'Qichacha.middlewares.RandomUserAgentDownloaderMiddleware': 200,
    # 'Qichacha.middlewares.RandomProxyDownloaderMiddleware': 250,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Qichacha.pipelines.QichachaPipeline': 300,
    'Qichacha.pipelines.MysqlPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL settings
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DB = 'qichachadb'
MYSQL_USER = 'root'
MYSQL_PWD = '123456'

# Redis settings
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PWD = '123456'
REDIS_DB = 0
REDIS_PROXY_KEY = 'proxy'

# Logging
LOG_LEVEL = 'WARNING'
# LOG_FILE = 'qichacha.log'

# Export encoding
FEED_EXPORT_ENCODING = 'utf-8'
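Note that only the stock QichachaDownloaderMiddleware is active in DOWNLOADER_MIDDLEWARES above; the random User-Agent and random proxy middlewares defined in middlewares.py are commented out. To actually rotate User-Agents and proxies, enable them like this:

DOWNLOADER_MIDDLEWARES = {
    'Qichacha.middlewares.QichachaDownloaderMiddleware': 543,
    'Qichacha.middlewares.RandomUserAgentDownloaderMiddleware': 200,
    'Qichacha.middlewares.RandomProxyDownloaderMiddleware': 250,
}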
6. run.py
from scrapy import cmdline

cmdline.execute('scrapy crawl qichacha'.split())
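With run.py in the project root, the crawl can be started with python run.py (or straight from an IDE), which is equivalent to running scrapy crawl qichacha on the command line.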