#coding:utf8
# Crawl car data from Cheshi (网上车市) [http://www.cheshi.com/]
import requests, json, time, re, os, sys, urllib2, shutil, string
import threading
import MySQLdb
import redis
from pyquery import PyQuery as pq
from urlparse import urljoin
from selenium import webdriver

# Force the default encoding to utf-8 (Python 2)
reload(sys)
sys.setdefaultencoding("utf-8")

# Read all lines from a file
def getLines(filename):
    with open(filename, 'rb') as file_object:
        return file_object.readlines()

# Look up url_type_id by url_name in the brand mapping file.
# brand.ini columns: url_type_id,url_cate,url_name
# e.g. "3000,品牌,奥迪" (brand level) and "4000,奥迪,奥迪A6" (series level)
def get_url_type_id(v_url_name):
    url_type_id = ''
    for line in getLines('/home/shutong/crawl/car/script/brand.ini'):
        line = line.strip()
        url_name = line.split(',')[2]
        if v_url_name.strip() == url_name.strip():
            url_type_id = line.split(',')[0]
            return url_type_id
    return url_type_id

class ResultData(object):
    '''Container for one output record'''
    def __init__(self, industry_id, url_type_id, url_name, url_value, web_type_id, web_name, date_id):
        self.industry_id = industry_id
        self.url_type_id = url_type_id
        self.url_name = url_name
        self.url_value = url_value
        self.web_type_id = web_type_id
        self.web_name = web_name
        self.date_id = date_id

    def __str__(self):
        return '^'.join([self.industry_id, self.url_type_id, self.url_name, self.url_value,
                         self.web_type_id, self.web_name, self.date_id])

class Base(object):
    '''Base class that writes records to a file'''
    def __init__(self, dev_prd_flag):
        self.dev_prd_flag = dev_prd_flag

    # Join fields with '^' and print the record (dev) or append it to the file (prd)
    def _saveContext(self, filename, *name):
        sep = '^'
        context = name[0]
        for i in name[1:]:
            context = context + sep + str(i)
        # Normalize full-width punctuation to its ASCII equivalents
        context = str(context).replace('（', '(').replace('）', ')').replace('，', ',').replace('：', ':')
        if self.dev_prd_flag != 'prd':
            print context
        else:
            # Strip leading/trailing whitespace from the file path
            filename = filename.strip()
            # Create the target directory if it does not exist
            path = os.path.dirname(filename)
            if not os.path.exists(path):
                os.makedirs(path)
            # Append one record per line
            fp = open(filename, 'a')
            fp.write(context + '\n')
            fp.close()

    def saveData(self, filename, result_data):
        if result_data.url_type_id:
            self._saveContext(filename, result_data.industry_id, result_data.url_type_id,
                              result_data.url_name, result_data.url_value,
                              result_data.web_type_id, result_data.web_name, result_data.date_id)
        else:
            # Records without a url_type_id are pushed to Redis for later review
            r = redis.Redis(host='192.168.122.140', port=6379, db=0)
            r.sadd('errorList', result_data.industry_id + '^' + result_data.url_name + '^' + result_data.url_value)

    def __str__(self):
        return 'Base class for saving records'

class Crawl(Base):
    '''Crawler base class'''
    driver = None

    def __init__(self, name, dev_prd_flag):
        super(Crawl, self).__init__(dev_prd_flag='dev')
        self.dev_prd_flag = dev_prd_flag
        self.name = name
        #self.driver = self.init_driver()

    # Start a headless PhantomJS browser; optional, getHtml falls back to urllib2
    def init_driver(self):
        ua = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.3 Safari/537.36"
        cap = webdriver.DesiredCapabilities.PHANTOMJS
        cap["phantomjs.page.settings.resourceTimeout"] = 20000
        cap["phantomjs.page.settings.loadImages"] = True
        cap["phantomjs.page.settings.disk-cache"] = True
        cap["phantomjs.page.settings.userAgent"] = ua
        cap["phantomjs.page.customHeaders.User-Agent"] = ua
        cap["phantomjs.page.customHeaders.Referer"] = "http://tj.ac.10086.cn/login/"
        driver = webdriver.PhantomJS(executable_path='/home/shutong/phantomjs/bin/phantomjs',
                                     desired_capabilities=cap,
                                     service_args=['--ignore-ssl-errors=true'])
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        self.driver = driver

    # Fetch the page source via PhantomJS when a driver is running, otherwise via urllib2
    def getHtml(self, url, code='utf-8'):
        html = ''
        try:
            if self.driver:
                self.driver.get(url)
                html = self.driver.page_source
            else:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
                request = urllib2.Request(url, headers=headers)
                response = urllib2.urlopen(request, data=None, timeout=60)
                if code:
                    html = unicode(response.read(), str(code))
        except:
            pass
        finally:
            return html

    # Destructor: close the browser if one was started
    def __del__(self):
        if self.driver:
            self.driver.quit()
            print "Browser closed"
        else:
            print "No browser was started"

    def __str__(self):
        return "Crawler base class"

# Legacy helper for newcar.xcar.com.cn; not called anywhere in this script
def start_crawl(url):
    # Connect to Redis
    r = redis.Redis(host='192.168.122.140', port=6379, db=0)
    urllist = []
    crawl = Crawl('xcar', 'dev')  # local instance; the original relied on an undefined global
    html = crawl.getHtml(url, 'gbk')
    d = pq(html)
    for a in d('a'):
        a = pq(a)
        try:
            url_value = urljoin(url, a.attr('href'))
            name = a.text()
            # Series pages look like http://newcar.xcar.com.cn/162/
            if re.match(r'http://newcar.xcar.com.cn/[0-9]{1,10}/$', url_value, re.M | re.I):
                pass
            elif re.match(r'http://newcar.xcar.com.cn/m[0-9]{1,10}/$', url_value, re.M | re.I):
                # Store matching pages in Redis
                r.sadd('urllist', url_value)
        except:
            pass
    for index in list(set(urllist)):
        print index

def start_wscs_crawl(url):
    # Run mode: 'dev' prints records, 'prd' appends them to the output file
    flag = 'prd'
    # Industry id for the automotive vertical
    industry_id = '004004'
    # PC/mobile flag
    web_type_id = '0'
    # Site name
    web_name = '网上车市'
    crawl = Crawl('网上车市', flag)
    # Optionally start the headless browser
    #crawl.init_driver()
    html = crawl.getHtml(url)
    d = pq(html)
    for div in d('div').filter('.list-box'):
        div = pq(div)
        # Brand name and URL (left-hand block)
        brand = div('div').filter('.lb').find('span').text()
        brand_url = urljoin(url, div('div').filter('.lb')('a').attr('href'))
        url_type_id = '3000'
        url_name = brand
        url_value = brand_url
        # Save the brand record
        resultData = ResultData(industry_id, url_type_id, url_name, url_value, web_type_id, web_name, date_id)
        crawl.saveData(filename, resultData)

        # Brand row in the right-hand block
        brand = div('div').filter('.rb')('dl')('dt')('a').text().replace('>>', '')
        brand_url = urljoin(url, div('div').filter('.rb')('dl')('dt')('a').attr('href'))
        url_type_id = '3000'
        url_name = brand
        url_value = brand_url
        resultData = ResultData(industry_id, url_type_id, url_name, url_value, web_type_id, web_name, date_id)
        crawl.saveData(filename, resultData)

        # Car series under the brand
        for dd in div('div').filter('.rb')('dl')('dd'):
            dd = pq(dd)
            car_name = dd('div').filter('.con')('h4').text()
            car_url = urljoin(url, dd('div').filter('.con')('h4')('a').attr('href'))
            url_type_id = get_url_type_id(car_name)
            url_name = car_name
            url_value = car_url
            # Save the car-series record
            resultData = ResultData(industry_id, url_type_id, url_name, url_value, web_type_id, web_name, date_id)
            crawl.saveData(filename, resultData)

    # TODO: build full car entities (brand, sub-brand, series name, price, image url, site name)

# Crawl every A-Z index page, one thread per page
def start_mutli_crawl():
    urls = []
    for word in string.uppercase:
        #url = 'http://www.autohome.com.cn/grade/carhtml/%s.html' % (word)
        url = 'http://product.cheshi.com/static/selectcar/%s.html?t=1519713137030' % (word)
        urls.append(url)
    # Create one thread per URL
    threads = []
    for i in range(len(urls)):
        t = threading.Thread(target=start_wscs_crawl, args=(urls[i],))
        threads.append(t)
    # Start all threads, then wait for them to finish
    for i in range(len(urls)):
        threads[i].start()
    for i in range(len(urls)):
        threads[i].join()

#filename = '/home/shutong/crawl/car/script/wscs.csv'
#date_id = '20180227'
date_id = sys.argv[1]
filename = sys.argv[2]
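# Example invocation (a sketch: the script filename below is illustrative; the date and
# output path are the sample values from the commented-out defaults above):
#   python wscs_crawl.py 20180227 /home/shutong/crawl/car/script/wscs.csv
# argv[1] is the date_id stamped onto every record; argv[2] is the output file path.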
# Single-page test:
#url = 'http://product.cheshi.com/static/selectcar/B.html?t=1519713137030'
#start_wscs_crawl(url)

# Launch the multi-threaded crawl
start_mutli_crawl()