• Python爬取网上车市[http://www.cheshi.com/]的数据


    #coding:utf8
    #爬取网上车市[http://www.cheshi.com/]的数据
    import requests, json, time, re, os, sys, time,urllib2,shutil,string
    import threading
    import MySQLdb
    import redis
    from pyquery import PyQuery as pq
    from urlparse import urljoin 
    from selenium import webdriver
    
    # Force the interpreter-wide default string encoding to UTF-8 so implicit
    # str/unicode conversions of the crawled Chinese text do not fail.
    # NOTE(review): Python 2 only -- reload(sys) is the usual workaround to make
    # sys.setdefaultencoding reachable again; neither call exists on Python 3.
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    #读取文件内容
    def getLines(filename):
        """Return every line of ``filename`` as a list of raw byte strings.

        The file is opened in binary mode, so no decoding is performed and each
        element keeps its trailing line terminator.
        """
        # The context manager closes the handle even if reading fails; the
        # previous version left it open until garbage collection.
        with open(filename, 'rb') as file_object:
            return file_object.readlines()
    
    #根据url_name获取url_type_id
    def get_url_type_id(v_url_name):
        """Look up the url_type_id configured for ``v_url_name`` in brand.ini.

        Each line of the file is read as ``url_type_id,url_cate,url_name``,
        for example ``3000,<brand>,<Audi>`` or ``4000,<Audi>,<Audi A6>``.
        Names are compared after stripping surrounding whitespace.

        Returns the first column of the first matching line, or '' when no
        line matches.
        """
        wanted = v_url_name.strip()
        for line in getLines('/home/shutong/crawl/car/script/brand.ini'):
            fields = line.strip().split(',')
            # Blank or truncated lines have no name column; skip them instead
            # of failing with IndexError as the previous version did.
            if len(fields) < 3:
                continue
            if fields[2].strip() == wanted:
                return fields[0]
        return ''
    
    class ResultData(object):
        """One crawled record: a named URL plus the metadata written with it.

        The attributes mirror the constructor arguments one-to-one:
        industry_id, url_type_id, url_name, url_value, web_type_id, web_name
        and date_id.
        """

        def __init__(self, industry_id, url_type_id, url_name, url_value, web_type_id, web_name, date_id):
            self.industry_id = industry_id
            self.url_type_id = url_type_id
            self.url_name = url_name
            self.url_value = url_value
            self.web_type_id = web_type_id
            self.web_name = web_name
            self.date_id = date_id

        def __str__(self):
            """Return the seven fields, in constructor order, joined by commas.

            The previous version returned a tuple (which makes str() raise
            TypeError) and read ``self.self.web_name`` and the non-existent
            class attribute ``ResultData.date_id``.
            """
            fields = (
                self.industry_id,
                self.url_type_id,
                self.url_name,
                self.url_value,
                self.web_type_id,
                self.web_name,
                self.date_id,
            )
            return ','.join(str(field) for field in fields)
    
        
    class Base(object):
        """Base class that persists crawled records to a file or to redis."""

        def __init__(self, dev_prd_flag):
            """Store the run mode: 'prd' appends to files, anything else prints."""
            self.dev_prd_flag = dev_prd_flag

        def _saveContext(self, filename, *name):
            """Join ``name`` with '^' and emit the resulting line.

            Full-width punctuation in the line is replaced by its ASCII
            counterpart. When ``dev_prd_flag`` is not 'prd' the line is only
            printed; otherwise it is appended, newline-terminated, to
            ``filename`` (whose parent directory is created when missing).
            """
            context = '^'.join(str(field) for field in name)
            # NOTE(review): the search strings had been stripped to '' in this
            # copy of the file, which would insert the replacement between
            # every character. Restored as the full-width forms of the ASCII
            # replacements -- confirm against the original script.
            for wide, narrow in (('（', '('), ('）', ')'), ('，', ','), ('：', ':')):
                context = context.replace(wide, narrow)

            if self.dev_prd_flag != 'prd':
                print(context)
                return

            # Tolerate stray whitespace around the configured path.
            filename = filename.strip()
            path = os.path.dirname(filename)
            # ``path`` is '' for a bare file name; os.makedirs('') would raise.
            if path and not os.path.isdir(path):
                try:
                    os.makedirs(path)
                except OSError:
                    # Several crawler threads write to the same directory and
                    # may race to create it; only fail if it is still missing.
                    if not os.path.isdir(path):
                        raise
            # Append mode keeps the lines written by earlier calls.
            with open(filename, 'a') as fp:
                fp.write(context + '\n')

        def saveData(self, filename, ResultData):
            """Persist one record.

            Records with a truthy ``url_type_id`` are written through
            ``_saveContext``; the rest are added to the redis set 'errorList'
            as ``industry_id^url_name^url_value``.
            """
            if ResultData.url_type_id:
                self._saveContext(
                    filename,
                    ResultData.industry_id,
                    ResultData.url_type_id,
                    ResultData.url_name,
                    ResultData.url_value,
                    ResultData.web_type_id,
                    ResultData.web_name,
                    ResultData.date_id,
                )
            else:
                r = redis.Redis(host='192.168.122.140', port=6379, db=0)
                r.sadd('errorList', ResultData.industry_id + '^' + ResultData.url_name + '^' + ResultData.url_value)

        def __str__(self):
            return '保存文件的基类'
    
    class Crawl(Base):
        """Crawler base class: fetches pages with PhantomJS when a driver has
        been started via init_driver(), otherwise with urllib2."""

        # Class-level default; init_driver() sets a live browser per instance.
        driver = None

        def __init__(self, name, dev_prd_flag):
            """Store the crawler name and forward the run mode to Base."""
            super(Crawl, self).__init__(dev_prd_flag=dev_prd_flag)
            self.name = name

        def init_driver(self):
            """Start a PhantomJS browser and keep it on ``self.driver``."""
            ua = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.3 Safari/537.36"
            # Work on a copy: DesiredCapabilities.PHANTOMJS is a dict shared by
            # the whole process, and the previous version modified it in place.
            cap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
            cap["phantomjs.page.settings.resourceTimeout"] = 20000
            cap["phantomjs.page.settings.loadImages"] = True
            cap["phantomjs.page.settings.disk-cache"] = True
            cap["phantomjs.page.settings.userAgent"] = ua
            cap["phantomjs.page.customHeaders.User-Agent"] = ua
            cap["phantomjs.page.customHeaders.Referer"] = "http://tj.ac.10086.cn/login/"
            driver = webdriver.PhantomJS(
                executable_path='/home/shutong/phantomjs/bin/phantomjs',
                desired_capabilities=cap,
                service_args=['--ignore-ssl-errors=true'],
            )
            driver.set_page_load_timeout(60)
            driver.set_script_timeout(60)
            self.driver = driver

        def getHtml(self, url, code='utf-8'):
            """Return the page at ``url`` as text, or '' when fetching fails.

            With a started driver the browser's page source is returned and
            ``code`` is ignored. Otherwise the page is fetched with urllib2 and
            decoded with ``code``; a falsy ``code`` returns the raw body.

            Fetching stays best-effort: ordinary errors are reported on stderr
            and '' is returned. KeyboardInterrupt and SystemExit now propagate
            instead of being swallowed by a ``return`` inside ``finally``.
            """
            try:
                if self.driver:
                    self.driver.get(url)
                    return self.driver.page_source

                headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
                request = urllib2.Request(url, headers=headers)
                response = urllib2.urlopen(request, data=None, timeout=60)
                try:
                    body = response.read()
                finally:
                    # Release the connection even when reading fails.
                    response.close()
                if code:
                    return body.decode(str(code))
                return body
            except Exception as err:
                sys.stderr.write('getHtml failed for %s: %s\n' % (url, err))
                return ''

        def __del__(self):
            """Quit the browser, if one was started, when the crawler is collected."""
            if self.driver:
                self.driver.quit()
                print("浏览器成功关闭")
            else:
                print("浏览器未打开使用")

        def __str__(self):
            return "爬虫基础类"
    
    
    def start_crawl(url):
        """Store the xcar model-page links found on ``url`` in redis.

        The page is fetched as GBK text and every anchor's href is resolved
        against ``url``. Links of the form http://newcar.xcar.com.cn/m<digits>/
        are added to the redis set 'urllist'; every other link is ignored.
        """
        r = redis.Redis(host='192.168.122.140', port=6379, db=0)
        # The previous version read a module-level ``crawl`` that this file
        # never defines, so the call raised NameError.
        # NOTE(review): the crawler name below is a placeholder -- adjust if a
        # specific name is expected.
        crawl = Crawl('start_crawl', 'dev')
        html = crawl.getHtml(url, 'gbk')
        d = pq(html)
        # Compiled once instead of per anchor; pattern and flags are unchanged.
        model_page = re.compile(r'http://newcar.xcar.com.cn/m[0-9]{1,10}/$', re.M|re.I)
        for a in d('a'):
            a = pq(a)
            try:
                url_value = urljoin(url, a.attr('href'))
                if model_page.match(url_value):
                    r.sadd('urllist', url_value)
            except Exception:
                # Best effort: one bad anchor or failed write must not stop
                # the scan of the remaining links.
                continue
    
    
    def start_wscs_crawl(url):
        """Crawl one cheshi.com select-car page and save what it lists.

        For every ``div.list-box`` on the page three kinds of records are
        saved through Crawl.saveData: the brand from the ``.lb`` column, the
        brand link from the ``.rb`` column header (both with url_type_id
        '3000'), and one record per car series whose url_type_id comes from
        get_url_type_id. The output path and date are read from the module
        globals ``filename`` and ``date_id``.
        """
        # Run mode: 'dev' only prints, 'prd' writes to the output file.
        flag = 'prd'
        # Fixed metadata attached to every record: industry id, web type id
        # and site name.
        industry_id = '004004'
        web_type_id = '0'
        web_name = '网上车市'
        crawl = Crawl('网上车市', flag)

        def emit(url_type_id, url_name, url_value):
            # Wrap one (type, name, url) triple in a ResultData and save it.
            record = ResultData(industry_id, url_type_id, url_name, url_value, web_type_id, web_name, date_id)
            crawl.saveData(filename, record)

        page = pq(crawl.getHtml(url))
        for box in page('div').filter('.list-box'):
            box = pq(box)

            # Brand name and link from the left-hand column.
            left = box('div').filter('.lb')
            emit('3000', left.find('span').text(), urljoin(url, left('a').attr('href')))

            # Brand link from the right-hand column header, minus the '>>' marker.
            header_link = box('div').filter('.rb')('dl')('dt')('a')
            emit('3000', header_link.text().replace('>>', ''), urljoin(url, header_link.attr('href')))

            # One record per car series listed under the header.
            for dd in box('div').filter('.rb')('dl')('dd'):
                heading = pq(dd)('div').filter('.con')('h4')
                car_name = heading.text()
                car_url = urljoin(url, heading('a').attr('href'))
                emit(get_url_type_id(car_name), car_name, car_url)
                #制作汽车实体信息
                #品牌 子品牌  车系名称 价位  图片url  网站名称 
    
    
    
    
    
    #多线程启动
    def start_mutli_crawl():
        """Crawl the A-Z select-car index pages concurrently.

        One thread per letter runs start_wscs_crawl on that letter's page; the
        call returns once every thread has finished.
        """
        # ascii_uppercase is always A-Z; string.uppercase depends on the locale
        # and could add letters for which no index page exists.
        urls = [
            'http://product.cheshi.com/static/selectcar/%s.html?t=1519713137030' % (word)
            for word in string.ascii_uppercase
        ]
        threads = [
            threading.Thread(target=start_wscs_crawl, args=(page_url,))
            for page_url in urls
        ]
        # Start every thread before joining any, so the pages are fetched in
        # parallel rather than one after another.
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()
    
    #filename = '/home/shutong/crawl/car/script/wscs.csv'
    #date_id = '20180227'
    
    if __name__ == '__main__':
        # Usage: <script> <date_id> <filename>
        # Fail with a usage message instead of a bare IndexError when an
        # argument is missing.
        if len(sys.argv) < 3:
            sys.stderr.write('usage: %s <date_id> <filename>\n' % sys.argv[0])
            sys.exit(1)
        # Both names stay module-level globals: start_wscs_crawl reads them
        # directly rather than taking them as parameters.
        date_id = sys.argv[1]
        filename = sys.argv[2]
        # Launch the multi-threaded crawl.
        start_mutli_crawl()
  • 相关阅读:
    IPv6隧道技术——6to4实验分析
    IPV6地址解析与DAD机制实验分析
    交换机的高级特性
    组播IGMP实验分析
    BGP实验分析(二)
    BGP实验分析(一)
    路由策略实验分析(二)
    路由策略实验分析(一)
    一线互联网拼多多、饿了么、蚂蚁金服、哈啰出行、携程、饿了么、2345、百度等一些Java面试题
    Java中的匿名内部类
  • 原文地址:https://www.cnblogs.com/Jims2016/p/8554928.html
Copyright © 2020-2023  润新知