• python 爬虫爬取历年双色球开奖信息


     目前写的这些爬虫都是些静态网页,对于一些高级网页(像经过JS渲染过的页面),目前技术并不能解决,自己也是在慢慢学习过程中,如有错误,欢迎指正;

    对于前端知识本人并不懂,过程中如果涉及到前端知识,也是百度而来,毕竟爬虫还是和前端页面打交道多,前端知识还是要多学习; 

    此篇还是继续静态页面,更换了不同的内容,以及涉及到多个python 模块和自己二次封装的模块,个人感觉这些模块不使用在爬虫方面也是很有用的;

    第一部分,封装了自带模块 logging,其中使用了 getpass 模块,用来记录日志的用户名,都是些简单的使用;关于注释,本来已写好,但并未上传到 git,导致此次上传的代码没有注释,下次注意,哈哈;

    个人建议:在学习python 过程中多练习写代码,在写的过程中去理解其中的用法;

    #!/usr/bin/env python
    #coding:utf-8
    #author chenjisheng
    #date 20171129
    import logging
    import getpass
    
    
    class MyLog(object):
        "this class will create log"
        def __init__(self):
            user = getpass.getuser()          
            self.logger = logging.getLogger(user)
            self.logger.setLevel(logging.DEBUG)
            logFile = './progress.log'
            formatter = logging.Formatter(
                '%(asctime) -12s %(levelname)-8s %(name) -10s %(message)-12s'
            )
            '''logfile output screen and files'''
            logHand = logging.FileHandler(logFile)
            logHand.setFormatter(formatter)
            logHand.setLevel(logging.ERROR)
            logHandt = logging.StreamHandler()
            logHandt.setFormatter(formatter)
            self.logger.addHandler(logHand)
            self.logger.addHandler(logHandt)
    
            '''five level and five functions '''
        def debug(self,msg):
            self.logger.debug(msg)
    
        def info(self,msg):
            self.logger.info(msg)
    
        def warn(self,msg):
            self.logger.warning(msg)
    
        def error(self,msg):
            self.logger.error(msg)
    
        def critical(self,msg):
            self.logger.critical(msg)
    
    if __name__ == "__main__":
        # Smoke test: emit one message at each of the five severity levels.
        mylog = MyLog()
        for emit, text in (
            (mylog.debug, 'i am debug'),
            (mylog.info, 'i am info'),
            (mylog.warn, 'i am warning'),
            (mylog.error, 'i am error'),
            (mylog.critical, 'i am critical'),
        ):
            emit(text)
    

     第二部分:使用了 re,urllib2,xlwt,bs4,sys 模块;xlwt 模块在之前的博客里已写过;bs4 模块大名鼎鼎,不过多解释,至于为什么用它,因为其简单,其它的爬虫模块也不会;也在学习当中;

    #!/usr/bin/env python
    #coding:utf-8
    """Created on 2017-11-29"""
    
    import re
    import urllib2
    import xlwt
    from bs4 import BeautifulSoup
    from myLog import MyLog as mylog
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    class DoubleColorBallItem(object):
        """One double-color-ball draw record scraped from a result-list row.

        NOTE: 'bule' is a typo for 'blue', preserved deliberately because
        existing callers read item.bule.
        """

        def __init__(self):
            # Instance attributes (rather than shared class attributes), so
            # values set on one item can never leak into another.
            self.date = None          # draw date
            self.order = None         # draw issue number
            self.red1 = None          # the six red balls, in page order
            self.red2 = None
            self.red3 = None
            self.red4 = None
            self.red5 = None
            self.red6 = None
            self.bule = None          # blue ball ('bule' typo kept -- see class docstring)
            self.money = None         # prize-pool amount
            self.firstPrize = None    # first-prize payout
            self.secondPrize = None   # second-prize payout
    
    class GetDoubleColorBallNumber(object):
        """capture BallNumbers"""
    
        def __init__(self):
            self.urls = []
            self.log = mylog()
            self.getUrls()
            self.items = self.spider(self.urls)
            self.pipelines(self.items,self)
    
    
        def getUrls(self):
            URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
            htmlContent = self.getResponseContent(URL)
            soup = BeautifulSoup(htmlContent,'lxml')
            tag = soup.find_all(re.compile('p'))[-1]
            # pages = tag.strong.get_text()
            pages = 2
            for i in xrange(1,int(pages)+1):
                url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_'+str(i)+'.html'
                self.urls.append(url)
                self.log.info(u'append URL:%s to URLS 
    '%url)
    
        def getResponseContent(self,url):
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                }
                req = urllib2.Request(url,headers=headers)
                response = urllib2.urlopen(req)
            except Exception,e:
                self.log.error(u'return datas failed URL:%s
    '%url)
            else:
                self.log.info(u'return datas successfuly URL:%s
    '%url)
                return response.read()
    
        def spider(self,urls):
            items = []
            for url in urls:
                htmlContent = self.getResponseContent(url)
                soup = BeautifulSoup(htmlContent,'lxml')
                tags = soup.find_all('tr',attrs={})
                for tag in tags:
                    if tag.find('em'):
                        item = DoubleColorBallItem()
                        tagTd = tag.find_all('td')
                        item.date = tagTd[0].get_text()
                        item.order = tagTd[1].get_text()
                        tagEm = tagTd[2].find_all('em')
                        item.red1 = tagEm[0].get_text()
                        item.red2 = tagEm[1].get_text()
                        item.red3 = tagEm[2].get_text()
                        item.red4 = tagEm[3].get_text()
                        item.red5 = tagEm[4].get_text()
                        item.red6 = tagEm[5].get_text()
                        item.bule = tagEm[6].get_text()
                        item.money = tagTd[3].find('strong').get_text()
                        item.firstPrize = tagTd[4].find('strong').get_text()
                        item.secondPrize = tagTd[5].find('strong').get_text()
                        items.append(item)
                        self.log.info(u'get date:%s datas OK
    '%item.date)
            return items
    
        def pipelines(self,items,nu):
            # fileName = 'DoubleBall.txt'
            # with open(fileName,'w') as fp:
            #     for item in items:
            #         fp.write('%s %s 	 %s %s %s %s %s %s  	 %s 	  %s   %s 
    '%(item.date,item.order,item.red1,item.red2,item.red3,item.red4,item.red5,
            #                                                                   item.red6,item.bule,item.firstPrize,item.secondPrize))
            #         self.log.info(u'write date:%s OK '%item.date)
            W = xlwt.Workbook('utf-8')
            ws = W.add_sheet(u"双色球记录")
            # ws.col(1).width = 6666
            # ws.col(2).width = 3333
            ws.write(0,1,label=u"时间")
            ws.write(0,2,label=u"期号")
            ws.write(0,3, label=u"红色1")
            ws.write(0,4, label=u"红色2")
            ws.write(0,5, label=u"红色3")
            ws.write(0,6, label=u"红色4")
            ws.write(0,7, label=u"红色5")
            ws.write(0,8, label=u"红色6")
            ws.write(0,9, label=u"蓝色")
            ws.write(0,10, label=u"一等奖")
            ws.write(0,11, label=u"二等奖")
            nu = 1
            for item in items:
                ws.write(nu,1,label=item.date)
                ws.write(nu,2,label=item.order)
                ws.write(nu,3,label=item.red1)
                ws.write(nu,4,label=item.red2)
                ws.write(nu,5,label=item.red3)
                ws.write(nu,6,label=item.red4)
                ws.write(nu,7,label=item.red5)
                ws.write(nu,8,label=item.red6)
                ws.write(nu,9,label=item.bule)
                ws.write(nu,10,label=item.firstPrize)
                ws.write(nu,11,label=item.secondPrize)
                nu += 1
            W.save(u"双色球记录表.xls")
    if __name__ == '__main__':
        # Entry point: constructing the spider runs the whole
        # crawl-parse-export pipeline from its __init__.
        GDCBN = GetDoubleColorBallNumber()
    

     以上部分,也是学习了别人经验,也从代码中学到了不少知识,愿本文也能给你带来灵感;

  • 相关阅读:
    hdu 4114 Disney's FastPass 状压dp
    lightoj 1381
    bzoj 2428: [HAOI2006]均分数据 随机化
    bzoj 3969: [WF2013]Low Power 二分
    套题:wf2013 (1/8)
    hdu 4119 Isabella's Message 模拟题
    hdu 4118 Holiday's Accommodation 树形dp
    UESTC 2015dp专题 N 导弹拦截 dp
    UESTC 2015dp专题 j 男神的约会 bfs
    UESTC 2015dp专题 H 邱老师选妹子 数位dp
  • 原文地址:https://www.cnblogs.com/Mail-maomao/p/7955389.html
Copyright © 2020-2023  润新知