• 获取博客积分排名,存入数据库,读取数据进行绘图(python,selenium,matplotlib)


          该脚本的目的:获取博客的排名和积分,将抓取时间,排名,积分存入数据库,然后把最近的积分和排名信息进行绘图,查看积分或者排名的变化情况。

          整个脚本的流程:脚本使用python3编写,利用selenium获取网页内容,使用re正则表达式解析积分score和排名rank,用pymysql连接mysql数据库并保存数据,最后利用matplotlib进行绘图。

      首先创建db: xiaoshitou

      创建表blog_rank: 

    -- One row per scrape: the blog's rank and score at the moment of capture.
    CREATE TABLE `blog_rank` (
    `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
    -- rank and score are stored as strings; the script inserts the digits it scraped.
    `rank` varchar(255) NOT NULL DEFAULT '' COMMENT '排名',
    `score` varchar(255) NOT NULL DEFAULT '' COMMENT '积分',
    -- The script writes datetime.now().timestamp() (Unix epoch seconds) here.
    `create_time` varchar(255) NOT NULL DEFAULT '' COMMENT '添加时间',
    PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=27 DEFAULT CHARSET=utf8;

      现在来看下绘图的结果:

        数据库表,blog_rank表中存的数据:

        下面就来看实现过程:

        1、该文件是利用pymysql来连接数据库,新增和查询数据的(operation_mysql.py)

    #coding=utf-8
    
    import pymysql as MySQLdb
    import datetime
    
    # Connection settings read by OperationMySQL.__init__.
    host = '127.0.0.1'
    user = 'root'
    # NOTE(review): credentials are hard-coded in source; presumably fine for a
    # local demo only -- confirm before reusing this module elsewhere.
    passwd = '123456'
    port = 3306
    # Database (schema) that contains the blog_rank table.
    db = 'xiaoshitou'
    class OperationMySQL(object):
        """Small helper around one pymysql connection to the ``blog_rank`` table.

        The connection is opened in ``__init__``. Both ``insert_data`` and
        ``select_data`` close it when they finish, so each instance serves
        exactly one operation.
        """

        def __init__(self):
            """Connect using the module-level ``host``/``port``/``user``/``passwd``/``db``.

            Raises:
                Exception: the driver's own error, re-raised after it has been
                    printed, so callers never get a half-initialised object.
            """
            try:
                self.conn = MySQLdb.connect(host=host,
                                            port=port,
                                            user=user,
                                            passwd=passwd,
                                            db=db,
                                            charset='utf8')
                self.cur = self.conn.cursor()
            except Exception as e:
                # Format the exception instead of concatenating it: str + Exception
                # raises TypeError and would hide the real connection error.
                print('Connect MySQL Database Fail: {0}'.format(e))
                raise

        def _close_connect(self):
            """Close the cursor and then the connection."""
            self.cur.close()
            self.conn.close()

        def insert_data(self, data):
            """Insert one ``blog_rank`` row stamped with the current time.

            Args:
                data: mapping with the keys ``'rank'`` and ``'score'``.

            Raises:
                KeyError: if ``data`` lacks ``'rank'`` or ``'score'``.

            The connection is closed afterwards, even when the insert fails.
            """
            # `rank` is a reserved word in MySQL 8.0, hence the backticks.
            # Values are bound by the driver rather than pasted into the SQL
            # text, so quotes in scraped data cannot break or inject into it.
            sql = ('insert into blog_rank (`rank`,score,create_time) '
                   'values (%s,%s,%s)')
            try:
                created = str(datetime.datetime.now().timestamp())
                self.cur.execute(sql, (data['rank'], data['score'], created))
                self.conn.commit()
            finally:
                self._close_connect()

        def select_data(self, sql=None):
            """Run a query and return its rows as a list of dicts.

            Args:
                sql: query to execute. Defaults to selecting rank, score and
                    create_time from ``blog_rank`` ordered by create_time.

            Returns:
                One dict per row, keyed ``'rank'``, ``'score'``, ``'create_time'``
                in column order. A custom ``sql`` should therefore select its
                columns in that order.

            The connection is closed afterwards, even when the query fails.
            """
            if sql is None:
                sql = ('select `rank`,score,create_time from blog_rank '
                       'order by create_time')
            try:
                self.cur.execute(sql)
                rows = self.cur.fetchall()
            finally:
                self._close_connect()
            headers = ('rank', 'score', 'create_time')
            return [dict(zip(headers, row)) for row in rows]
    
    
    # When run directly, execute the default query. The rows are returned by
    # select_data() but not printed or otherwise used here.
    if __name__ == '__main__':
        OperationMySQL().select_data()

      2、get_my_blog_score.py,这个文件包含:获取网页内容,解析排名和积分,将抓取的数据存入数据库,读取数据库进行绘图

    # coding=utf-8
    # Import requests, installing it with pip on first use if it is missing.
    # Only ImportError triggers the install: a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit and unrelated import-time errors.
    try:
        import requests
    except ImportError:
        import os
        os.system('pip install requests')
        import requests
    import re
    from selenium import webdriver
    from time import sleep
    from operation_mysql import OperationMySQL
    
    
    class GetMyBlogScore:
        """Scrape a cnblogs score and rank, store them, and chart the score history."""

        def __init__(self):
            # Page source; filled in by _get_blog_content().
            self.content = ''

        def _get_blog_content(self):
            """Load the blog page in Firefox and keep its source in ``self.content``.

            The browser is always shut down, even when loading the page fails.
            """
            url = "http://www.cnblogs.com/xiaoshitoutest"
            driver = webdriver.Firefox()
            try:
                sleep(1)
                driver.get(url)
                # Presumably gives the page time to finish rendering before
                # the source is read -- the original used the same fixed delay.
                sleep(1)
                self.content = driver.page_source
            finally:
                driver.quit()

        def _match_content(self, compile_str_args):
            """Return the digits of the first match of a pattern in ``self.content``.

            Args:
                compile_str_args: regular expression with one capture group.

            Returns:
                The first captured text with every non-digit character removed.

            Raises:
                IndexError: if the pattern does not match the page at all.
            """
            matches = re.findall(compile_str_args, self.content)
            if not matches:
                raise IndexError(
                    'No match for pattern: {0}'.format(compile_str_args))
            # \D (non-digit) strips labels and whitespace around the number.
            return re.sub(r'\D', '', matches[0])

        def _save_database(self, data):
            """Insert ``data`` into the database when it is a dict; report the outcome.

            Args:
                data: expected to be a dict with ``'score'`` and ``'rank'`` keys.
            """
            if isinstance(data, dict):
                OperationMySQL().insert_data(data)
                print('Insert Data Success.')
            else:
                print('The data is invalid.')

        def _show_map(self):
            """Read all stored rows and save a date/score chart to ``./rank_score.png``."""
            import matplotlib.pyplot as plt
            from datetime import datetime
            from matplotlib.dates import DateFormatter
            import matplotlib.dates as dates

            datas = OperationMySQL().select_data()
            # create_time holds epoch seconds as text; matplotlib plots datetime
            # objects directly, so no string round trip is needed.
            times = [datetime.fromtimestamp(float(row['create_time']))
                     for row in datas]
            # Scores come back as strings; convert them so the y axis is
            # numeric instead of an unordered categorical axis.
            scores = [float(row['score']) for row in datas]

            # Font able to render the Chinese axis labels and title.
            plt.rcParams['font.sans-serif'] = ['FangSong']

            fig, ax = plt.subplots()
            ax.xaxis.set_major_locator(dates.DayLocator())
            ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))

            ax.plot(times, scores, '--')
            ax.set_xlabel('日期')
            ax.set_ylabel('积分')
            ax.set_title('博客园排名--积分')
            fig.autofmt_xdate()
            fig.savefig('./rank_score.png')
            plt.close(fig)

        def run(self):
            """Fetch the page, extract score and rank, store them, then redraw the chart."""
            # [\s\S] matches any character including newlines inside the <li>.
            score = r'<li.*?class="liScore">([\s\S]*?)</li>'
            rank = r'<li.*?class="liRank">([\s\S]*?)</li>'
            self._get_blog_content()
            scores = self._match_content(score)
            ranks = self._match_content(rank)
            result = {'score': scores, 'rank': ranks}
            self._save_database(result)
            self._show_map()
    
    
    # When run directly: scrape the page, store the result and redraw the chart.
    if __name__ == '__main__':
        GetMyBlogScore().run()

      直接运行该文件,就会在当前目录下生成一个rank_score.png的图片,就是关于积分的变化图。

      开始那张是:时间--积分的绘图,我再放一张:积分--排名变化图。

  • 相关阅读:
    数学考试
    奇♂妙拆分
    11.25
    11.21
    11.20
    11.19
    11.18
    11.15
    11.14作业
    11.14
  • 原文地址:https://www.cnblogs.com/xiaoshitoutest/p/6486131.html
Copyright © 2020-2023  润新知