该脚本的目的:获取博客的排名和积分,将抓取时间,排名,积分存入数据库,然后把最近的积分和排名信息进行绘图,查看积分或者排名的变化情况。
整个脚本的流程:使用python3编写,利用selenium获取网页的信息,使用re正则表达式解析积分score和排名rank,用pymysql连接mysql数据库,最后利用matplotlib进行绘图。
首先创建db: xiaoshitou
创建表blog_rank:
-- One row per scrape of the blog page: rank, score and the time the row was added.
-- All three values are stored as strings (varchar(255)), not as numeric/date types.
CREATE TABLE `blog_rank` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
-- Keep the backticks on this column: RANK is a reserved word in MySQL 8.0+.
`rank` varchar(255) NOT NULL DEFAULT '' COMMENT '排名',
`score` varchar(255) NOT NULL DEFAULT '' COMMENT '积分',
-- operation_mysql.py writes datetime.now().timestamp() (Unix seconds) into this column.
`create_time` varchar(255) NOT NULL DEFAULT '' COMMENT '添加时间',
PRIMARY KEY (`id`)
-- NOTE(review): AUTO_INCREMENT=27 looks like a leftover from dumping an existing table,
-- so a freshly created table would start its ids at 27 -- TODO confirm this is intended.
) ENGINE=InnoDB AUTO_INCREMENT=27 DEFAULT CHARSET=utf8;
现在来看下绘图的结果:
数据库表,blog_rank表中存的数据:
下面就来看实现过程:
1、该文件是利用pymysql来连接数据库,新增和查询数据的(operation_mysql.py)
# coding=utf-8
"""Small pymysql helper for writing to and reading from the ``blog_rank`` table."""
import datetime

import pymysql as MySQLdb

host = '127.0.0.1'
user = 'root'
passwd = '123456'
port = 3306
db = 'xiaoshitou'


class OperationMySQL(object):
    """Single-use connection wrapper around the ``blog_rank`` table.

    ``insert_data`` and ``select_data`` both close the connection before they
    return, so create a fresh instance for every operation.
    """

    def __init__(self):
        """Open a connection and a cursor using the module-level settings.

        Raises:
            Exception: whatever ``pymysql.connect`` raised; it is printed and
                then re-raised so callers never get a half-built instance.
        """
        try:
            self.conn = MySQLdb.connect(
                host=host,
                port=port,
                user=user,
                passwd=passwd,
                db=db,
                charset='utf8',
            )
            self.cur = self.conn.cursor()
        except Exception as e:
            # Format the exception instead of concatenating it: ``str + Exception``
            # raises TypeError and would hide the real connection error.
            print('Connect MySQL Database Fail: {0}'.format(e))
            raise

    def _close_connect(self):
        """Close the cursor, then the connection (even if the cursor close fails)."""
        try:
            self.cur.close()
        finally:
            self.conn.close()

    def insert_data(self, data):
        """Insert one row stamped with the current Unix time, then close.

        Args:
            data: mapping that provides the ``'rank'`` and ``'score'`` values.
        """
        # Placeholders let the driver quote the values; `rank` needs backticks
        # because RANK is a reserved word in MySQL 8.0+.
        sql = 'insert into blog_rank (`rank`,score,create_time) values (%s,%s,%s)'
        created = str(datetime.datetime.now().timestamp())
        try:
            self.cur.execute(sql, (data['rank'], data['score'], created))
            self.conn.commit()
        finally:
            self._close_connect()

    def select_data(self, sql=None):
        """Run *sql* and return the rows as dictionaries, then close.

        Args:
            sql: query to execute. When omitted, every row of ``blog_rank`` is
                read, ordered by ``create_time``.

        Returns:
            A list of dicts keyed ``'rank'``, ``'score'`` and ``'create_time'``;
            the columns of each row are mapped onto those keys in order.
        """
        if sql is None:
            sql = 'select `rank`,score,create_time from blog_rank order by create_time'
        try:
            self.cur.execute(sql)
            rows = self.cur.fetchall()
        finally:
            self._close_connect()
        headers = ('rank', 'score', 'create_time')
        return [dict(zip(headers, row)) for row in rows]


if __name__ == '__main__':
    OperationMySQL().select_data()
2、get_my_blog_score.py,这个文件包含:获取网页内容,解析排名和积分,将抓取的数据存入数据库,读取数据库进行绘图
# coding=utf-8
"""Scrape the cnblogs score and rank, store them in MySQL and chart the score history."""
try:
    import requests
except ImportError:
    # Best-effort bootstrap: only a missing package should trigger the install.
    import os
    os.system('pip install requests')
    import requests
import re
from time import sleep

from selenium import webdriver

from operation_mysql import OperationMySQL


class GetMyBlogScore:
    """Fetch the cnblogs score and rank, persist them and plot the score history."""

    def __init__(self):
        # Page HTML; filled in by _get_blog_content().
        self.content = ''

    def _get_blog_content(self):
        """Load the blog page in Firefox and keep its HTML in ``self.content``."""
        url = "http://www.cnblogs.com/xiaoshitoutest"
        driver = webdriver.Firefox()
        try:
            sleep(1)
            driver.get(url)
            # Presumably gives the page time to finish rendering before the
            # HTML is captured.
            sleep(1)
            self.content = driver.page_source
        finally:
            # Always shut the browser down, even when loading the page fails.
            driver.quit()

    def _match_content(self, compile_str_args):
        """Return the digits captured by the first match of a pattern.

        Args:
            compile_str_args: regular expression with exactly one capture
                group, searched in ``self.content``.

        Returns:
            The first captured text with every non-digit character removed.

        Raises:
            ValueError: if the pattern does not match the page, or the
                captured text contains no digits.
        """
        matches = re.findall(compile_str_args, self.content)
        if not matches:
            raise ValueError(
                'Pattern not found in page content: {0}'.format(compile_str_args))
        digits = re.sub(r'\D', '', matches[0])
        if not digits:
            raise ValueError(
                'No digits in matched text for pattern: {0}'.format(compile_str_args))
        return digits

    def _save_database(self, data):
        """Insert *data* into the database when it is a dict; report the outcome."""
        if isinstance(data, dict):
            OperationMySQL().insert_data(data)
            print('Insert Data Success.')
        else:
            print('The data is invalid.')

    def _show_map(self):
        """Plot the stored scores over time and save ``./rank_score.png``."""
        # Imported here so matplotlib is only loaded when a chart is drawn.
        import matplotlib.pyplot as plt
        from datetime import datetime
        from matplotlib.dates import DateFormatter, DayLocator

        rows = OperationMySQL().select_data()

        times = []
        scores = []
        for row in rows:
            try:
                moment = datetime.fromtimestamp(float(row['create_time']))
                value = int(row['score'])
            except (TypeError, ValueError):
                # Skip rows whose stored text is not numeric instead of
                # aborting the whole chart.
                continue
            times.append(moment)
            # Scores come back as strings; plotting them unconverted would
            # give a categorical axis rather than a numeric one.
            scores.append(value)

        plt.rcParams['font.sans-serif'] = ['FangSong']
        fig, ax = plt.subplots()
        ax.xaxis.set_major_locator(DayLocator())
        ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
        # plot() accepts datetime values directly, so the deprecated
        # plot_date() and the string round-trip are not needed.
        ax.plot(times, scores, '--')
        ax.set_xlabel('日期')
        ax.set_ylabel('积分')
        ax.set_title('博客园排名--积分')
        fig.autofmt_xdate()
        plt.savefig('./rank_score.png')
        plt.close(fig)

    def run(self):
        """Scrape the page, store the score and rank, then redraw the chart."""
        # [\s\S] matches any character including newlines, so the capture may
        # span several lines of HTML.
        score = r'<li.*?class="liScore">([\s\S]*?)</li>'
        rank = r'<li.*?class="liRank">([\s\S]*?)</li>'
        self._get_blog_content()
        scores = self._match_content(score)
        ranks = self._match_content(rank)
        result = {'score': scores, 'rank': ranks}
        self._save_database(result)
        self._show_map()


if __name__ == '__main__':
    GetMyBlogScore().run()
直接运行该文件,就会在当前目录下生成一个rank_score.png的图片,就是关于积分的变化图。
开始那张是:时间--积分的绘图,我再放一张:积分--排名变化图。