用 Python 解析博客园 RSS 订阅的 XML 文本
源码
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Author:Jruing
# FileName:RSS
# DateTime:2020/5/29 13:59
# SoftWare: PyCharm
from xml.dom.minidom import parseString
import requests
class RSS():
    """Fetch an Atom feed (e.g. a cnblogs blog subscription) and extract its entries.

    Args:
        rss_url: URL of the feed document.
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    def __init__(self, rss_url, timeout=10):
        self.rss_url = rss_url
        self.timeout = timeout

    def get_context(self):
        """Download the feed at ``self.rss_url`` and parse it.

        Returns:
            The value returned by ``parse_context`` for the downloaded feed.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx HTTP status.
        """
        response = requests.get(self.rss_url, timeout=self.timeout)
        response.raise_for_status()
        # Pass the raw bytes so the XML parser honours the encoding declared
        # in the document instead of the charset guessed from HTTP headers.
        return self.parse_context(response.content)

    def parse_context(self, response):
        """Parse the feed, print the feed author and every entry.

        Args:
            response: The feed document as ``str`` or ``bytes``.

        Returns:
            The dict describing the last ``<entry>`` of the feed, or ``None``
            when the feed has no entries. Use ``parse_entries`` to get all
            entries as a list.
        """
        feed_author, entries = self._parse_feed(response)
        if feed_author:
            print(feed_author)
        for entry in entries:
            print(entry)
        return entries[-1] if entries else None

    def parse_entries(self, response):
        """Parse the feed and return one dict per ``<entry>`` in document order.

        Each dict has the keys ``art_url``, ``art_title``, ``art_publish``,
        ``art_update`` and ``art_author``; a value is ``''`` when the
        corresponding element is missing or empty.
        """
        return self._parse_feed(response)[1]

    def _parse_feed(self, response):
        """Return ``(feed_author, entries)`` extracted from the feed document."""
        # Build the DOM and work from the root element.
        root = parseString(response).documentElement

        # The blog owner's name comes from the first <author> in the document.
        feed_authors = root.getElementsByTagName('author')
        feed_author = self._first_text(feed_authors[0], 'name') if feed_authors else ''

        entries = []
        for info in root.getElementsByTagName('entry'):
            authors = info.getElementsByTagName('author')
            if authors:
                # Same choice as before: the last <author> of the entry wins.
                art_author = self._first_text(authors[-1], 'name')
            else:
                # An entry without its own author inherits the feed author.
                art_author = feed_author
            entries.append({
                "art_url": self._first_text(info, 'id'),
                "art_title": self._strip_author_suffix(self._first_text(info, 'title')),
                "art_publish": self._first_text(info, 'published'),
                "art_update": self._first_text(info, 'updated'),
                "art_author": art_author,
            })
        return feed_author, entries

    @staticmethod
    def _first_text(node, tag, default=''):
        """Return the text of the first ``<tag>`` under ``node``, or ``default``."""
        elements = node.getElementsByTagName(tag)
        if not elements:
            return default
        # firstChild is None for an empty element; getattr covers that case.
        return getattr(elements[0].firstChild, 'data', default)

    @staticmethod
    def _strip_author_suffix(title):
        """Drop the trailing ``- author`` segment of a title, if there is one."""
        head, sep, _ = title.rpartition('-')
        # Without any '-' there is no suffix to remove; keep the title intact.
        return head.strip() if sep else title
if __name__ == '__main__':
    # Script entry point: fetch the sample cnblogs feed and print its contents.
    RSS("http://feed.cnblogs.com/blog/u/565725/rss/").get_context()