【python练习】截取网页里最新的新闻

需求：

从下面这个网页中抓取最新的新闻，并按发布日期（天）划分。

http://blog.eastmoney.com/13102551638/bloglist_0_1.html

实现方法1：使用递归

import urllib
import re
import time

# Download the raw HTML of the blog's article-list page.
content = urllib.urlopen('http://blog.eastmoney.com/13102551638/bloglist_0_1.html').read()

# One list entry per match; the three groups capture the link target,
# the anchor text (title) and the text of the "time" span.
pre = re.compile('<li><a href="(.+?)" target="_blank">(.+?)</a><span class="time">(.+?)</span></li>')

# findall with several groups returns a list of (href, title, time) tuples.
new = pre.findall(content)

class News:
    """Pick the title of the most recent entry from a list of scraped items.

    Each item is indexed as ``item[1]`` (title) and ``item[2]`` (timestamp
    text). The date is read from ``item[2]`` with the slices ``[1:5]``,
    ``[6:8]`` and ``[9:11]``, i.e. one leading character followed by
    ``YYYY-MM-DD`` (assumed from the slicing -- confirm against the page).
    """

    # Today's date as the integer YYYYMMDD, captured once when the class is
    # defined; int() ignores the trailing blank in the format string.
    t = int(time.strftime("%Y%m%d ", time.localtime()))

    def __init__(self, ct):
        """Store the scraped items.

        ct: sequence of items, each indexable as described on the class.
        """
        self.ct = ct

    @staticmethod
    def _date_of(item):
        """Return the item's date as the integer YYYYMMDD.

        Raises ValueError if the sliced characters are not digits.
        """
        stamp = item[2]
        return int(stamp[1:5] + stamp[6:8] + stamp[9:11])

    def search(self):
        """Return the title of the newest item, or None for an empty list.

        An item qualifies when its date is at least ``t - 1`` (plain integer
        arithmetic on YYYYMMDD). When nothing is that recent, the cut-off
        falls back to the newest date present in the list. The first
        qualifying item in list order wins.
        """
        dates = [self._date_of(item) for item in self.ct]
        if not dates:
            return None

        # Compute the cut-off directly instead of lowering it one step per
        # recursive call: old news can no longer exhaust the recursion
        # limit, and the shared class attribute ``t`` is left untouched.
        threshold = min(News.t - 1, max(dates))

        for item, date in zip(self.ct, dates):
            if date >= threshold:
                return item[1]
        return None

# Wrap the scraped entries in News, run its search and print what it returns.
aaa=News(new)
cc=aaa.search()
print(cc)

实现方法2：使用while循环

import urllib
import re
import time

# Fetch the article-list page as one raw HTML string.
content = urllib.urlopen('http://blog.eastmoney.com/13102551638/bloglist_0_1.html').read()

# Pattern for a single list entry: group 1 is the href, group 2 the anchor
# text (title), group 3 the contents of the "time" span.
pre = re.compile('<li><a href="(.+?)" target="_blank">(.+?)</a><span class="time">(.+?)</span></li>')

# Collect every entry as a (href, title, time) tuple.
new = pre.findall(content)

class Good:
    """Print the titles of the leading items that share the first item's day.

    Each item is indexed as ``item[1]`` (title) and ``item[2]`` (timestamp
    text); the first 11 characters of ``item[2]`` are used as the day key.
    """

    def __init__(self, ct):
        """Store the scraped items.

        ct: sequence of items, each indexable as described on the class.
        """
        self.ct = ct

    def search(self):
        """Print one title per line for the leading run of same-day items.

        Stops at the first item whose day key differs from the first item's.
        Prints nothing for an empty list. Returns None.
        """
        cc = self.ct
        if not cc:
            return

        # Comparing every item with the first one is equivalent to comparing
        # neighbours pairwise, and it never indexes past the end of the list
        # when all items (or the only item) fall on the same day.
        first_day = cc[0][2][0:11]
        for item in cc:
            if item[2][0:11] != first_day:
                break
            print(item[1])

# Wrap the scraped entries in Good and run its search, which prints the
# titles itself; cc only keeps the value that call returns.
aaa=Good(new)
cc=aaa.search()

相关阅读:
对html与body的一些研究与理解
关于文字内容溢出用点点点(...)省略号表示
CSS3中border-image属性详解
从TCP协议的原理来谈谈rst复位攻击
关于Oracle中sysoper这个系统权限的问题
翻翻git之---有用的欢迎页开源库 AppIntro
椒盐噪声
Codeforces Beta Round #1 A. Theatre Square
log4j:WARN Please initialize the log4j system properly解决的方法
微信平台开发——日历服务

原文地址：https://www.cnblogs.com/mogujiang/p/5528057.html