爬取博客数据

#coding:utf-8

import os
import time
import urllib

# Python 2 script: walk the paginated article list of one Sina blog and save
# every linked article page as a file inside OUTPUT_DIR.

LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_%d.html'
PAGE_COUNT = 7             # list pages 1..PAGE_COUNT are visited
MAX_LINKS_PER_PAGE = 50    # upper bound on articles taken from one list page
OUTPUT_DIR = 'hanhan'


def _fetch(address):
    """Return the raw response body for address, closing the connection."""
    response = urllib.urlopen(address)
    try:
        return response.read()
    finally:
        response.close()


# One slot per article that can possibly be collected over the whole crawl;
# slots that are never reached keep the empty string.
url = [''] * (PAGE_COUNT * MAX_LINKS_PER_PAGE)
link = 1  # 1-based running count of articles across all pages

# open() does not create missing directories, so make sure the target exists.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

for page in range(1, PAGE_COUNT + 1):
    con = _fetch(LIST_URL % page)
    i = 0  # number of articles handled on the current list page
    # An entry is located by three markers in order: '<a title=', the next
    # 'href=' after it, and the next '.html' that ends the address.
    title = con.find('<a title=')
    href = con.find('href=', title)
    html = con.find('.html', href)

    while (title != -1 and href != -1 and html != -1
           and i < MAX_LINKS_PER_PAGE):
        # Skip the 5 characters of 'href=' plus one more (presumably the
        # opening quote) and keep the 5 characters of '.html'.
        article = con[href + 6:html + 5]
        # Index by the global counter so that a later page does not
        # overwrite the addresses collected from an earlier one.
        url[link - 1] = article
        print('%d %s' % (link, article))
        content = _fetch(article)
        # The last 26 characters of the address are used as the file name.
        # Binary mode stores the downloaded bytes unmodified, and the with
        # block closes the file even if the write fails.
        with open(OUTPUT_DIR + '/' + article[-26:], 'wb') as out:
            out.write(content)
        print('downloading %s' % article)
        time.sleep(1)  # pause between consecutive requests
        # Continue the search after the address that was just consumed.
        title = con.find('<a title=', html)
        href = con.find('href=', title)
        html = con.find('.html', href)
        i = i + 1
        link = link + 1
    print('%d find end!' % page)

print('all find end')

相关阅读:
阅读 video in to axi4-stream v4.0 笔记
python 字符串操作
python 基本语句
Python 算术运算符
芯片企业研报阅读
量化分析v1
基于MATLAB System Generator 搭建Display Enhancement模型
System Generator 生成IP核在Vivado中进行调用
FPGA 中三角函数的实现
System Generator 使用离散资源

原文地址：https://www.cnblogs.com/XDJjy/p/5426510.html