• Python 爬虫：爬取系列博客文章列表


    python中写个爬虫真是太简单了

    
    
    
    import urllib.request
    from pyquery import PyQuery as PQ
    
    # 根据URL获取内容并解码为UTF-8
    def getHtml(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Args:
            url: Any URL accepted by ``urllib.request.urlopen``.

        Returns:
            The response body as a ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # Use the response as a context manager so the underlying connection
        # is closed even when read() or decode() raises.
        with urllib.request.urlopen(url) as page:
            return page.read().decode('UTF-8')
    
    # 解析返回的html
    def getArtical(html, results):
        """Collect the numbered "教你炒股票" articles found in one list page.

        Every link matched by the ``.atc_title a`` selector whose text contains
        the series keyword is turned into a ``(number, markdown_link)`` tuple
        and appended to *results*. ``number`` is the integer that sits between
        the keyword and the first ``:`` following it, so the caller can sort
        the collected entries by their position in the series.

        Args:
            html: Markup of an article list page.
            results: List that receives the ``(int, str)`` tuples; it is
                mutated in place.

        Returns:
            None.
        """
        keyword = '教你炒股票'
        # An earlier revision selected '.searchAtcList .searchAtc_top a'
        # instead (presumably for the search result pages).
        for link in PQ(html)('.atc_title a').items():
            title = link.text()
            href = link.attr('href')
            if keyword not in title:
                continue

            # A title cut short in the list contains an ellipsis; the complete
            # title then has to be read from the article page itself.
            if '…' in title:
                title = getArticalDetail(href)

            # Locate the series number relative to the keyword rather than at
            # a fixed offset, so a prefix before the keyword does not break
            # the slice.
            start = title.find(keyword)
            if start < 0:
                continue
            start += len(keyword)
            end = title.find(':', start)
            if end < 0:
                continue

            # An entry without a parsable number cannot be ordered; skip it
            # instead of aborting the whole crawl.
            try:
                number = int(title[start:end])
            except ValueError:
                continue

            results.append((number, '[' + title + '](' + href + ')'))
    
    # 获取文章标题
    def getArticalDetail(url):
        """Return the text of the ``.articalTitle h2`` element of the page at *url*.

        The page is downloaded with ``getHtml`` and parsed with PyQuery.
        """
        return PQ(getHtml(url))('.articalTitle h2').text()
    
    # Common prefix of the article list pages; the page number and '.html'
    # are appended to it, e.g.
    #   http://blog.sina.com.cn/s/articlelist_1215172700_0_1.html
    #   http://blog.sina.com.cn/s/articlelist_1215172700_0_15.html
    # Search URLs that were tried previously:
    #   http://control.blog.sina.com.cn/search/search.php?uid=1215172700&keyword=%E8%82%A1%E7%A5%A8&page=
    blog3 = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_'

    results = []

    # The list spans 23 pages, numbered from 1.
    for page_no in range(1, 24):
        page_url = '{}{}.html'.format(blog3, page_no)
        print(page_url)
        getArtical(getHtml(page_url), results)

    # Order the entries by series number, then print only the markdown links.
    results.sort()
    for _, markdown_link in results:
        print(markdown_link)
    
    
    
    
  • 相关阅读:
    Zuul的核心源码解析
    基于Sentinel的服务保护
    Sentinel
    windows进行配置转发
    Hystrix断路器
    服务熔断Hystrix高级
    微服务架构的高并发问题
    Feign的高级配置
    倒排序原理和实例
    云计算技术的产生、概念、原理、应用和前景
  • 原文地址:https://www.cnblogs.com/wancy86/p/6377971.html
Copyright © 2020-2023  润新知