• Python crawler


Crawl target: the Apress blog at www.apress.com. The script below extracts the titles and URLs of recent blog posts, then fetches each post's content as plain text.

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 10 18:01:41 2017
    @author: Raghav Bali
    """
    
    """
    This script crawls apress.com's blog page to:
        + extract list of recent blog post titles and their URLs
        + extract content related to each blog post in plain text
    using requests and BeautifulSoup packages
    ``Execute``
            $ python crawl_bs.py
    """
    
    
    import requests
    from time import sleep
    from bs4 import BeautifulSoup
    
    
    def get_post_mapping(content):
        """This function extracts blog post title and url from response object
        Args:
            content (bytes): response body from requests.get(...).content
        Returns:
            list: a list of dictionaries with keys title and url
        """
        post_detail_list = []
        post_soup = BeautifulSoup(content, "lxml")
        h3_content = post_soup.find_all("h3")

        for h3 in h3_content:
            # skip any <h3> that does not wrap a link
            if h3.a is not None:
                post_detail_list.append(
                    {'title': h3.a.get_text(),
                     'url': h3.a.attrs.get('href')})

        return post_detail_list
    
    
    def get_post_content(content):
        """This function extracts blog post content from response object
        Args:
            content (bytes): response body from requests.get(...).content
        Returns:
            str: blog's content in plain text
        """
        plain_text = ""
        text_soup = BeautifulSoup(content,"lxml")
        para_list = text_soup.find_all("div",
                                       {'class':'cms-richtext'})
        
        for p in para_list[0]:
            plain_text += p.getText()
        
        return plain_text
        
        
    
    if __name__ == '__main__':
        
        crawl_url = "http://www.apress.com/in/blog/all-blog-posts"
        post_url_prefix = "http://www.apress.com"
        
        print("Crawling Apress.com for recent blog posts...
    
    ")    
        
        response = requests.get(crawl_url)

        # initialize so the check below is safe if the request fails
        blog_post_details = None
        if response.status_code == 200:
            blog_post_details = get_post_mapping(response.content)
        
        if blog_post_details:
            print("Blog posts found:{}".format(len(blog_post_details)))
            
            for post in blog_post_details:
                print("Crawling content for post titled:",post.get('title'))
                post_response = requests.get(post_url_prefix+post.get('url'))
                
                if post_response.status_code == 200:
                    post['content'] = get_post_content(post_response.content)
                
                print("Waiting for 10 secs before crawling next post...
    
    ")
                sleep(10)
        
            print("Content crawled for all posts")
            
            # print/write content to file
            for post in blog_post_details:
                print(post)
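
The final loop only prints each post dictionary to stdout; the "print/write content to file" comment suggests persisting the results instead. A minimal sketch of that step, assuming the blog_post_details list built above and a hypothetical output filename apress_posts.json:

    import json

    def save_posts(posts, path="apress_posts.json"):
        """Write crawled post dicts (title, url, content) to a JSON file."""
        with open(path, "w", encoding="utf-8") as fp:
            # ensure_ascii=False keeps any non-ASCII text readable in the dump
            json.dump(posts, fp, ensure_ascii=False, indent=2)

    # e.g., at the end of the __main__ block, instead of printing each post:
    # save_posts(blog_post_details)

JSON preserves the title/url/content structure of each post, so the file can be reloaded later with json.load for further processing.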
    