• Web crawler: scrape book information and ISBN codes from allitebooks.com, and save book titles and ISBNs into a .csv (2), from backslash112


    # Python 2 imports (urllib2); Python 3 counterparts are sketched after the listing
    from urllib2 import urlopen
    from bs4 import BeautifulSoup
    import csv
    
    # Get the next page url from the current page url
    def get_next_page_url(url):
        page = urlopen(url)
        soup_page = BeautifulSoup(page, 'lxml')
        page.close()
        # Get current page and next page tag
        current_page_tag = soup_page.find(class_="current")
        next_page_tag = current_page_tag.find_next_sibling()
        # Check if the current page is the last one
        if next_page_tag is None:
            next_page_url = None
        else:
            next_page_url = next_page_tag['href']
        return next_page_url 
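
    # Usage sketch (hypothetical page URL; assumes the site's WordPress-style
    # pagination, where the current page number carries class="current" and the
    # next page is the following sibling link):
    #   get_next_page_url('http://www.allitebooks.com/page/2/')
    #   -> the following page's URL, or None on the last page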
    
    # Get the book detail urls by page url 
    def get_book_detail_urls(url):
        page = urlopen(url)
        soup = BeautifulSoup(page, 'lxml')
        page.close()
        urls = []
        book_header_tags = soup.find_all(class_="entry-title")
        for book_header_tag in book_header_tags:
            urls.append(book_header_tag.a['href'])
        return urls
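
    # Usage sketch: passing a listing-page URL such as the start URL below returns
    # the detail-page URL of every book on that page (assumes each book title is
    # an "entry-title" element wrapping an <a> link):
    #   get_book_detail_urls('http://www.allitebooks.com/marketing-seo/')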
        
    
    # Get the book detail info by book detail url
    def get_book_isbn(url):
        page = urlopen(url)
        book_isbn_soup = BeautifulSoup(page, 'lxml')
        page.close()
        title_tag = book_isbn_soup.find(class_="single-title")
        title = title_tag.string
        isbn_key_tag = book_isbn_soup.find(text="ISBN-10:").parent
        isbn_tag = isbn_key_tag.find_next_sibling()
        isbn = isbn_tag.string.strip() # Remove the whitespace with the strip method
        return [title, isbn]
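
    # Usage sketch (hypothetical detail URL; assumes the page labels the ISBN
    # with literal "ISBN-10:" text whose parent's next sibling holds the value):
    #   get_book_isbn('http://www.allitebooks.com/some-book/')
    #   -> ['Some Book Title', '1234567890']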
    
        
    def save_to_csv(book_info_list):
        # Output path from the original post; adjust for your machine
        path = '/Users/zhouxin/Desktop/books.csv'
        with open(path, 'w') as fp:
            writer = csv.writer(fp, delimiter=',')
            writer.writerow(['title', 'isbn'])
            writer.writerows(book_info_list)
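
    # Note: Python 2's csv docs recommend opening the file with the 'b' flag
    # ('wb') on platforms where it matters (Windows); plain 'w' works for the
    # macOS path used above.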
        
    def start():
        url = "http://www.allitebooks.com/marketing-seo/"
        book_info_list = []
        def next_page(page_url):
            book_detail_urls = get_book_detail_urls(page_url)
            for book_detail_url in book_detail_urls:
                book_info = get_book_isbn(book_detail_url)
                print ",".join(book_info)  # print each book's title and ISBN
                book_info_list.append(book_info)
            next_page_url = get_next_page_url(page_url)
            if next_page_url is not None:
                next_page(next_page_url)  # recurse until the last page
        next_page(url)
        save_to_csv(book_info_list)
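
    # The original listing defines start() but never calls it; this entry-point
    # guard (an addition) makes the script runnable directly:
    if __name__ == '__main__':
        start()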
    
            

Output:
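Each crawled book is printed as one comma-separated "title,isbn" line, and books.csv contains a "title,isbn" header row followed by the same rows.

The listing targets Python 2 (urllib2 and the print statement). Below is a minimal sketch of the Python 3 counterparts, assuming BeautifulSoup 4 and lxml are installed; the default output path is hypothetical:

    # Python 3 counterparts for the Python 2 constructs used above
    from urllib.request import urlopen  # replaces: from urllib2 import urlopen
    import csv

    def save_to_csv(book_info_list, path='books.csv'):
        # In Python 3, open csv files with newline='' so the csv module
        # controls line endings (avoids blank rows on Windows)
        with open(path, 'w', newline='') as fp:
            writer = csv.writer(fp, delimiter=',')
            writer.writerow(['title', 'isbn'])
            writer.writerows(book_info_list)

    # print becomes a function:
    #   print(",".join(book_info))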

  • Original article: https://www.cnblogs.com/XinZhou-Annie/p/7157240.html