• Python 爬取 书籍


    ...

    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup
    
    
    def gethtml(url, h, timeout=10):
        """Download ``url`` using the request headers ``h`` and return its text.

        Args:
            url: Address of the page to fetch.
            h: Mapping of HTTP headers handed to ``requests.get``.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection could block the crawl forever.

        Returns:
            The response body decoded to ``str``.

        Raises:
            requests.HTTPError: The server answered with a 4xx/5xx status.
            requests.Timeout: The server did not respond within ``timeout``.
        """
        r = requests.get(url, headers=h, timeout=timeout)
        r.raise_for_status()
        # Decode with the charset detected from the body rather than the one
        # announced in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    
    def getburl(r):
        """Collect the absolute book-page URLs found on a listing page.

        Args:
            r: HTML text of a listing page.

        Returns:
            A list with one URL per ``top-tit`` element that holds a
            ``<p><a href="...">`` link, resolved against
            ``http://www.jb51.net``. Elements without such a link are skipped.
        """
        soup = BeautifulSoup(r, 'lxml')
        burls = []
        for item in soup.find_all(class_='top-tit'):
            paragraph = item.p
            link = paragraph.a if paragraph is not None else None
            href = link.get('href') if link is not None else None
            if not href:
                # One malformed entry should not abort the whole listing page.
                continue
            # urljoin copes with both relative and already-absolute hrefs,
            # which plain string concatenation does not.
            burls.append(urljoin('http://www.jb51.net', href))
        return burls
    def getbhtml(url, timeout=10):
        """Download a book detail page and return its text.

        The request carries a ``Referer`` header equal to ``url`` itself, in
        addition to fixed ``Host`` and ``User-Agent`` headers.

        Args:
            url: Address of the book detail page.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection could block the crawl forever.

        Returns:
            The response body decoded to ``str``.

        Raises:
            requests.HTTPError: The server answered with a 4xx/5xx status.
            requests.Timeout: The server did not respond within ``timeout``.
        """
        he = {
            'Host': 'www.jb51.net',
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        r = requests.get(url, headers=he, timeout=timeout)
        r.raise_for_status()
        # Decode with the charset detected from the body rather than the one
        # announced in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    
    def bookinfo(r):
        """Extract a book's title and download links from its detail page.

        Args:
            r: HTML text of a book detail page.

        Returns:
            A ``(name, downurl)`` tuple. ``name`` is the text of the ``<h1>``
            inside the element with class ``new2``, or ``None`` when that
            heading is absent. ``downurl`` is the list of ``href`` values of
            the anchors inside the element with class
            ``content greena clearfix``; it is empty when that element is
            missing.
        """
        soup = BeautifulSoup(r, 'lxml')

        title_box = soup.find(class_='new2')
        heading = title_box.h1 if title_box is not None else None
        name = heading.text if heading is not None else None

        durls = soup.find(class_='content greena clearfix')
        if durls is None:
            downurl = []
        else:
            # href=True keeps only anchors that actually carry a link, so
            # anchors without one cannot raise KeyError.
            downurl = [a['href'] for a in durls.find_all('a', href=True)]
        return name, downurl
    
    if __name__ == "__main__":
        # Headers sent with every listing-page request.
        h = {
            'Host': 'www.jb51.net',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        # To keep the results on disk, append str(binfos) followed by '\n\n'
        # to 'booksdown.txt' inside the inner loop below.
        for page in range(1, 4):  # listing pages 1, 2 and 3
            url = 'http://www.jb51.net/books/list476_{}.html'.format(page)
            r = gethtml(url, h)
            for burl in getburl(r):
                # Fetch each book's detail page and print (title, download links).
                binfos = bookinfo(getbhtml(burl))
                print(binfos)
                print('书籍原地址:{}'.format(burl))
                # Blank lines separate consecutive records.
                print('\n\n')
  • 相关阅读:
    C#基础知识(以宝马,车,车轮为例)
    JAVA之本地图片复制
    JAVA获取PC每个盘符,获取每个盘符总大小,剩余空间大小
    Extjs window autoload
    jquery入门(一)
    extjs 鼠标滑过grid时产生提示grid列中的值
    poi导出
    Extjs 报表同值合并方法
    强大的grep命令
    job
  • 原文地址:https://www.cnblogs.com/mysterious-killer/p/10157119.html
Copyright © 2020-2023  润新知