...
import requests
from bs4 import BeautifulSoup


def gethtml(url, h):
    # Fetch a list page and return its decoded HTML
    r = requests.get(url, headers=h)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def getburl(r):
    # Extract the detail-page URL of every book on a list page
    soup = BeautifulSoup(r, 'lxml')
    burls = []
    for url in soup.find_all(class_='top-tit'):
        base_u = url.p.a.attrs['href']
        b_url = 'http://www.jb51.net' + base_u
        burls.append(b_url)
    return burls


def getbhtml(url):
    # Fetch a book's detail page, sending the page itself as the Referer
    he = {
        'Host': 'www.jb51.net',
        'Referer': url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    }
    r = requests.get(url, headers=he)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def bookinfo(r):
    # Parse the book title and all download links from a detail page
    soup = BeautifulSoup(r, 'lxml')
    name = soup.find(class_='new2').h1.text
    downurl = []
    durls = soup.find(class_='content greena clearfix')
    for a in durls.find_all('a'):
        downurl.append(a.attrs['href'])
    return name, downurl


if __name__ == "__main__":
    h = {
        'Host': 'www.jb51.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    }
    # file = open('booksdown.txt', 'a')
    for page in range(1, 4):
        url = 'http://www.jb51.net/books/list476_{}.html'.format(page)
        r = gethtml(url, h)
        burls = getburl(r)
        for burl in burls:
            br = getbhtml(burl)
            binfos = bookinfo(br)
            print(binfos)
            print('Original book page: {}'.format(burl))
            print(' ')
            # file.write(str(binfos))
            # file.write(' ')
    # file.close()
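If you want to keep the results instead of only printing them (the commented-out file calls hint at this), a minimal sketch is shown below. It reuses gethtml, getburl, getbhtml and bookinfo from the script above; the file name booksdown.txt comes from the original comment, while the UTF-8 encoding, the tab-separated line format, and the one-second pause between requests are assumptions added here, not part of the original script.

import time

# Minimal sketch: persist each book's title, source URL and download links
# to booksdown.txt, one book per line.
if __name__ == "__main__":
    h = {
        'Host': 'www.jb51.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    }
    with open('booksdown.txt', 'a', encoding='utf-8') as file:
        for page in range(1, 4):
            url = 'http://www.jb51.net/books/list476_{}.html'.format(page)
            for burl in getburl(gethtml(url, h)):
                name, downurl = bookinfo(getbhtml(burl))
                file.write('{}\t{}\t{}\n'.format(name, burl, ' '.join(downurl)))
                time.sleep(1)  # assumed polite delay between detail-page requests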