# https://study.163.com/course/courseMain.htm?courseId=1006183019&share=2&shareId=400000000398149
# -*- coding: utf-8 -*-
"""Download xkcd comic images concurrently with several threads.

Created on Sat Mar 26 08:49:42 2016

@author: daxiong

Comic page URLs are read from ``urls.txt`` (one URL per line), cut into
about twenty slices, and each slice is downloaded by its own thread into
the ``./xkcd`` directory.
"""
import os
import threading
import time

import bs4
import requests

# Start time; TimeCount() measures the elapsed time from this moment.
timeBegin = time.time()
url = 'http://xkcd.com/'  # starting url
urls_list = []  # comic page URLs; filled by main() or crawl_urls()
bad_urls = []  # URLs whose download raised an exception
filename = "urls.txt"

# Store comics in ./xkcd.  exist_ok keeps a second run from failing because
# the directory was already created by an earlier one.
os.makedirs('xkcd', exist_ok=True)


def crawl_urls(url):
    """Follow "prev" links starting at *url* and collect the page URLs.

    Every page is fetched, the href of its first ``a[rel="prev"]`` element
    is appended to ``'http://xkcd.com'`` and the result is added to the
    module-level ``urls_list``.  The walk ends when the current URL ends
    with ``'#'`` or when a page has no "prev" link.

    Note that *url* itself is never appended, while the last URL found
    (the one ending in ``'#'``) is.

    Args:
        url: URL of the page to start from.

    Raises:
        requests.HTTPError: if a page answers with an error status.
    """
    while not url.endswith('#'):
        res = requests.get(url)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "lxml")
        prev_links = soup.select('a[rel="prev"]')
        if not prev_links:
            # Nothing to follow: stop instead of failing with IndexError.
            break
        url = 'http://xkcd.com' + prev_links[0].get('href')
        urls_list.append(url)


def get_url_from_file(filename):
    """Return the URLs stored in *filename*, one per line.

    Surrounding whitespace, including the line terminator, is removed from
    every line and blank lines are skipped, so the returned strings can be
    passed to ``requests.get`` as they are.

    Args:
        filename: path of the text file to read.

    Returns:
        A list of URL strings in file order.
    """
    with open(filename, "r") as url_file:
        stripped = (line.strip() for line in url_file)
        return [line for line in stripped if line]


def file_urls(urls_list):
    """Write *urls_list* to the module-level ``filename``, one URL per line.

    The file is overwritten.  One URL per line is the layout that
    ``get_url_from_file`` reads back.

    Args:
        urls_list: iterable of URL strings to store.
    """
    with open(filename, "w") as url_file:
        url_file.writelines(url + "\n" for url in urls_list)


def download_image(url):
    """Download the comic image shown on the page *url* into ``./xkcd``.

    The image is the first element matching ``#comic img``; its ``src`` is
    prefixed with ``'http:'`` and the file is saved under the base name of
    that URL.

    Args:
        url: URL of a comic page.

    Raises:
        requests.HTTPError: if the page or the image answers with an error
            status (otherwise an error page would be saved as the image).
        IndexError: if the page has no ``#comic img`` element.
    """
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "lxml")
    comicElem = soup.select('#comic img')
    comicUrl = 'http:' + comicElem[0].get('src')
    print('Downloading image %s...' % (comicUrl))

    res = requests.get(comicUrl)
    res.raise_for_status()
    target = os.path.join('xkcd', os.path.basename(comicUrl))
    with open(target, 'wb') as imageFile:
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)


def _download_each(urls):
    """Download every page in *urls*, recording failures in ``bad_urls``."""
    for url in urls:
        try:
            download_image(url)
        except Exception:
            # Best effort: remember the failing URL and carry on.  Catching
            # Exception (not a bare except) lets Ctrl-C still interrupt.
            bad_urls.append(url)


def download_all(urls_list):
    """Download the images of all pages in *urls_list*, one after another.

    Pages that fail are appended to the module-level ``bad_urls``.
    """
    _download_each(urls_list)
    print("well Done")


def download_range(start, end):
    """Download the images of ``urls_list[start:end]``.

    Reads the module-level ``urls_list``; used as the thread target in
    ``main``.  Pages that fail are appended to ``bad_urls``.

    Args:
        start: index of the first URL of the slice.
        end: index one past the last URL of the slice.
    """
    _download_each(urls_list[start:end])


def Step(urls_list):
    """Return the slice length that cuts *urls_list* into about 20 parts.

    The result is never smaller than 1, so it is always a valid ``range``
    step even for short or empty lists.
    """
    step = int(round(len(urls_list) / 20.0, 0))
    return max(1, step)


def TimeCount():
    """Print and return the seconds between ``timeBegin`` and ``timeEnd``.

    Both values are module-level globals; ``timeEnd`` is set by ``main``
    once all download threads have finished.
    """
    timeComsuming = timeEnd - timeBegin
    print("time Comsuming:%f seconds" % timeComsuming)
    return timeComsuming


def main():
    """Download every comic listed in ``filename``, one thread per slice."""
    global urls_list, timeEnd

    urls_list = get_url_from_file(filename)
    step = Step(urls_list)

    # Create and start one Thread object per slice of `step` URLs.
    downloadThreads = []
    for i in range(0, len(urls_list), step):
        downloadThread = threading.Thread(target=download_range,
                                          args=(i, i + step))
        downloadThreads.append(downloadThread)
        downloadThread.start()

    # Wait for all threads to end.
    for downloadThread in downloadThreads:
        downloadThread.join()
    print('Done.')

    # Report the pages that could not be downloaded instead of dropping
    # them silently.
    if bad_urls:
        print('%d page(s) failed to download:' % len(bad_urls))
        for bad_url in bad_urls:
            print(bad_url)

    # End time, then report how long the program took.
    timeEnd = time.time()
    TimeCount()


if __name__ == "__main__":
    main()
# https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149 (the blog author's video tutorial homepage)