"""Download wallpaper thumbnails from the netbian.com "meinv" listing pages.

The script walks listing pages 1-9, extracts every ``<img>`` found under
``div.list ul li`` and stores the image bytes as sequentially numbered
``.jpg`` files inside ``SAVE_DIR``.
"""
import os
import random
import time
from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq

# Custom request headers (browser-like User-Agent).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Custom proxy address.
# NOTE(review): only an 'https' entry is configured while the listing URLs
# below use plain http; requests picks proxies by URL scheme, so this entry
# presumably only applies to https image URLs -- confirm this is intended.
proxyip = {
    'https': '180.109.124.30:4216'
}

# Front page of the category; pages 2-9 live at BASE_URL + "index_<n>.htm".
BASE_URL = "http://www.netbian.com/meinv/"

# Directory the images are written to (the backslash is escaped explicitly).
SAVE_DIR = "F:\\image"

# Seconds to wait for a server response before giving up on a request.
REQUEST_TIMEOUT = 10

# Pause between two image downloads, in seconds.
DOWNLOAD_DELAY = 2


def page_url():
    """Return the URLs of listing pages 1 through 9.

    Returns:
        A list whose first element is the front page and whose remaining
        elements are the ``index_2.htm`` .. ``index_9.htm`` pages.
    """
    url_list = [BASE_URL]
    url_list.extend("{}index_{}.htm".format(BASE_URL, i) for i in range(2, 10))
    return url_list


def _fetch(url):
    """GET *url* with the shared headers/proxy; return the response or None.

    Network errors and non-2xx status codes are reported on stdout and turned
    into ``None`` so that one bad URL does not abort the whole run.
    """
    try:
        response = requests.get(
            url, headers=header, proxies=proxyip, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print('请求失败,已跳过: {} ({})'.format(url, exc))
        return None
    return response


def start_request():
    """Download every listed image into ``SAVE_DIR``.

    Use ``.text`` for page source and ``.content`` for binary payloads such
    as images. Files are named ``<counter>.jpg`` with the counter starting
    at 0 and advancing only after a successful write.
    """
    # Create the target directory once, before any download starts.
    os.makedirs(SAVE_DIR, exist_ok=True)

    y = 0
    for page in page_url():
        r = _fetch(page)
        if r is None:
            continue
        # The response body is decoded as GBK before parsing.
        r.encoding = 'GBK'
        doc = pq(r.text)

        # Coarse match of the thumbnail <img> elements.
        for image in doc('div.list ul li img').items():
            # Exact image address; skip tags without a usable src.
            src = image.attr('src')
            if not src:
                continue
            # Resolve relative addresses against the page they came from.
            img_url = urljoin(page, src)
            print(img_url)

            img_response = _fetch(img_url)
            if img_response is None:
                continue

            path = os.path.join(SAVE_DIR, str(y) + ".jpg")
            with open(path, 'wb') as f:
                f.write(img_response.content)

            time.sleep(DOWNLOAD_DELAY)
            print('正在下载第{}张图片'.format(y))
            print("写入完成")
            y += 1


def main():
    """Script entry point: run the full download."""
    start_request()


if __name__ == "__main__":
    main()
效果如下