• 【搜索引擎Jediael开发笔记2】使用HttpClient下载网页至本地文件


    本文使用HttpClient根据url进行网页下载。其中

    (1)HttpClient的相关知识请参见 HttpClient基础教程

    (2)


    package org.ljh.search.downloadpage;
    
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.PrintWriter;
    import java.io.Writer;
    import java.util.Scanner;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpStatus;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    
    //本类用于将指定url对应的网页下载至本地一个文件。
    public class PageDownloader {
    
    	public static void downloadPageByGetMethod(String url) throws IOException {
    
    		// 1、通过HttpGet获取到response对象
    		CloseableHttpClient httpClient = HttpClients.createDefault();
    		// 注意,必需要加上http://的前缀,否则会报:Target host is null异常。
    		HttpGet httpGet = new HttpGet(url);
    		CloseableHttpResponse response = httpClient.execute(httpGet);
    
    		InputStream is = null;
    		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
    			try {
    				// 2、获取response的entity。
    				HttpEntity entity = response.getEntity();
    
    				// 3、获取到InputStream对象,并对内容进行处理
    				is = entity.getContent();
    
    				String fileName = getFileName(url);
    				saveToFile("D:\tmp\", fileName, is);
    			} catch (ClientProtocolException e) {
    				e.printStackTrace();
    			} finally {
    
    				if (is != null) {
    					is.close();
    				}
    				if (response != null) {
    					response.close();
    				}
    			}
    		}
    	}
    
    	//将输入流中的内容输出到path指定的路径,fileName指定的文件名
    	private static void saveToFile(String path, String fileName, InputStream is) {
    		Scanner sc = new Scanner(is);
    		Writer os = null;
    		try {
    			os = new PrintWriter(path + fileName);
    			while (sc.hasNext()) {
    				os.write(sc.nextLine());
    			}
    		} catch (FileNotFoundException e) {
    			e.printStackTrace();
    		} catch (IOException e) {
    			e.printStackTrace();
    		} finally {
    			if (sc != null) {
    				sc.close();
    			}
    			if (os != null) {
    				try{
    				os.flush();
    				os.close();
    				}catch(IOException e){
    					e.printStackTrace();
    					System.out.println("输出流关闭失败!");
    				}
    			}
    		}
    	}
    
    	// 将url中的特殊字符用下划线代替
    	private static String getFileName(String url) {
    		url = url.substring(7);
    		String fileName = url.replaceAll("[\?:*|<>"/]", "_") + ".html";
    		return fileName;
    	}
    
    }
    



  • 相关阅读:
    httpd添加新模块
    编译httpd细节
    apache配置文件说明及一些指令
    xen原理
    EXSI的使用
    VMWare ESX server安装
    虚拟化技术
    Kvm命令集管理虚拟机
    RAID几种方式
    BZOJ1011 [HNOI2008]遥远的行星 【奇技淫巧】
  • 原文地址:https://www.cnblogs.com/jediael/p/4304147.html
Copyright © 2020-2023  润新知