package com.tl.spider.download; import com.tl.spider.utils.StaticValue; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @ClassName CharsetDetectorUtil * @Description 给定一个url,获取其网页源代码的编码 * @Author Administrator * @Date 2019/5/19 13:57 * @Version 1.0 **/ public class CharsetDetectorUtil { /** * 获取网页源代码的编码方式 * @param url * @return * @throws IOException */ public static String getCharset(String url) throws Exception { String charSet = null; URL urlObject = new URL(url); URLConnection urlConnection = urlObject.openConnection(); Map<String, List<String>> map = urlConnection.getHeaderFields(); List<String> list = map.get("Content-Type"); if(list != null && !list.isEmpty()) { String line = list.get(0); String[] array = line.split(";"); for(String str : array) { String[] eleArray = str.split("="); if(eleArray.length == 2) { if(eleArray[0].equals("charset")) { charSet = eleArray[1].trim(); } } } } /** * 由于网页的编码方式的说明只在网页源代码的前几行,所以不需要获取所有的网页源代码 */ if(charSet == null) { // 启用meta获取网页的编码方式 BufferedReader bufferedReader = WebPageDownLoadUtil.getBR(url, StaticValue.ENCODING_DEFAULT); String tmp = null; while((tmp = bufferedReader.readLine()) != null) { tmp = tmp.toLowerCase(); String charset = getCharSetValue4Line(tmp); if(charset != null) { charSet = charset; break; } if(tmp.contains("</head>")) { break; } } if(bufferedReader != null) { bufferedReader.close(); } } return charSet; } /** * * @param line * @return */ public static String getCharSetValue4Line(String line) { String charsetValue = null; String regex = "charset="?(.+?)"?\s?/?>"; // 这个地方需要综合多个网页进行相应的修改 Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(line); if(matcher.find()) { charsetValue = matcher.group(1); } return charsetValue; 
} public static void main(String[] args) throws Exception { String url = "http://news.youth.cn/"; //String url = "https://www.baidu.com/"; //String url= "https://hao.360.com/?s0001"; String charSet = getCharset(url); System.out.println(charSet); } }
其中 WebPageDownLoadUtil.getBR 函数为:
/**
 * Opens the resource at {@code url} and returns a {@link BufferedReader} that decodes its
 * bytes with the given charset. The caller is responsible for closing the returned reader.
 *
 * @param url     address of the resource to read
 * @param charset name of the charset used to decode the stream
 * @return a buffered reader over the resource's content
 * @throws Exception if the URL is malformed, the stream cannot be opened, or the charset
 *                   name is not supported
 */
public static BufferedReader getBR(String url, String charset) throws Exception {
    URL urlObject = new URL(url);
    InputStream inputStream = urlObject.openStream();
    try {
        return new BufferedReader(new InputStreamReader(inputStream, charset));
    } catch (Exception e) {
        // Building the reader failed (e.g. unsupported charset name); close the already
        // opened stream so it does not leak, then rethrow the original failure.
        try {
            inputStream.close();
        } catch (Exception closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}