java爬虫遇到个页面加密的东西,找了些资料学习学习
做了个java运行js的工具类,希望对大家有用。其中用到的client(用于获取js)可以自行换成自己的client。主要用到了以下几个组件:
Rhino就是JavaScript引擎,它的目的就是实现Java与JavaScript的互操作性。rhino-1.7R1.jar
Envjs一个纯js方式在无浏览器环境下模拟浏览器的行为。envjs-1.2.js
一般网站的js中都会用到jquery,所以还用了jquery.js
import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.Reader; import java.lang.ref.SoftReference; import java.net.URI; import java.nio.charset.Charset; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.Validate; import org.apache.http.Header; import org.apache.http.HeaderElement; import org.apache.http.HttpEntity; import org.apache.http.ParseException; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.protocol.RequestAcceptEncoding; import org.apache.http.impl.DefaultConnectionReuseStrategy; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.util.Args; import org.apache.http.util.ByteArrayBuffer; import org.jsoup.Jsoup; import org.mozilla.javascript.Context; import org.mozilla.javascript.ContextFactory; import org.mozilla.javascript.Function; import org.mozilla.javascript.Scriptable; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; //import net.sourceforge.htmlunit.corejs.javascript.Context; //import net.sourceforge.htmlunit.corejs.javascript.ContextFactory; //import net.sourceforge.htmlunit.corejs.javascript.Function; //import net.sourceforge.htmlunit.corejs.javascript.Scriptable; /** * 参照http://mybeautiful.iteye.com/blog/1442839 * http://m.oschina.net/blog/121347 * http://blog.csdn.net/dwjmantou/article/details/45276967 * 
http://lcllcl987.iteye.com/blog/87423 * ***不可使用htmlunit的包******Cannot call method "setOptimizationLevel" of null * @author 5432 * */ public class RhinoScaper { private Context context; private Scriptable scriptable; /** * 初始化方法 */ public void init(){ context = ContextFactory.getGlobal().enterContext(); scriptable =context.initStandardObjects(null); context.setOptimizationLevel(-1); context.setLanguageVersion(Context.VERSION_1_5); // 初始化测试用,并定义envjs-1.2.js未定义print context.evaluateString(scriptable, "var v='sssaass';" + "var print = function(v) {"+ " java.lang.System.out.println(v);return v ;"+ " };function hah(){return v }", "print",1,null); // System.out.println("v == " + scriptable.get("v", scriptable) ); Function prf = (Function)scriptable.get("print", scriptable); Object call = prf.call(Context.getCurrentContext(), scriptable, prf, new Object[]{"test"}); // System.out.println("print == "+call.toString()); Object invokFunction = invokFunction("hah"); // System.out.println(invokFunction.toString()); String[] file = { this.getClass().getResource("/")+"envjs-1.2.js", "./lib/jquery.js" }; for (String f : file) { evaluateJs(f); } } /** * 调用函数 * @param functionName * @param functionArags * @return */ public Object invokFunction(String functionName,Object... 
functionArags) { Validate.notNull(context, "context is null"); Validate.notNull(scriptable, "scriptable is null"); Function function = (Function) scriptable.get(functionName, scriptable); Object call = function.call(Context.getCurrentContext(), scriptable, function, functionArags); // System.out.println("reslult = "+call.toString()); return call; } /** * 加载js文件 * (当没有找到对应文件, * 且要加载文件名路径包含‘envjs-1.2.js’ 会访问 https://raw.githubusercontent.com/ryan-roemer/envjs-1.2/master/env.rhino.1.2.js * 文件名路径包含‘jquery.js’ 会访问 http://apps.bdimg.com/libs/jquery/1.6.0/jquery.js * 加载js文件 ) * @param f 文件名路径 */ public void evaluateJs(String f) { Validate.notNull(context, "context is null"); Validate.notNull(scriptable, "scriptable is null"); FileReader in = null; try { // FileInputStream fI = new FileInputStream(f); // String js = IOUtils.toString(fI, "UTF-8");//设置默认js文件编码为utf-8 // context.evaluateString(scriptable, js, f, 1, null); in = new FileReader(f); context.evaluateReader(scriptable, in, f, 1, null); } catch (FileNotFoundException e1) { // e1.printStackTrace(); if (f.contains("envjs-1.2.js")) { String envjs ="https://raw.githubusercontent.com/ryan-roemer/envjs-1.2/master/env.rhino.1.2.js"; try { SoftReference<String> htmlString = Client.getHtmlString(envjs); String jqueryStr = htmlString==null?"":htmlString.get(); // DefaultClient defaultClient = new DefaultClient(); // String jqueryStr =defaultClient.get(envjs).asHtml(); context.evaluateString(scriptable, jqueryStr, envjs, 1, null); } catch (Exception e) { e.printStackTrace(); } } else if (f.contains("jquery.js")) { String jquery = "http://apps.bdimg.com/libs/jquery/1.6.0/jquery.js"; Reader bufR =null; try { SoftReference<Reader> htmlReader = Client.getHtmlReader(jquery); bufR = htmlReader==null?new BufferedReader(null):htmlReader.get(); // String js = IOUtils.toString(bufR); context.evaluateReader(scriptable, bufR , jquery, 1, null); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); 
}finally { // close(bufR); IOUtils.closeQuietly(bufR); } } else{ throw new RuntimeException("unknown file "+f); } } catch (IOException e1) { e1.printStackTrace(); }finally { // close(in); IOUtils.closeQuietly(in); } } public static void main(String[] args) { RhinoScaper rhinoScaper = new RhinoScaper(); rhinoScaper.init(); // rhinoScaper.JSloadString("jsString", "jsname"); // rhinoScaper.evaluateJs("E:/Desktop/loginjs.js"); // rhinoScaper.loadJS("", classpathURI); // 电信登录加密测试 String pwd="111"; StringBuilder ascending = new StringBuilder(); SoftReference<String> htmlString = null; try { htmlString = Client.getHtmlString("http://login.189.cn/bundles/jquery?v=h3Pl8XT8zdNkoI1VbV5sEZOBrSqsxRXX0TIQ9S_lAlM1"); } catch (Exception e) { e.printStackTrace(); } String jsStr =htmlString==null?"":htmlString.get(); jsStr = jsStr.replaceAll("float:", "floats:").replaceAll("throws", "throwss"); ascending.append(jsStr); ascending.append("; var input=document.createElement("input");input.value='"+pwd+"';;input.id= 'pass';input.type='password';"); ascending.append(" function getpassword(){ return $(input).valAesEncryptSet()}"); rhinoScaper.JSloadString(ascending.toString(), "jsname"); Object result = rhinoScaper.invokFunction("getpassword"); System.out.println(result); try { htmlString = Client.getHtmlString("http://www.youdaili.net/Daili/"); jsStr =htmlString==null?"":htmlString.get(); String runScript = rhinoScaper.runScript(jsStr); System.out.println(runScript); } catch (Exception e) { e.printStackTrace(); } } /** * 运行js * @param html * @return */ private String runScript(String html) { String function = null;int jsfrom = 0; Pattern p = Pattern.compile("setTimeout\("(.*)\((.*)\)", 200\);"); Matcher m = p.matcher(html); if(m.find()){ function = m.group(1);//函数名 jsfrom = Integer.parseInt(m.group(2));//参数 } JSloadString(Jsoup.parse(html).select("script").html().replace("eval("qo=eval;qo(po);")", "return po"), "jsname"); Object result = invokFunction(function, jsfrom); return 
result.toString(); } /** * 加载js文件 * @param sourceName 名称 * @param classpathURI 文件路径 */ public void loadJS(String sourceName, String classpathURI) { Validate.notNull(context, "context is null"); Validate.notNull(scriptable, "scriptable is null"); String js = null; InputStream inputStream = null; try { inputStream = getClass().getResourceAsStream(classpathURI); js = IOUtils.toString(inputStream, "UTF-8");//设置默认js文件编码为utf-8 } catch (IOException e) { e.printStackTrace(); } finally { IOUtils.closeQuietly(inputStream); } context.evaluateString(scriptable, js, sourceName, 1, null); } /** * 加载js字符串 * @param source js字符串(注意处理js中由于变量名为throws,float类似名称导致的报错) * @param sourceName 名称 */ public void JSloadString(String source, String sourceName){ Validate.notNull(context, "context is null"); Validate.notNull(scriptable, "scriptable is null"); context.evaluateString(scriptable, source, sourceName, 1, null); } } class Client{ public static void close(AutoCloseable close) { if (close != null) { try { close.close(); } catch (Exception e) { e.printStackTrace(); } } } public static CloseableHttpResponse HttpGetResponse(String url) throws IOException, ClientProtocolException { HttpGet httpGet = new HttpGet(URI.create(url)); BasicCookieStore cookieStore = new BasicCookieStore(); HttpClientBuilder builder = HttpClientBuilder.create().disableContentCompression() .setConnectionReuseStrategy(new DefaultConnectionReuseStrategy()).setUserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36"); builder.addInterceptorLast(new RequestAcceptEncoding()); builder.setDefaultCookieStore(cookieStore); CloseableHttpClient client = builder.build(); CloseableHttpResponse execute = client.execute(httpGet); return execute; } public static SoftReference<String> getHtmlString(String url)throws Exception { CloseableHttpResponse execute = null; byte[] binary =null;//初次解析内容 SoftReference<String> result = null; try { execute = HttpGetResponse(url); // 
content = execute.getEntity().getContent(); binary = HttpEntityTOByte(execute.getEntity()); }finally { close(execute); } String html; byte[] decode; try { System.out.println(execute.getStatusLine().toString()); System.out.println(execute.getEntity().getContentEncoding()); Args.notNull(binary, "binary"); decode= decode(binary,execute.getEntity()); try { String charset = getContentCharSet(execute.getEntity().getContentType().getValue()); if (charset != null) { html = new String(decode, Charset.forName(charset)); } else { CharsetMatch match = new CharsetDetector().setText(decode) .detect(); html = match.getString(); } } catch (Exception e) { throw new Exception(e); } result = new SoftReference<String>(html); }finally { binary =null; decode =null; html=null; } return result; } public static SoftReference<Reader> getHtmlReader(String url)throws Exception { CloseableHttpResponse execute = null; byte[] binary =null;//初次解析内容 SoftReference<Reader> result = null; try { execute = HttpGetResponse(url); binary = HttpEntityTOByte(execute.getEntity()); }finally { close(execute); } byte[] decode; Reader bufR = null; try { System.out.println(execute.getStatusLine().toString()); System.out.println(execute.getEntity().getContentEncoding().toString()); Args.notNull(binary, "binary"); decode= decode(binary,execute.getEntity()); bufR= new BufferedReader(new InputStreamReader(new ByteArrayInputStream(decode))); result=new SoftReference<Reader>(bufR); }finally { binary =null; decode =null; //close(bufR); } return result; } private static String getContentCharSet(String contentType) throws ParseException { String charset = null; if (StringUtils.isNotEmpty(contentType)) { String[] strs = contentType.split(";"); for (String string : strs) { if (string.contains("charset")) { String[] tmp = string.split("="); if (tmp.length == 2) { return tmp[1]; } } } } return charset; } public static final int BUFFER = 1024; /** * 数据解压缩 gizp * * @param data * @return * @throws Exception * @author 
http://snowolf.iteye.com/blog/643010 */ public static byte[] decompress(byte[] data) throws Exception { ByteArrayInputStream bais = new ByteArrayInputStream(data); ByteArrayOutputStream baos = new ByteArrayOutputStream(); // 解压缩 decompress(bais, baos); data = baos.toByteArray(); baos.flush(); close(baos); close(bais); // baos.close(); // bais.close(); return data; } /** * 数据解压缩 * * @param is * @param os * @throws Exception */ public static void decompress(InputStream is, OutputStream os) throws Exception { GZIPInputStream gis =null; byte data[]; try { gis = new GZIPInputStream(is); int count; data = new byte[BUFFER]; while ((count = gis.read(data, 0, BUFFER)) != -1) { os.write(data, 0, count); } } finally{ data = null; close(gis); // gis.close(); } } /** * gizp解压 * @param binary * @param res * @param entity * @return * @throws Exception * */ public static byte[] decode(byte[] binary, final HttpEntity entity) throws Exception { if (entity != null && entity.getContentLength() != 0) { final Header ceheader = entity.getContentEncoding(); if (ceheader != null) { final HeaderElement[] codecs = ceheader.getElements(); for (final HeaderElement codec : codecs) { final String codecname = codec.getName().toLowerCase(Locale.US); if ("gzip".equals(codecname) || "x-gzip".equals(codecname)) { return decompress(binary); } else if ("deflate".equals(codecname)) { return binary; } else if ("identity".equals(codecname)) { /* Don't need to transform the content - no-op */ return binary; } else { throw new Exception("Unsupported Content-Coding: "+codecname ); } } } } return binary; } /** * 将HttpEntity转换成byte数组 * @param entity HttpEntity * @return byte[] * @throws IOException * @author EntityUtils.toByteArray(entity) */ public static byte[] HttpEntityTOByte(HttpEntity entity) throws IOException{ final InputStream instream = entity.getContent(); if (instream == null) { return null; } try { Args.check(entity.getContentLength() <= Integer.MAX_VALUE, "HTTP entity too large to be buffered in 
memory"); int i = (int)entity.getContentLength(); if (i < 0) { i = 4096; } final ByteArrayBuffer buffer = new ByteArrayBuffer(i); final byte[] tmp = new byte[4096]; int l; while((l = instream.read(tmp)) != -1) { buffer.append(tmp, 0, l); } return buffer.toByteArray(); } finally { instream.close(); } } }