最近在学习用Java来做爬虫,但是发现不管用哪种方式,爬取到的代码都比网页的源码少了很多。
在网上查了很多资料,都说是InputStream的缓冲区太小而爬取的网页太大,导致读取出来的网页代码不完整,但是后来发现并不是这个问题。
这个是用HttpClient做的:
public static String getHtml2(String url) { try { HttpGet httpRequest = new HttpGet(url); HttpClient httpclient = new DefaultHttpClient(); HttpResponse httpResponse = httpclient.execute(httpRequest); if (httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { InputStream input = httpResponse.getEntity().getContent(); byte[] b = new byte[1024]; int len = 0; StringBuffer buff = new StringBuffer(); while ((len = input.read(b)) != -1) { buff.append(new String(b)); } return buff.toString(); // 使用如下代码只返回40K // return EntityUtils.toString(httpResponse.getEntity(),"UTF-8"); } }catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; }
这个是用HttpURLConnection做的:
//使用HttpURLConnection获取网页内容 public static String getHtml(String url) {//获取网页内容 StringBuffer html=new StringBuffer(); if(!url.startsWith("http")) { url="https://"+url; } InputStreamReader inReader=null; BufferedReader bReader=null; HttpURLConnection htcon=null; try { URL u=new URL(url); //设置请求头为获取与源码一样的代码 htcon=(HttpURLConnection)u.openConnection(); htcon.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"); htcon.setReadTimeout(2000);//设置读取超时 htcon.setRequestMethod("POST");//设置请求方式 htcon.setConnectTimeout(2000);//设置连接超时 if(htcon.getResponseCode() == 200) {//如果页面响应的话 inReader=new InputStreamReader(htcon.getInputStream(),"utf-8");//获得页面的输入流 bReader=new BufferedReader(inReader); String line=""; while((line=bReader.readLine())!=null) { html.append(line); html.append(" "); } } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { try { if(inReader!=null) { inReader.close(); } if(bReader!=null) { bReader.close(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } return html.toString(); }
在线等解决方法,或等更新