关于爬虫:半年前自己写过一个,写完就忘了,今天才又看到。
1 import java.io.IOException; 2 import java.net.URISyntaxException; 3 import java.nio.charset.StandardCharsets; 4 import java.util.ArrayList; 5 import java.util.Date; 6 import java.util.HashMap; 7 import java.util.List; 8 import java.util.Map; 9 10 import org.apache.commons.lang.time.DateFormatUtils; 11 import org.apache.http.HttpEntity; 12 import org.apache.http.ParseException; 13 import org.apache.http.client.ClientProtocolException; 14 import org.apache.http.client.methods.CloseableHttpResponse; 15 import org.apache.http.client.methods.HttpPost; 16 import org.apache.http.client.utils.URIBuilder; 17 import org.apache.http.entity.StringEntity; 18 import org.apache.http.impl.client.CloseableHttpClient; 19 import org.apache.http.impl.client.HttpClientBuilder; 20 import org.apache.http.util.EntityUtils; 21 import org.apache.xerces.util.URI; 22 import org.jsoup.Jsoup; 23 import org.jsoup.nodes.Document; 24 import org.jsoup.nodes.Element; 25 import org.jsoup.select.Elements; 26 import org.quartz.Job; 27 import org.quartz.JobExecutionContext; 28 import org.quartz.JobExecutionException; 29 30 /* 31 * @说明 32 * 由于针对网页取数据,1页有100条数据,二期数据一直是变化的; 33 * 目前设计是每5分钟抓取一次,所以抓一次,存一次,之前的数据仍旧保留,但是只抓第一页数据。 34 * 其他排名靠后的数据就不抓了 35 * 默认排序为 totalvolpct 总成交占比 36 * 37 * 38 * <th>名称</th> 39 <th><a href="#" onclick="return window['sortTable']('symbol', ' ');">代码</a> </th> 40 <th><a href="#" onclick="return window['sortTable']('totalvol', ' ');">总成交量(万股)</a> </th> 41 <td><a href="#" onclick="return window['sortTable']('totalvolpct', ' ');">总成交量占比</a> </td> ----百分比,入库时是去掉百分号入库的 42 <td><a href="#" onclick="return window['sortTable']('totalamt', ' ');">总成交额(万元)</a> </td> 43 <td><a href="#" onclick="return window['sortTable']('totalamtpct', ' ');">总成交额占比</a> </td>----百分比,入库时是去掉百分号入库的 44 <td><a href="#" onclick="return window['sortTable']('avgprice', ' ');">平均成交价(元)</a> </td> 45 <td><a href="#" onclick="return window['sortTable']('kuvolume', '↓');">主买量(万股)</a>↓</td> 46 <td><a href="#" onclick="return 
window['sortTable']('kevolume', ' ');">中性量(万股)</a> </td> 47 <td><a href="#" onclick="return window['sortTable']('kdvolume', ' ');">主卖量(万股)</a> </td> 48 <th>详情 </th> 49 50 51 建数据库表时,字段顺序一定要按照上面的顺序来建,否则会有问题 52 53 * 54 * 55 */ 56 57 58 public class Crawler implements Job{ 59 60 61 private String url="http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1"; 62 private String encode="UTF-8"; 63 64 public String getUrlData() { 65 66 String out=new String(); 67 68 //---大单分析--- 69 //---http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=2 70 71 // 获得Http客户端(可以理解为:你得先有一个浏览器;注意:实际上HttpClient与浏览器是不一样的) 72 CloseableHttpClient httpClient = HttpClientBuilder.create().build(); 73 74 // // 创建Post请求 75 76 HttpPost httpPost = new HttpPost(url); 77 78 //---下面这句话暂时没有起作用,不知道原因;其实参数是可以不用放在上面的httpPost对象中的 79 StringEntity entity = new StringEntity("num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1", encode); 80 81 // post请求是将参数放在请求体里面传过去的;这里将entity放入post请求体中 82 httpPost.setEntity(entity); 83 84 httpPost.setHeader("Content-Type", "text/html;charset=utf8"); 85 86 // 响应模型 87 CloseableHttpResponse response = null; 88 try { 89 // 由客户端执行(发送)Post请求 90 response = httpClient.execute(httpPost); 91 // 从响应模型中获取响应实体 92 HttpEntity responseEntity = response.getEntity(); 93 94 System.out.println("响应状态为:" + response.getStatusLine()); 95 if (responseEntity != null) { 96 System.out.println("响应内容长度为:" + responseEntity.getContentLength()); 97 //System.out.println("响应内容为:" + EntityUtils.toString(responseEntity,"GBK")); 98 out=EntityUtils.toString(responseEntity,"GBK"); 99 } 100 } catch (ClientProtocolException e) { 101 e.printStackTrace(); 102 } catch (ParseException e) { 103 e.printStackTrace(); 104 } catch (IOException e) { 105 e.printStackTrace(); 106 } finally { 107 try { 108 // 释放资源 109 if (httpClient != null) { 110 
httpClient.close(); 111 } 112 if (response != null) { 113 response.close(); 114 } 115 } catch (IOException e) { 116 e.printStackTrace(); 117 } 118 } 119 120 121 122 return out; 123 124 } 125 126 127 //---想直接跳过字符串处理,暂时没成功---// 128 public void DealUrlString(String inStr,org.springframework.jdbc.core.JdbcTemplate db) { 129 130 String out=new String(); 131 132 Document doc = Jsoup.parseBodyFragment(inStr); 133 134 Element et = doc.getElementById("divListTemplate"); 135 Elements et_tab = et.getElementsByTag("table"); 136 137 Elements trs = et_tab.first().getElementsByTag("tr"); 138 139 140 System.out.println("====size===="+trs.size()); 141 142 143 int n=0; 144 145 146 147 for (Element element : trs) { 148 if(n==0) { 149 n++; 150 } 151 else { 152 StringBuffer insert_sql=new StringBuffer(); 153 154 insert_sql.append("insert into stock_bigdeal_analyse (cn_name,symbol,totalvol,totalvolpct,totalamt,totalamtpct,avgprice,kuvolume,kevolume,kdvolume,input_time) values ( "); 155 156 Elements ele_ths= element.getElementsByTag("th"); 157 insert_sql.append( "'" + ele_ths.get(0).text().trim()+"', "); 158 insert_sql.append( "'" + ele_ths.get(1).text().replaceAll(" ", "")+"', ");//--看不见的特殊符号, 159 160 161 Elements ele_tds= element.getElementsByTag("td"); 162 163 insert_sql.append( "'" + ele_tds.get(0).text().trim().replaceAll(" ", "").replaceAll(",", "")+"', "); 164 insert_sql.append( "'" + ele_tds.get(1).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',"); 165 insert_sql.append( "'" + ele_tds.get(2).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 166 insert_sql.append( "'" + ele_tds.get(3).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',"); 167 insert_sql.append( "'" + ele_tds.get(4).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 168 insert_sql.append( "'" + ele_tds.get(5).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 169 insert_sql.append( "'" + ele_tds.get(6).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 170 
insert_sql.append( "'" + ele_tds.get(7).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',"); 171 172 insert_sql.append( "'" +DateFormatUtils.format(new Date(),"yyyyMMddHHmmssSSS")+"') "); 173 174 // System.out.println(insert_sql); 175 176 db.execute(insert_sql.toString()); 177 178 } 179 } 180 181 182 183 } 184 185 186 187 public static void main(String args[]) { 188 189 Crawler cr= new Crawler(); 190 191 SpringDb sd = new SpringDb(); 192 org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc(); 193 194 String data_in = cr.getUrlData(); 195 cr.DealUrlString(data_in,db); 196 197 198 } 199 200 201 @Override 202 public void execute(JobExecutionContext arg0) throws JobExecutionException { 203 204 Crawler cr= new Crawler(); 205 206 SpringDb sd = new SpringDb(); 207 org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc(); 208 209 String data_in = cr.getUrlData(); 210 cr.DealUrlString(data_in,db); 211 212 } 213 214 215 216 217 218 }