• 我自己随便写的爬虫


    有关爬虫,自己半年前写了一个,写过就忘记了,今天才看到

      1 import java.io.IOException;
      2 import java.net.URISyntaxException;
      3 import java.nio.charset.StandardCharsets;
      4 import java.util.ArrayList;
      5 import java.util.Date;
      6 import java.util.HashMap;
      7 import java.util.List;
      8 import java.util.Map;
      9 
     10 import org.apache.commons.lang.time.DateFormatUtils;
     11 import org.apache.http.HttpEntity;
     12 import org.apache.http.ParseException;
     13 import org.apache.http.client.ClientProtocolException;
     14 import org.apache.http.client.methods.CloseableHttpResponse;
     15 import org.apache.http.client.methods.HttpPost;
     16 import org.apache.http.client.utils.URIBuilder;
     17 import org.apache.http.entity.StringEntity;
     18 import org.apache.http.impl.client.CloseableHttpClient;
     19 import org.apache.http.impl.client.HttpClientBuilder;
     20 import org.apache.http.util.EntityUtils;
     21 import org.apache.xerces.util.URI;
     22 import org.jsoup.Jsoup;
     23 import org.jsoup.nodes.Document;
     24 import org.jsoup.nodes.Element;
     25 import org.jsoup.select.Elements;
     26 import org.quartz.Job;
     27 import org.quartz.JobExecutionContext;
     28 import org.quartz.JobExecutionException;
     29 
     30 /*
     31  * @说明
     32  * 由于针对网页取数据,1页有100条数据,而且数据一直是变化的;
     33  * 目前设计是每5分钟抓取一次,所以抓一次,存一次,之前的数据仍旧保留,但是只抓第一页数据。
     34  * 其他排名靠后的数据就不抓了
     35  * 默认排序为 totalvolpct 总成交占比
     36  * 
     37  * 
     38  *                     <th>名称</th>
     39                     <th><a href="#" onclick="return window['sortTable']('symbol', ' ');">代码</a> </th>
     40 <th><a href="#" onclick="return window['sortTable']('totalvol', ' ');">总成交量(万股)</a> </th>
     41 <td><a href="#" onclick="return window['sortTable']('totalvolpct', ' ');">总成交量占比</a> </td>  ----百分比,入库时是去掉百分号入库的
     42 <td><a href="#" onclick="return window['sortTable']('totalamt', ' ');">总成交额(万元)</a> </td>
     43 <td><a href="#" onclick="return window['sortTable']('totalamtpct', ' ');">总成交额占比</a> </td>----百分比,入库时是去掉百分号入库的
     44 <td><a href="#" onclick="return window['sortTable']('avgprice', ' ');">平均成交价(元)</a> </td>
     45 <td><a href="#" onclick="return window['sortTable']('kuvolume', '↓');">主买量(万股)</a>↓</td>
     46 <td><a href="#" onclick="return window['sortTable']('kevolume', ' ');">中性量(万股)</a> </td>
     47 <td><a href="#" onclick="return window['sortTable']('kdvolume', ' ');">主卖量(万股)</a> </td>
     48                     <th>详情 </th>
     49 
     50 
     51 建数据库表时,字段顺序一定要按照上面的顺序来建,否则会有问题
     52 
     53  * 
     54  * 
     55 */
     56 
     57 
     58 public class Crawler implements Job{
     59 
     60     
     61     private String url="http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1";
     62     private String encode="UTF-8";
     63     
     64     public String getUrlData() {
     65 
     66         String out=new String();
     67         
     68         //---大单分析---
     69         //---http://vip.stock.finance.sina.com.cn/quotes_service/view/cn_bill_sum.php?num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=2
     70 
     71         // 获得Http客户端(可以理解为:你得先有一个浏览器;注意:实际上HttpClient与浏览器是不一样的)
     72                 CloseableHttpClient httpClient = HttpClientBuilder.create().build();
     73          
     74 //                // 创建Post请求
     75 
     76                 HttpPost httpPost = new  HttpPost(url);
     77          
     78                 //---下面这句话暂时没有起作用,不知道原因;其实参数是可以不用放在上面的httpPost对象中的
     79                 StringEntity entity = new StringEntity("num=100&sort=totalvolpct&asc=0&volume=40000&type=0&dpc=1&page=1", encode);
     80          
     81                 // post请求是将参数放在请求体里面传过去的;这里将entity放入post请求体中
     82                 httpPost.setEntity(entity);
     83          
     84                 httpPost.setHeader("Content-Type", "text/html;charset=utf8");
     85          
     86                 // 响应模型
     87                 CloseableHttpResponse response = null;
     88                 try {
     89                     // 由客户端执行(发送)Post请求
     90                     response = httpClient.execute(httpPost);
     91                     // 从响应模型中获取响应实体
     92                     HttpEntity responseEntity = response.getEntity();
     93          
     94                     System.out.println("响应状态为:" + response.getStatusLine());
     95                     if (responseEntity != null) {
     96                         System.out.println("响应内容长度为:" + responseEntity.getContentLength());
     97                         //System.out.println("响应内容为:" + EntityUtils.toString(responseEntity,"GBK"));
     98                         out=EntityUtils.toString(responseEntity,"GBK");
     99                     }
    100                 } catch (ClientProtocolException e) {
    101                     e.printStackTrace();
    102                 } catch (ParseException e) {
    103                     e.printStackTrace();
    104                 } catch (IOException e) {
    105                     e.printStackTrace();
    106                 } finally {
    107                     try {
    108                         // 释放资源
    109                         if (httpClient != null) {
    110                             httpClient.close();
    111                         }
    112                         if (response != null) {
    113                             response.close();
    114                         }
    115                     } catch (IOException e) {
    116                         e.printStackTrace();
    117                     }
    118                 }
    119         
    120         
    121         
    122         return out;
    123 
    124     }
    125 
    126     
    127     //---想直接跳过字符串处理,暂时没成功---//
    128     public void DealUrlString(String inStr,org.springframework.jdbc.core.JdbcTemplate db) {
    129         
    130         String out=new String();
    131         
    132         Document doc = Jsoup.parseBodyFragment(inStr);
    133         
    134         Element et = doc.getElementById("divListTemplate");
    135         Elements et_tab = et.getElementsByTag("table");
    136     
    137         Elements trs = et_tab.first().getElementsByTag("tr");
    138         
    139         
    140         System.out.println("====size===="+trs.size());  
    141         
    142         
    143         int n=0;
    144         
    145         
    146         
    147         for (Element element : trs) {
    148             if(n==0) {
    149                 n++;
    150             }
    151             else {
    152                 StringBuffer insert_sql=new StringBuffer();
    153                 
    154                 insert_sql.append("insert into stock_bigdeal_analyse (cn_name,symbol,totalvol,totalvolpct,totalamt,totalamtpct,avgprice,kuvolume,kevolume,kdvolume,input_time) values ( ");
    155                 
    156                 Elements ele_ths= element.getElementsByTag("th");
    157                 insert_sql.append(  "'" + ele_ths.get(0).text().trim()+"', ");
    158                 insert_sql.append(  "'" + ele_ths.get(1).text().replaceAll(" ", "")+"', ");//--看不见的特殊符号,
    159                 
    160                 
    161                 Elements ele_tds= element.getElementsByTag("td");
    162               
    163                 insert_sql.append( "'" + ele_tds.get(0).text().trim().replaceAll(" ", "").replaceAll(",", "")+"', ");
    164                 insert_sql.append(  "'" + ele_tds.get(1).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',");
    165                 insert_sql.append(  "'" + ele_tds.get(2).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
    166                 insert_sql.append(  "'" + ele_tds.get(3).text().trim().replaceAll(" ", "").replaceAll("%", "")+"',");
    167                 insert_sql.append(  "'" + ele_tds.get(4).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
    168                 insert_sql.append(  "'" + ele_tds.get(5).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
    169                 insert_sql.append(  "'" + ele_tds.get(6).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
    170                 insert_sql.append(  "'" + ele_tds.get(7).text().trim().replaceAll(" ", "").replaceAll(",", "")+"',");
    171                 
    172                 insert_sql.append(  "'" +DateFormatUtils.format(new Date(),"yyyyMMddHHmmssSSS")+"') ");
    173                 
    174               //  System.out.println(insert_sql);
    175                 
    176                 db.execute(insert_sql.toString());
    177                 
    178             }
    179         }
    180         
    181         
    182         
    183     }
    184     
    185     
    186     
    187     public  static void main(String args[]) {
    188         
    189         Crawler cr= new Crawler();
    190         
    191         SpringDb sd = new SpringDb();
    192         org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc();
    193         
    194         String data_in = cr.getUrlData();
    195         cr.DealUrlString(data_in,db);
    196         
    197         
    198     }
    199 
    200 
    201     @Override
    202     public void execute(JobExecutionContext arg0) throws JobExecutionException {
    203 
    204         Crawler cr= new Crawler();
    205         
    206         SpringDb sd = new SpringDb();
    207         org.springframework.jdbc.core.JdbcTemplate db= sd.getJdbc();
    208         
    209         String data_in = cr.getUrlData();
    210         cr.DealUrlString(data_in,db);
    211         
    212     }
    213     
    214     
    215     
    216     
    217     
    218 }
  • 相关阅读:
    基于mAppWidget实现手绘地图--索引&DEMO
    C语言数据结构----栈的定义及实现
    libvirt命令行文档
    清理系统方法
    Linux 经典电子书共享下载
    使用数组实现队列----《数据结构与算法分析---C语言描述》
    清理系统垃圾
    epoll的内部实现 & 百万级别句柄监听 & lt和et模式非常好的解释
    进程、线程、socket套接字-资源大小 & 切换代价
    网络编程学习-面向工资编程
  • 原文地址:https://www.cnblogs.com/alexgl2008/p/12201839.html
Copyright © 2020-2023  润新知