• java网页数据抓取实例


    网页上面数据如下:

    如果想要获取上图所示网页的数据,代码如下:

    (1)调度类,主要调用工具类中的方法获取数据并入库

    package com.jointsky.jointframe.weather.jobservice;
    
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.commons.lang.StringUtils;
    import org.springframework.transaction.annotation.Transactional;
    
    import com.jointsky.jointframe.scheduler.exception.ExecutionException;
    import com.jointsky.jointframe.scheduler.quartz.JobService;
    import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;
    import com.jointsky.jointframe.weather.entity.ActuallyWeather;
    import com.jointsky.jointframe.weather.service.ActuallyForecastWeatherManager;
    import com.jointsky.jointframe.weather.service.ActuallyWeatherManager;
    import com.jointsky.jointframe.weather.utils.UrlInfo;
    
    /**
     * Scheduler job that imports live weather data for every configured county.
     *
     * <p>For each county code returned by {@link UrlInfo#getCountyCodes()} the job
     * builds the page URL, asks {@link UrlInfo} for the same-day and the forecast
     * records, stamps them with the city/county names and then updates or saves
     * each record, depending on whether {@code findIdByParams} returns an id for
     * the record's forecast time and place name.</p>
     *
     * <p>A failure while processing one county is reported and does not stop the
     * remaining counties from being processed.</p>
     */
    @Transactional
    public class ActuallyWeatherJobService implements JobService{

        /**
         * Manager used to look up, update and save same-day weather records.
         */
        private ActuallyWeatherManager actuallyWeatherManager;

        /**
         * Same-day weather entity, reachable through its getter/setter only.
         * {@link #execute(Map)} works on local variables and does not touch it.
         */
        private ActuallyWeather actuallyWeather;

        /**
         * Manager used to look up, update and save forecast weather records.
         */
        private ActuallyForecastWeatherManager actuallyForecastWeatherManager;

        /**
         * Forecast weather entity, reachable through its getter/setter only.
         * {@link #execute(Map)} works on local variables and does not touch it.
         */
        private ActuallyForecastWeather actuallyForecastWeather;

        /**
         * Parameter map, reachable through its getter/setter only.
         * {@link #execute(Map)} builds a fresh lookup map per record instead of
         * reusing this field, so a failed lookup cannot leave stale keys behind.
         */
        private Map<String, Object> map = new HashMap<String, Object>();

        /**
         * Runs the import for all configured county codes.
         *
         * @param arg0 scheduler arguments; not read by this job
         * @throws ExecutionException declared by the {@link JobService} contract;
         *         per-county failures are caught and reported, not rethrown
         */
        @Override
        public void execute(Map<String, Object> arg0) throws ExecutionException {
            System.out.println("实况天气资料数据获取调度成功");
            String configuredCodes = UrlInfo.getCountyCodes();
            // The code list is settable at runtime, so it may be null or empty.
            if (StringUtils.isEmpty(configuredCodes)) {
                return;
            }
            for (String countyCode : configuredCodes.split(",")) {
                if (StringUtils.isEmpty(countyCode)) {
                    continue;
                }
                try {
                    importCounty(countyCode);
                } catch (Exception e) {
                    // Best effort: report which county failed and carry on with the next one.
                    System.err.println("Failed to import weather data for county code " + countyCode);
                    e.printStackTrace();
                }
            }
        }

        /**
         * Downloads, parses and stores the same-day and forecast records of one county.
         *
         * @param countyCode non-empty county code
         * @throws Exception if the page cannot be fetched or parsed, or a manager call fails
         */
        private void importCounty(String countyCode) throws Exception {
            String url = UrlInfo.getUrl(countyCode);
            Map<String, Object> district = UrlInfo.getDistrict(countyCode);
            // City-level name
            String cityLevel = (String) district.get("cityLevel");
            // County-level name
            String countyLevel = (String) district.get("countyLevel");
            // Both lists are fetched before anything is written, as before.
            List<ActuallyWeather> todayRecords = UrlInfo.getURLInfoOfActully(url,"utf-8");
            List<ActuallyForecastWeather> forecastRecords = UrlInfo.getURLInfoOfForecast(url, "utf-8");

            for (ActuallyWeather record : todayRecords) {
                record.setCityLevel(cityLevel);
                record.setCountyLevel(countyLevel);
                storeActually(record);
            }
            for (ActuallyForecastWeather record : forecastRecords) {
                record.setCityLevel(cityLevel);
                record.setCountyLevel(countyLevel);
                storeForecast(record);
            }
        }

        /**
         * Updates the same-day record when an id is found for it, otherwise saves it.
         */
        private void storeActually(ActuallyWeather record) {
            Map<String, Object> params = buildLookupParams(record.getForecastTime(), record.getPlaceName());
            String existingId = actuallyWeatherManager.findIdByParams(params);
            if (StringUtils.isNotEmpty(existingId)) {
                record.setId(existingId);
                actuallyWeatherManager.updateWeather(record);
            } else {
                actuallyWeatherManager.save(record);
            }
        }

        /**
         * Updates the forecast record when an id is found for it, otherwise saves it.
         */
        private void storeForecast(ActuallyForecastWeather record) {
            Map<String, Object> params = buildLookupParams(record.getForecastTime(), record.getPlaceName());
            String existingId = actuallyForecastWeatherManager.findIdByParams(params);
            if (StringUtils.isNotEmpty(existingId)) {
                record.setId(existingId);
                actuallyForecastWeatherManager.updateForecastWeather(record);
            } else {
                actuallyForecastWeatherManager.save(record);
            }
        }

        /**
         * Builds the lookup parameters for {@code findIdByParams}; empty values are left out.
         */
        private static Map<String, Object> buildLookupParams(String forecastTime, String placeName) {
            Map<String, Object> params = new HashMap<String, Object>();
            // Forecast time
            if (StringUtils.isNotEmpty(forecastTime)) {
                params.put("forecastTime", forecastTime);
            }
            // Place name
            if (StringUtils.isNotEmpty(placeName)) {
                params.put("placeName", placeName);
            }
            return params;
        }

        public ActuallyWeatherManager getActuallyWeatherManager() {
            return actuallyWeatherManager;
        }

        public void setActuallyWeatherManager(
                ActuallyWeatherManager actuallyWeatherManager) {
            this.actuallyWeatherManager = actuallyWeatherManager;
        }

        public ActuallyWeather getActuallyWeather() {
            return actuallyWeather;
        }

        public void setActuallyWeather(ActuallyWeather actuallyWeather) {
            this.actuallyWeather = actuallyWeather;
        }

        public Map<String, Object> getMap() {
            return map;
        }

        public void setMap(Map<String, Object> map) {
            this.map = map;
        }

        public ActuallyForecastWeatherManager getActuallyForecastWeatherManager() {
            return actuallyForecastWeatherManager;
        }

        public void setActuallyForecastWeatherManager(
                ActuallyForecastWeatherManager actuallyForecastWeatherManager) {
            this.actuallyForecastWeatherManager = actuallyForecastWeatherManager;
        }

        public ActuallyForecastWeather getActuallyForecastWeather() {
            return actuallyForecastWeather;
        }

        public void setActuallyForecastWeather(
                ActuallyForecastWeather actuallyForecastWeather) {
            this.actuallyForecastWeather = actuallyForecastWeather;
        }

    }
    View Code

    (2)工具类,主要为一些执行查询数据的实现方法

    package com.jointsky.jointframe.weather.utils;
    
    import java.io.BufferedReader;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.commons.lang.StringUtils;
    
    import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;
    import com.jointsky.jointframe.weather.entity.ActuallyWeather;
    /**
     * <p>Description:实况天气资料工具类</p>*/
    public class UrlInfo {
        
        /**
         *  生成一个Pattern,同时编译一个正则表达式 
         */
        private static Pattern proInfo = Pattern.compile("<font>(.*?)</font>", Pattern.DOTALL);
        
        /**
         * 宁夏区县编码(总乡镇数190)
         * 银川{市辖区(11个乡镇):53614;贺兰县(7):53610;永宁县(8):53618;灵武市(8):53619}
         */
        private static String countyCodes = "53614,53610,53618,53619";
            
        /**
         * 获取实况天气(当天)数据的方法
         * @param urlInfo
         * @param charset
         * @return
         * @throws Exception
         */
        public static List<ActuallyWeather> getURLInfoOfActully(String urlInfo,String charset) throws Exception {
            String info = getUrlInfo(urlInfo);
            //获得网页源码(0是当天)
            return getDataStructure(info,0);
        }
        
        /**
         * 获取实况天气(预报)数据的方法
         * @param urlInfo
         * @param charset
         * @return
         * @throws Exception
         */
        public static List<ActuallyForecastWeather> getURLInfoOfForecast(String urlInfo,String charset) throws Exception {
            String info = getUrlInfo(urlInfo);
            //获得网页源码(1是预报)
            return getDataStructure(info,1);
        }
        
        /**
         * 网页信息
         * @param urlInfo
         * @return
         * @throws Exception
         */
        public static String getUrlInfo(String urlInfo) throws Exception {
            //读取目的网页URL地址,获取网页源码
            URL url = new URL(urlInfo);
            HttpURLConnection httpUrl = (HttpURLConnection)url.openConnection();
            InputStream is = httpUrl.getInputStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(is,"utf-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                //这里是对链接进行处理
                line = line.replaceAll("</?a[^>]*>", "");
                //这里是对样式进行处理
                line = line.replaceAll("<(\w+)[^>]*>", "<$1>");
                sb.append(line);
            }
            is.close();
            br.close();
            return sb.toString().trim();
        }
        
        private static List getDataStructure(String str,int j) {
            //运用正则表达式对获取的网页源码进行数据匹配,提取我们所要的数据,在以后的过程中,我们可以采用httpclient+jsoup,
            //现在暂时运用正则表达式对数据进行抽取提取
            //String[] info = str.split("</li>");
            SimpleDateFormat sf = new SimpleDateFormat("HH");
            Date dateTime = new Date();
            String hour = sf.format(dateTime);
            Integer h = Integer.parseInt(hour);
            int t = 0;
            //如果十二点之前当天会有四个时间段模块(今天上午6~12;今天下午12~18;今天前半夜18~24;今天后半夜次日00~06)
            if (h<=12) {
                t=4;
            //如果十二点之后十八点之前当天会有三个时间段模块(今天下午12~18;今天前半夜18~24;今天后半夜次日00~06)
            }else if (12<h&&h<=18) {
                t=3;
            //如果十八点之后当天会有两个时间段模块(今天前半夜18~24;今天后半夜次日00~06)
            }else if(h>18) {
                t=2;
            }
            String[] info = str.split("<th>");
            List<ActuallyWeather> list_actually = new ArrayList<ActuallyWeather>();
            List<ActuallyForecastWeather> list_forecast = new ArrayList<ActuallyForecastWeather>();
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            //当t的值是几的时候相应的当天的天气数据就还剩下几个模块,k就是用来控制第几个模块数据的参数
            int k = 0;
            for (String s : info) {
                //这个Pattern对象将会使用matcher()方法来生成一个Matcher实例,接着便可以使用该 Matcher实例以编译的正则表达式为基础对目标字符串进行匹配工作,多个Matcher是可以共用一个Pattern对象的。
                Matcher m = proInfo.matcher(s);
                ActuallyWeather actually = null;
                ActuallyForecastWeather forecast = null;
                //使用find()方法查找第一个匹配的对象 
                if (m.find()) {
                    actually = new ActuallyWeather();
                    forecast = new ActuallyForecastWeather();
                    //返回与组匹配的子串内容
                    String[] ss = m.group(1).trim().replace(" ", "").split(">");
                    
                    if ("风力".equals(ss[0])) {
                        k++;
                        String[] strsss = s.split("<td>");
                        int i = 0;
                        if (k<=t&&j==0) {
                            actually = new ActuallyWeather();
                            for (String ss1 : strsss) {
                                Matcher mm = proInfo.matcher(ss1);
                                if (mm.find()) {
                                    //设置产品型号
                                    String[] sss = mm.group(1).trim().replace(" ", "").split(">");
                                    if (i%4==1) {
                                        actually.setPlaceName(sss[0]);
                                    }else if (i%4==2) {
                                        actually.setWeatherStatus(sss[0]);
                                    }else if (i%4==3) {
                                        String temp = sss[0];
                                        String[] temps = temp.split("/");
                                        actually.setMaxTemperature(temps[0]+"℃");
                                        actually.setMinTemperature(temps[1]);
                                    }else if (i%4==0&&i!=0) {
                                        actually.setWindPower(sss[0]);
                                    }
                                }
                                if (i%4==0&&i!=0) {
                                    Date date = new Date();
                                    //发布日期
                                    actually.setPubTime(sdf.format(date));
                                    //前四模块数据从当天早上七点开始加六个小时
                                    //date = new Date(date.getTime() + (k-1)*21600000);
                                    int p = 0;
                                    //t是当日数据剩余次数;k是当前循环次数
                                    if ((t-k)==3) {
                                        p = 6;
                                    }else if ((t-k)==2) {
                                        p = 12;
                                    }else if ((t-k)==1) {
                                        p = 18;
                                    }else if ((t-k)==0) {
                                        p = 24;
                                    }
                                    
                                    //次日凌晨
                                    if (24==p) {
                                        Date time = new Date(date.getTime() + 86400000);
                                        actually.setForecastTime(sdf.format(time)+" 00");
                                    }else if (p<10) {
                                        actually.setForecastTime(sdf.format(date)+" 0"+p);
                                    }else if (p>10&&p!=24) {
                                        actually.setForecastTime(sdf.format(date)+" "+p);
                                    }
                                    list_actually.add(actually);
                                    actually=new ActuallyWeather();
                                }
                                
                                i++;
                            }
                        }else if (k>t&&j==1) {
                            forecast = new ActuallyForecastWeather();
                            for (String ss1 : strsss) {
                                Matcher mm = proInfo.matcher(ss1);
                                if (mm.find()) {
                                    //设置产品型号
                                    String[] sss = mm.group(1).trim().replace(" ", "").split(">");
                                    if (i%4==1) {
                                        forecast.setPlaceName(sss[0]);
                                    }else if (i%4==2) {
                                        forecast.setWeatherStatus(sss[0]);
                                    }else if (i%4==3) {
                                        String temp = sss[0];
                                        String[] temps = temp.split("/");
                                        forecast.setMaxTemperature(temps[0]+"℃");
                                        forecast.setMinTemperature(temps[1]);
                                    }else if (i%4==0&&i!=0) {
                                        forecast.setWindPower(sss[0]);
                                    }
                                }
                                if (i%4==0&&i!=0) {
                                    Date date = new Date();
                                    //发布日期
                                    forecast.setPubTime(sdf.format(date));
                                    //从第五个模块数据开始数据是从次日凌晨开始每下一组比上一组晚24小时
                                    date = new Date(date.getTime() + (k-t)*86400000);
                                    forecast.setForecastTime(sdf.format(date)+" 00");
                                    list_forecast.add(forecast);
                                    forecast=new ActuallyForecastWeather();
                                }
                                
                                i++;
                            }
                        }
    
                        
                    }
                }
            }
            if (0==j) {
                return list_actually;
            }else if (1==j) {
                return list_forecast;
            }
            return null;
        }
    
        /**
         * 生成url
         * @param countyCode
         * @return
         * @throws Exception
         */
        public static String getUrl(String countyCode) throws Exception {
            String url = "http://3g.nx121.com/pc/tqybxzb.aspx";
            if (StringUtils.isNotEmpty(countyCode)) {
                url = url + "?sd="+countyCode;
            }
            return url;
        }
        
        /**
         * 根据城市编码查询所属市级和区县级行政区
         * @param countyCode
         * @return
         * @throws Exception
         */
        public static Map<String, Object> getDistrict(String countyCode) throws Exception {
            
            Map<String, Object> map = new HashMap<String, Object>();
            if ("53614".equals(countyCode)) {
                //市级行政区
                map.put("cityLevel", "银川市");
                //区县级行政区
                map.put("countyLevel", "市辖区");
            }else if ("53610".equals(countyCode)) {
                //市级行政区
                map.put("cityLevel", "银川市");
                //区县级行政区
                map.put("countyLevel", "贺兰县");
            }else if ("53618".equals(countyCode)) {
                //市级行政区
                map.put("cityLevel", "银川市");
                //区县级行政区
                map.put("countyLevel", "永宁县");
            }else if ("53619".equals(countyCode)) {
                //市级行政区
                map.put("cityLevel", "银川市");
                //区县级行政区
                map.put("countyLevel", "灵武市");
            }        
            return map;
        }
        
        public static Pattern getProInfo() {
            return proInfo;
        }
    
        public static void setProInfo(Pattern proInfo) {
            UrlInfo.proInfo = proInfo;
        }
    
        public static String getCountyCodes() {
            return countyCodes;
        }
    
        public static void setCountyCodes(String countyCodes) {
            UrlInfo.countyCodes = countyCodes;
        }
        
    }
    View Code

    (3)实体类,用于存放数据的bean

    package com.jointsky.jointframe.weather.entity;
    
    import javax.persistence.Column;
    import javax.persistence.Entity;
    import javax.persistence.Table;
    
    import org.apache.commons.lang.builder.EqualsBuilder;
    import org.apache.commons.lang.builder.HashCodeBuilder;
    import org.apache.commons.lang.builder.ToStringBuilder;
    import org.apache.commons.lang.builder.ToStringStyle;
    import org.hibernate.annotations.Cache;
    import org.hibernate.annotations.CacheConcurrencyStrategy;
    
    import com.jointsky.jointframe.core.entity.IdEntity;
    
    /**
     * Entity for a same-day live weather record, mapped to the table
     * {@code t_actually_weather}. Column mappings are declared on the getters.
     * Equality and hash code are based on the inherited {@code id} only.
     */
    @Entity
    @Table(name = "t_actually_weather")
    @Cache(usage = CacheConcurrencyStrategy.READ_WRITE)
    public class ActuallyWeather extends IdEntity {

        private static final long serialVersionUID = -5324072662712469478L;

        /** City-level district name. */
        private String cityLevel;

        /** County-level district name. */
        private String countyLevel;

        /** Publication time. */
        private String pubTime;

        /** Forecast time. */
        private String forecastTime;

        /** Place name. */
        private String placeName;

        /** Weather status, e.g. cloudy, sunny, light rain. */
        private String weatherStatus;

        /** Highest temperature. */
        private String maxTemperature;

        /** Lowest temperature. */
        private String minTemperature;

        /** Wind power. */
        private String windPower;

        public static long getSerialversionuid() {
            return serialVersionUID;
        }

        // ---- district -------------------------------------------------------

        @Column(name="city_level",length=50)
        public String getCityLevel() {
            return this.cityLevel;
        }

        public void setCityLevel(String cityLevel) {
            this.cityLevel = cityLevel;
        }

        @Column(name="county_level",length=50)
        public String getCountyLevel() {
            return this.countyLevel;
        }

        public void setCountyLevel(String countyLevel) {
            this.countyLevel = countyLevel;
        }

        // ---- times ----------------------------------------------------------

        @Column(name="pub_time",length=50)
        public String getPubTime() {
            return this.pubTime;
        }

        public void setPubTime(String pubTime) {
            this.pubTime = pubTime;
        }

        @Column(name="forecast_time",length=50)
        public String getForecastTime() {
            return this.forecastTime;
        }

        public void setForecastTime(String forecastTime) {
            this.forecastTime = forecastTime;
        }

        // ---- observation ----------------------------------------------------

        @Column(name="place_name",length=50)
        public String getPlaceName() {
            return this.placeName;
        }

        public void setPlaceName(String placeName) {
            this.placeName = placeName;
        }

        @Column(name="weather_status",length=50)
        public String getWeatherStatus() {
            return this.weatherStatus;
        }

        public void setWeatherStatus(String weatherStatus) {
            this.weatherStatus = weatherStatus;
        }

        @Column(name="max_temperature",length=50)
        public String getMaxTemperature() {
            return this.maxTemperature;
        }

        public void setMaxTemperature(String maxTemperature) {
            this.maxTemperature = maxTemperature;
        }

        @Column(name="min_temperature",length=50)
        public String getMinTemperature() {
            return this.minTemperature;
        }

        public void setMinTemperature(String minTemperature) {
            this.minTemperature = minTemperature;
        }

        @Column(name="wind_power",length=50)
        public String getWindPower() {
            return this.windPower;
        }

        public void setWindPower(String windPower) {
            this.windPower = windPower;
        }

        // ---- Object contract ------------------------------------------------

        /**
         * Two records are equal when the other object is an {@code ActuallyWeather}
         * (or a subclass) and both ids are equal.
         */
        @Override
        public boolean equals(Object o) {
            if (!(o instanceof ActuallyWeather)) {
                return false;
            }
            ActuallyWeather other = (ActuallyWeather) o;
            EqualsBuilder idComparison = new EqualsBuilder().append(this.id, other.id);
            return idComparison.isEquals();
        }

        /** Hash code derived from the id, consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode() {
            HashCodeBuilder builder = new HashCodeBuilder(17, 37);
            builder.append(id);
            return builder.toHashCode();
        }

        /** Multi-line representation containing only the id. */
        @Override
        public String toString() {
            ToStringBuilder builder = new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE);
            builder.append("id", id);
            return builder.toString();
        }
    }
    View Code

    预报实体类和当天的字段完全一致,不过表名不一致。

    参考文档:http://www.cnblogs.com/shuilangyizu/p/6595588.html

  • 相关阅读:
    FILE 创建
    jfreechart折线图 demo
    Win7下Maven的安装与配置
    IntelliJ IDEA 14.x 与 Tomcat 集成,创建并运行Java Web项目
    Java中print、printf、println
    添加SSH密钥到GitHub
    GitHub学习资料
    Windows下Git的安装及配置
    【转】我害怕阅读的人
    安装MongoDB
  • 原文地址:https://www.cnblogs.com/shuilangyizu/p/6616763.html
Copyright © 2020-2023  润新知