• Jsoup抓取页面的小技巧


    前面写了好多jsoup的例子

     现在写写小技巧吧

    (1) 得到document 的方法,

    (这个方法有时不一定能得到document,这时可以改用前面介绍的两个方法中的另一个,也可以把try块里的post()改成get())

        /**
         * Fetches {@code url} with an HTTP POST and parses the response into a document.
         *
         * <p>Each attempt sends a fixed User-Agent header and uses a 200-second timeout.
         * Only {@link UnknownHostException} and {@link SocketTimeoutException} trigger a
         * retry, and the number of attempts is capped so that a permanently unreachable
         * host cannot cause unbounded recursion and a {@link StackOverflowError}.
         *
         * @param url the address to fetch
         * @return the parsed document, or {@code null} when a non-retryable
         *         {@link IOException} occurred or every attempt failed
         */
        public static Document readUrlFist(String url) {
            // Upper bound on attempts for failures that are treated as transient.
            final int maxAttempts = 3;
            for (int attempt = 1; attempt <= maxAttempts; attempt++) {
                // A fresh connection is built for every attempt.
                Connection conn = Jsoup.connect(url);
                conn
                        .header(
                                "User-Agent",
                                "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2 Googlebot/2.1");
                try {
                    return conn.timeout(200 * 1000).post();
                } catch (IOException e) {
                    e.printStackTrace();
                    boolean retryable = (e instanceof UnknownHostException)
                            || (e instanceof SocketTimeoutException);
                    if (!retryable) {
                        // Any other I/O failure is reported to the caller as null.
                        return null;
                    }
                }
            }
            // All attempts failed with a retryable error.
            return null;
        }

     得到body 的方法

        /**
         * Loads {@code url} through {@code readUrlFist} and selects its {@code body}
         * elements.
         *
         * @param url the address to fetch
         * @return the elements matched by the {@code body} selector, or an empty
         *         {@code Elements} when the page could not be loaded
         */
        public static Elements readBody(String url) {
            Document doc = readUrlFist(url);
            if (doc == null) {
                // readUrlFist signals failure with null; return an empty result
                // instead of failing with a NullPointerException.
                return new Elements();
            }
            return doc.select("body");
        }

    得到某节点下孩子最多的节点

        /**
         * Returns the direct child of {@code body} for which {@code select("a")} matches
         * the most elements, i.e. the child that holds the most links.
         *
         * <p>The first child is scored with the same link count as every other child;
         * previously it was seeded with its number of children, which mixed two
         * different metrics. When several children share the highest count, the
         * earliest one wins.
         *
         * @param body the element whose direct children are compared
         * @return the child with the most links, or {@code body} itself when it has no
         *         child elements
         */
        public static Element readChildByMaxNum(Element body) {
            Elements candidates = body.children();
            if (candidates.isEmpty()) {
                // Nothing to compare; fall back to the element we were given.
                return body;
            }
            Element bestElement = candidates.get(0);
            int best = bestElement.select("a").size();
            for (int i = 1; i < candidates.size(); i++) {
                Element candidate = candidates.get(i);
                int linkCount = candidate.select("a").size();
                if (linkCount > best) {
                    best = linkCount;
                    bestElement = candidate;
                }
            }
            return bestElement;
        }

    得到tagName 最多的节点

    /**
     * Finds the tag name that occurs most often among {@code as} and returns the
     * elements that this tag name selects from {@code bestElement}.
     *
     * @param as          elements whose tag names are counted
     * @param bestElement element on which the winning tag name is used as a selector
     * @return the selected elements, or an empty list when {@code as} is empty
     */
    public static List<Element> takeAparentByTagName(List<Element> as,
            Element bestElement) {
        // Count how many times each tag name appears.
        Map<String, Integer> tagCounts = new HashMap<String, Integer>();
        for (Element element : as) {
            String tag = element.tagName();
            Integer seen = tagCounts.get(tag);
            tagCounts.put(tag, seen == null ? 1 : seen + 1);
        }
        if (tagCounts.isEmpty()) {
            // Without a winning tag the selector would be the empty string,
            // which jsoup rejects; there is nothing to select.
            return new ArrayList<Element>();
        }
        // Pick the tag with the highest count; on ties the first one iterated wins.
        String best = "";
        int max = 0;
        for (Map.Entry<String, Integer> entry : tagCounts.entrySet()) {
            if (max < entry.getValue()) {
                max = entry.getValue();
                best = entry.getKey();
            }
        }
        return bestElement.select(best);
    }

    得到className 的名字和它的数量

    /**
     * Builds a weight per class-attribute value over the given elements.
     *
     * <p>Each element adds its number of child elements (counted as 1 when it has
     * none) to the entry keyed by its class attribute. Elements with an empty class
     * attribute are grouped under the placeholder key {@code "iiiuuuzzz"}.
     *
     * @param bestElements the elements to tally
     * @return a map from class-attribute value to accumulated weight
     */
    public static Map<String, Integer> takeMapClass(List<Element> bestElements) {
        Map<String, Integer> weights = new HashMap<String, Integer>();
        for (Element current : bestElements) {
            String key = current.className();
            if (key == null || key.length() == 0) {
                key = "iiiuuuzzz";
            }
            // A childless element still counts once.
            int weight = Math.max(1, current.children().size());
            Integer soFar = weights.get(key);
            weights.put(key, soFar == null ? weight : soFar + weight);
        }
        return weights;
    }

    得到所有节点的父亲节点

        /**
         * Collects the parent of every element in {@code elements}.
         *
         * @param elements the elements whose parents are wanted
         * @return a list with one entry per input element, in the same order, holding
         *         the value returned by that element's {@code parent()}
         */
        public static List<Element> gerFaterPonit(Elements elements) {
            int count = elements.size();
            List<Element> parents = new ArrayList<Element>(count);
            for (int i = 0; i < count; i++) {
                parents.add(elements.get(i).parent());
            }
            return parents;
        }

    得到className数量最多的节点的名字

    /**
     * Returns the class-attribute key with the largest weight in the map produced by
     * {@code takeMapClass}.
     *
     * <p>On ties the key met first while iterating the map is kept.
     *
     * @param bestElements the elements to tally
     * @return the heaviest key, or the empty string when the map is empty
     */
    public static String takeIndexByClassName(List<Element> bestElements) {
        String winner = "";
        int highest = 0;
        for (Map.Entry<String, Integer> entry : takeMapClass(bestElements).entrySet()) {
            int weight = entry.getValue();
            if (weight > highest) {
                highest = weight;
                winner = entry.getKey();
            }
        }
        return winner;
    }

     得到className 数量居第二的名称

    /**
     * Returns the class-attribute key with the second-largest weight: the heaviest
     * key in the {@code takeMapClass} result once the key chosen by
     * {@code takeIndexByClassName} is excluded.
     *
     * <p>On ties the key met first while iterating the map is kept.
     *
     * @param bestElements the elements to tally
     * @return the runner-up key, or the empty string when no other key exists
     */
    public static String takeBetterIndexByClassName(List<Element> bestElements) {
        Map<String, Integer> weights = takeMapClass(bestElements);
        String top = takeIndexByClassName(bestElements);
        String runnerUp = "";
        int highest = 0;
        for (Map.Entry<String, Integer> entry : weights.entrySet()) {
            String key = entry.getKey();
            if (key.equals(top)) {
                // Skip the overall winner; only the remaining keys compete.
                continue;
            }
            int weight = entry.getValue();
            if (highest < weight) {
                highest = weight;
                runnerUp = key;
            }
        }
        return runnerUp;
    }
    /**
     * Filters {@code bestElements} down to those whose class attribute equals
     * {@code index}.
     *
     * <p>An element with an empty class attribute is compared under the placeholder
     * key {@code "iiiuuuzzz"}.
     *
     * @param bestElements the elements to filter
     * @param index        the class-attribute key to match
     * @return the matching elements, in their original order
     */
    public static List<Element> getElementByClassName(
            List<Element> bestElements, String index) {
        List<Element> matches = new ArrayList<Element>();
        for (Element candidate : bestElements) {
            String name = candidate.className();
            boolean unnamed = name == null || name.length() == 0;
            String key = unnamed ? "iiiuuuzzz" : name;
            if (!index.equals(key)) {
                continue;
            }
            matches.add(candidate);
        }
        return matches;
    }

    /**
     * Flattens the direct children of every element in {@code bestElements} into a
     * single list.
     *
     * @param bestElements the elements whose children are gathered
     * @return all child elements, grouped by parent in input order
     */
    public static List<Element> takeChildren(List<Element> bestElements) {
        List<Element> gathered = new ArrayList<Element>();
        for (Element owner : bestElements) {
            gathered.addAll(owner.children());
        }
        return gathered;
    }
    /**
     * Returns an ancestor of {@code element} whose text differs from the element's
     * own text.
     *
     * <p>When the element has sibling elements, the search starts at its parent and
     * climbs while the ancestor's text equals the element's text. The climb stops at
     * the top-most ancestor instead of stepping past it, which previously caused a
     * {@link NullPointerException} when every ancestor had the same text. When the
     * element has no sibling elements, its direct parent is returned unchanged.
     *
     * <p>NOTE(review): climbing only when siblings exist is kept from the original;
     * confirm this condition is intended.
     *
     * @param element the element to start from
     * @return the chosen ancestor; {@code null} when {@code element} has no parent
     */
    public static Element getParent(Element element) {
        Element parent = element.parent();
        if (parent == null || element.siblingElements().size() == 0) {
            return parent;
        }
        // The element's text does not change while climbing, so read it once.
        String ownText = element.text();
        while (parent.parent() != null && parent.text().equals(ownText)) {
            parent = parent.parent();
        }
        return parent;
    }
  • 相关阅读:
    站立会议(二)
    站立会议(一)
    买书优惠问题
    软件的NABCD----安装部分
    你的灯亮着吗读书笔记(一)
    软件工程概论---环状二维数组最大子数组和
    梦断代码读书笔记(三)
    梦断代码读书笔记(二)
    课程作业3.10
    软件工程作业提交3.06
  • 原文地址:https://www.cnblogs.com/tomcattd/p/2880986.html
Copyright © 2020-2023  润新知