• 摘要:HtmlAgilityPack 2


    通过HtmlAgilityPack实现网页信息抓取。

    2012-2-16 08:24| 发布者: benben| 查看: 4823| 评论: 0

    摘要: 1. 下载 Html Agility Pack,解压并保存到本地。下载地址:http://htmlagilitypack.codeplex.com/ 示例代码:void caijisoufun() { try { HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument(); ...

    1. 下载 Html Agility Pack,解压并保存到本地。下载地址:http://htmlagilitypack.codeplex.com/
     

     /// <summary>
     /// Fetches one agent listing page from esf.wuxi.soufun.com, follows every listing
     /// link found on it, and prints each listing's URL and title.
     /// </summary>
     /// <remarks>
     /// Pages are downloaded with <c>fhttp2</c>, which returns an empty string on failure,
     /// so every lookup below is null-checked instead of relying on the outer catch:
     /// a single malformed listing is reported and skipped rather than ending the crawl.
     /// Any other exception is printed via <c>println</c> and ends the method.
     /// </remarks>
     void caijisoufun()
     {
         try
         {
             HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
             String str = "http://esf.wuxi.soufun.com/agent/agent/AloneHouseList.aspx?agentid=160148311&housetype=esf&price=&roomtype=&district=&page=1";

             String htmlstr = fhttp2(str); // HTML source of the listing page ("" if the download failed)
             doc.LoadHtml(htmlstr);

             HtmlNode navNode = doc.GetElementbyId("right"); // node whose id is "right"
             //print(navNode.InnerHtml);
             if (navNode == null)
             {
                 println("caijisoufun: element 'right' not found in " + str);
                 return;
             }

             // NOTE(review): this XPath starts with "//", which is normally evaluated from the
             // document root rather than relative to navNode. If only links inside "right" are
             // wanted, ".//div[1]/..." is probably intended -- confirm against the live page.
             HtmlNodeCollection categoryNodeList = navNode.SelectNodes("//div[1]/table/tr[1]/td[1]/a[1]");
             if (categoryNodeList == null)
             {
                 // SelectNodes yields null (not an empty collection) when nothing matches.
                 println("caijisoufun: no listing links found in " + str);
                 return;
             }

             foreach (HtmlNode categoryNode in categoryNodeList)
             {
                 // Read href straight from the matched node; re-parsing its OuterHtml is unnecessary.
                 HtmlAttribute hrefAttr = categoryNode.Attributes["href"];
                 if (hrefAttr == null)
                 {
                     continue; // anchor without a link target: nothing to follow
                 }

                 String url = "http://esf.wuxi.soufun.com" + hrefAttr.Value;
                 println(url); // println is effectively a Response.Write

                 String showstr = fhttp2(url);
                 HtmlAgilityPack.HtmlDocument doc2 = new HtmlAgilityPack.HtmlDocument();
                 doc2.LoadHtml(showstr);

                 HtmlNode cnode = doc2.GetElementbyId("wrap");
                 // NOTE(review): same leading "//" caveat as above -- confirm the intended scope.
                 HtmlNode title = cnode == null
                     ? null
                     : cnode.SelectSingleNode("//div[2]/div[1]/h1[1]/font[1]");

                 if (title != null)
                 {
                     println(title.InnerText);
                 }
                 else
                 {
                     println("caijisoufun: title not found for " + url);
                 }

                 // Plenty more can be done here: floor plans and other listing details can be
                 // scraped the same way and imported into your own database.
                 flush();
                 sleep(10); // pause between detail-page requests
             }
         }
         catch (Exception ex)
         {
             println(ex);
         }
     }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the response body decoded as GB2312,
    /// with every line terminated by a single "\n".
    /// </summary>
    /// <param name="url">Absolute URL of the page to fetch.</param>
    /// <returns>
    /// The page text (passed through <c>tostr</c>), or an empty string if any exception
    /// occurs while creating the request, fetching, or reading the response.
    /// </returns>
    /// <remarks>
    /// Failures are deliberately swallowed: callers treat "" as "page unavailable".
    /// NOTE(review): on .NET Core / .NET 5+ the "GB2312" code page is only available after
    /// registering CodePagesEncodingProvider -- confirm the target runtime.
    /// </remarks>
    String fhttp2(String url)
    {
        try
        {
            WebRequest rGet = WebRequest.Create(url);

            // using blocks release the response, stream and reader even when reading throws.
            using (WebResponse rSet = rGet.GetResponse())
            using (Stream s = rSet.GetResponseStream())
            using (StreamReader sr = new StreamReader(s, Encoding.GetEncoding("GB2312")))
            {
                StringBuilder sb = new StringBuilder();
                String line;

                // Read line by line so all line endings are normalized to "\n".
                while ((line = sr.ReadLine()) != null)
                {
                    sb.Append(line).Append('\n');
                }

                return tostr(sb);
            }
        }
        catch (Exception)
        {
            // Best-effort fetch: any failure is reported to the caller as an empty page.
            return "";
        }
    }


    http://www.189works.com/article-40082-1.html
  • 相关阅读:
    eclipse code templates 设置(eclipse注释模版配置)
    kettle 程序调用执行ktr转换示例代码
    JQuery学习笔记
    获取工程路径(jar和普通文件结构通用) java
    java 获取jar包路径,遍历jar包
    百度跨域ajax
    eclipse Wtp在线安装
    PropertiesHelper
    java base64/jQuery Base64
    Eclipse中,打开文件所在文件夹的插件,及设置
  • 原文地址:https://www.cnblogs.com/iammackong/p/3036199.html
Copyright © 2020-2023  润新知