This is an agile HTML parser that builds a read/write DOM and supports plain XPath or XSLT (you don't actually HAVE to understand XPath or XSLT to use it, don't worry...). It is a .NET code library that allows you to parse "out of the web" HTML files. The parser is very tolerant of "real world" malformed HTML. The object model is very similar to what System.Xml offers, but for HTML documents (or streams).
// Start by creating an HtmlDocument and loading the HTML to be parsed into it.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(TextBox1.Value);
HtmlDocument Subdoc = new HtmlDocument(); // scratch document, used when parsing nested fragments
// A cjol resume is used as the example here.
int i = 0; // counts the tables visited by the table loop that follows
if (doc.DocumentNode.InnerText.Contains("【技能专长】"))
{
    hasSpe = true;
}

// SelectNodes returns null (not an empty collection) when nothing matches,
// so query once and treat "no match" as zero tables instead of dereferencing null.
HtmlNodeCollection resumeTables = doc.DocumentNode.SelectNodes("//table");
int tableCount = resumeTables == null ? 0 : resumeTables.Count;

// Students and experienced job seekers are told apart by the number of tables in the resume:
//   with the "【技能专长】" (skills) section:    22 tables = experienced, 19 tables = student
//   without the "【技能专长】" section:          22-3 tables = experienced, 19-3 tables = student
int studentTableCount = hasSpe ? 19 : 16;
if (tableCount == studentTableCount)
{
    isstu = true; // student resume
}
// Walk every table of the cjol resume; each table of interest is identified by its position.
// NOTE(review): SelectNodes returns null when no <table> matches, which makes this foreach throw.
// The loop's closing brace lies beyond this excerpt, so the header is deliberately left unchanged.
foreach (HtmlNode link in doc.DocumentNode.SelectNodes("//table"))
{
    i++;
    if (i == 2) // <!-- personal photo --> the personal photo lives in the second table
    {
        /*
         * Sample text of this table:
         * Name (No.: J353453453)'s resume   Resume last updated: 2006-11-20 13:09:32   Position applied for: .NET developer
         */
        tdnum = 0;
        Subdoc.LoadHtml(link.InnerHtml);

        // The table is expected to hold a single <img>; SelectNodes yields null when there is none,
        // so guard before iterating instead of letting the foreach throw.
        HtmlNodeCollection photoNodes = Subdoc.DocumentNode.SelectNodes("//img");
        if (photoNodes != null)
        {
            foreach (HtmlNode linktd in photoNodes)
            {
                // An <img> without a src attribute is skipped explicitly; this replaces the
                // former empty catch block, which hid that case along with every other error.
                HtmlAttribute srcAttribute = linktd.Attributes["src"];
                if (srcAttribute == null || srcAttribute.Value == null)
                {
                    continue;
                }

                string photoSrc = srcAttribute.Value.Trim();
                // The site's placeholder image means no photo was uploaded; keep photourl untouched then.
                if (photoSrc != "http://www.cjol.com/Jobseeker/cn/myjobsdb/jobseeking/images/photo_JobSeekers_cn.gif")
                {
                    photourl = photoSrc;
                }
            }
        }
    }
就这么多了,大家看了也会举一反三,有什么指教的,请给我发邮件。leizhipan@gmail.com