程序就是个好东西,人很难完成的任务,它只需很短时间就搞定。
下面我们来采集一个房产网站上的所有普陀区的小区列表。
该地址为:http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA
{0}为页码,共35页,C#实现代码如下:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

namespace Hourse
{
    /// <summary>
    /// Console scraper: downloads every page of a paginated residential-community
    /// listing, extracts each community's name and address with a regular
    /// expression, and appends one "name address" line per community to
    /// data.txt in the application's base directory.
    /// </summary>
    class Program
    {
        // Matches one listing entry; group 1 is the community name, group 2 the
        // raw address text. The markup it targets looks like this:
        //
        //   <div class="fsize14 margin-bottom8">
        //     <strong>
        //       <a href="/xiaoqu-4796-..." target="_blank"> 曹杨二村</a>
        //     </strong>
        //   </div>
        //   <div class="margin-bottom5"> 普陀区 曹杨路1107弄,</div>
        //
        // Built once instead of on every page.
        private static readonly Regex ListingPattern = new Regex(
            "<div class=\"fsize14 margin-bottom8\">\\s+<strong>\\s+<a\\s+[^>]+>\\s+(.+?)</a>\\s+</strong>" +
            "\\s+</div>\\s+<div class=\"margin-bottom5\">([^<]+)</div>");

        // Characters stripped from the raw address: whitespace, commas and parentheses.
        private static readonly Regex AddressNoise = new Regex("[\\s,( )]+");

        // URL template; the literal "{0}" is replaced by the page number.
        private static string uri;

        // Full path of the output file.
        private static string file;

        /// <summary>
        /// Entry point: prepares the output file, scrapes every page in order
        /// and reports the total number of records collected.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            uri = "http://sh.fangjia.com/xiaoqu/--e-{0}|r-%E6%99%AE%E9%99%80%E5%8C%BA";
            file = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data.txt");

            if (!File.Exists(file))
            {
                // File.Create returns an OPEN FileStream. It must be closed right
                // away, otherwise the handle it holds makes the first append in
                // Save() fail with an IOException (file in use).
                using (File.Create(file))
                {
                }
            }

            Console.WriteLine("--------------------------");
            Console.WriteLine("开始采集数据,请等待...");
            Console.WriteLine("--------------------------");

            int pages = 35;
            int counts = 0;
            for (int i = 1; i <= pages; i++)
            {
                counts += OperateInfo(i);
            }

            Console.WriteLine("采集完成!共" + counts + "条,文件存放在" + file);
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads one listing page, saves every community found on it and
        /// prints a per-page summary.
        /// </summary>
        /// <param name="page">1-based page number substituted into the URL template.</param>
        /// <returns>The number of communities matched on the page.</returns>
        static int OperateInfo(int page)
        {
            string pageUri = uri.Replace("{0}", page.ToString());

            string html;
            // WebClient is IDisposable; release it as soon as the page is downloaded.
            using (WebClient client = new WebClient())
            {
                byte[] datas = client.DownloadData(pageUri);
                html = Encoding.UTF8.GetString(datas);
            }

            MatchCollection mc = ListingPattern.Matches(html);
            foreach (Match m in mc)
            {
                // Read the captures directly instead of re-running the regex
                // over the matched text once per group.
                string name = m.Groups[1].Value;
                string address = AddressNoise.Replace(m.Groups[2].Value, "");
                Save(name + " " + address);
            }

            Console.WriteLine("第" + page + "页采集到" + mc.Count + "条!");
            return mc.Count;
        }

        /// <summary>
        /// Appends a single line to the output file using UTF-8.
        /// </summary>
        /// <param name="str">The line to append (without a trailing newline).</param>
        static void Save(string str)
        {
            // append: true keeps earlier records; disposing the writer flushes it.
            using (StreamWriter sw = new StreamWriter(file, true, Encoding.UTF8))
            {
                sw.WriteLine(str);
            }
        }
    }
}
运行程序: