名字有点大,其实就是我女朋友现在所在的这家公司的工作,要求她必须拥有一个很大的南京公司名单,而现状是,她知道的公司屈指可数,更别说是她同事们不知道而她知道的公司了。但是她被要求每天至少找到一个他们公司数据里面没有的公司,所以她只能借助搜索引擎。
上面是做这个东西的原因,起初我是想用网络爬虫来写的,后来做了个开头才发现,网络爬虫是个大项目,而且涉及到大数据的操作和数据是否最新等问题,觉得没必要。加上时间紧急,她生怕完成不了任务被辞退,哎,就快速地用业余时间写了一个小程序。
程序的思路很简单,就是通过搜索引擎(以百度为例)构造get型的url,然后在C#里面生成一个request请求,去取得Response,然后从Response里面取得dom,再用正则表达式从dom里面取得自己想要的数据:1、与公司名称匹配度高的字符串;2、跳转到搜索结果下一页的地址。将取得的所有公司名称放进list中,然后遍历去重,再与自己预定义的列表匹配,将不合法的公司名去掉。最终显示出来,加上另存为txt、双击复制等功能。当然,为了可扩展性,我将取公司名称的正则表达式开放给用户,让用户(我女朋友?肯定不是啊!是我自己)可以尽可能地自定义。
好了,二话不说,上代码:
1: using System;
2: using System.Collections.Generic;
3: using System.ComponentModel;
4: using System.Data;
5: using System.Drawing;
6: using System.IO;
7: using System.Linq;
8: using System.Net;
9: using System.Text;
10: using System.Text.RegularExpressions;
11: using System.Windows.Forms;
12: using System.Threading;
13: using System.Collections;
14:
15:
16: namespace SearchCompany
17: {
public partial class Form1 : Form
{
    // Regex used to extract company names from the fetched pages (pattern is user-editable via textBox1).
    Regex rx;
    // Background worker thread that downloads pages and extracts company names.
    Thread getCompany;
    // Delegate used to marshal UI updates from the worker thread onto the UI thread.
    public delegate void MyInvoke(string str, int type);
    // Accumulated raw company-name matches; de-duplicated in the type==4 UI step.
    List<String> lstCom = new List<String>();
    // Generic/placeholder names that are filtered out of the results.
    public List<String> lstFackName = new List<string>() { "公司", "有限公司", "广告公司", "保险公司", "猎头公司", "新公司", "旧公司" };
    public string googleNextPageRx;
    public string baiduNextPageRx;
    // Number of result pages fetched during the current search.
    public long pagenum;

    public Form1()
    {
        InitializeComponent();
        // FIX: the original literal contained unescaped double quotes and could not
        // compile; a verbatim string with doubled quotes produces the same regex text.
        // The pattern captures the "/s?..." href of Baidu's "下一页" (next page) link.
        baiduNextPageRx = @"(?<=</a><a href="")/s[^""]*(?=""\sclass=""n"">下一页></a><span class=""nums"" style=""margin-left:120px"">)";
        googleNextPageRx = "";
        // FIX: the original default pattern used a literal 'w' instead of the \w
        // word-character class, so it could never match real company names.
        textBox1.Text = @"((\w)|((\w+?)))+(?<=公司)";
    }

    /// <summary>
    /// Starts a new search: resets state from any previous run and spawns the
    /// background thread that fetches pages and extracts company names.
    /// </summary>
    private void Search_Click(object sender, EventArgs e)
    {
        // Incremented once per page actually fetched (see MakeCompany).
        pagenum = 0;
        // The pattern is taken verbatim from the textbox; any valid .NET regex works.
        rx = new Regex(textBox1.Text.Trim());
        // FIX: results of a previous search were never cleared, so repeated
        // searches accumulated stale names.
        lstCom.Clear();
        lblShow.Text = "";
        lblBug.Text = "";
        lstCompany.Items.Clear();
        getCompany = new Thread(new ThreadStart(MakeCompany));
        getCompany.Name = "getPageAndComputeData";
        getCompany.IsBackground = true; // don't keep the process alive on exit
        getCompany.Start();
    }

    /// <summary>
    /// Marshals UI updates from the worker thread onto the UI thread.
    /// </summary>
    /// <param name="lblstr">Text to display on the form (ignored for some types).</param>
    /// <param name="type">
    /// 1: search started — disable buttons and show status;
    /// 2: progress message; 3: finished — re-enable buttons;
    /// 4: de-duplicate lstCom and fill the result list; 5: append an error message.
    /// </param>
    public void setFromThread(string lblstr, int type)
    {
        if (lblShow.InvokeRequired)
        {
            // Re-enter this method on the UI thread.
            this.Invoke(new MyInvoke(setFromThread), new object[] { lblstr, type });
            return;
        }
        switch (type)
        {
            case 1:
                this.Search.Enabled = false;
                this.AddCompany.Enabled = false;
                this.lblShow.Text = lblstr;
                break;
            case 2:
                this.lblShow.Text = lblstr;
                break;
            case 3:
                this.Search.Enabled = true;
                this.AddCompany.Enabled = true;
                break;
            case 4:
                // De-duplicate the collected names and push them into the result list.
                // (FIX: removed the unused 'newlst' local the original declared here.)
                lblShow.Text = "正在处理数据,请稍后";
                foreach (String str in lstCom.Distinct<String>())
                {
                    lstCompany.Items.Add(str);
                }
                lblShow.Text = String.Format("数据已经处理完成 共处理{0}个页面 找到名称不重复的公司:{1}家", pagenum.ToString(), lstCompany.Items.Count.ToString());
                this.Search.Enabled = true;
                this.AddCompany.Enabled = true;
                break;
            case 5:
                lblBug.Text += lblstr;
                break;
        }
    }

    /// <summary>
    /// Worker-thread entry point: downloads every seed URL in lstSite, follows
    /// Baidu's "next page" links until they stop changing, extracts candidate
    /// company names with the user-supplied regex, filters out generic names and
    /// hands the result to the UI via setFromThread.
    /// </summary>
    public void MakeCompany()
    {
        try
        {
            setFromThread("开始获取页面", 1);
            StringBuilder sbPageString = new StringBuilder();
            Regex rxNextUrl = new Regex(baiduNextPageRx);
            foreach (string url in lstSite.Items)
            {
                string nextUrl = url;
                string previousUrl = null;
                // Stop when there is no next-page link or it repeats (last page).
                while (nextUrl != null && !nextUrl.Equals(previousUrl))
                {
                    string pages = DownloadPage(nextUrl);
                    // FIX: the original only counted pages of the first seed URL;
                    // count every page actually fetched.
                    pagenum++;
                    // Baidu wraps matched keywords in <em> tags; strip them so the
                    // company-name regex sees contiguous text.
                    sbPageString.Append(pages.Replace("<em>", "").Replace("</em>", ""));
                    setFromThread(String.Format("已获取{0}个页面,正在尝试获取下一个页面", pagenum.ToString()), 2);
                    previousUrl = nextUrl;
                    Match mcNextUrl = rxNextUrl.Match(pages);
                    nextUrl = mcNextUrl.Success ? "http://www.baidu.com" + mcNextUrl.Value : null;
                }
            }

            setFromThread(String.Format("共获取{0}个页面的数据,正在对数据进行处理", pagenum.ToString()), 2);
            // Lower-case the whole corpus so the user's pattern need not handle case.
            string strPage = sbPageString.ToString().ToLower();
            foreach (Match tmc in rx.Matches(strPage))
            {
                string str = tmc.Value.Trim();
                // Drop generic placeholder names ("公司", "有限公司", ...).
                if (!lstFackName.Contains<String>(str))
                    lstCom.Add(str);
            }
            setFromThread("", 4);
        }
        catch (Exception ex)
        {
            setFromThread(ex.Message, 5);
        }
    }

    /// <summary>
    /// Downloads a single page as UTF-8 text.
    /// FIX: the original never disposed the response/stream/reader and leaked one
    /// connection per page; using-blocks release them deterministically.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <returns>The raw page markup.</returns>
    private string DownloadPage(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.MaximumAutomaticRedirections = 500;
        request.CookieContainer = new CookieContainer();
        request.Timeout = 3000;
        request.Headers.Set("Pragma", "no-cache"); // always fetch a fresh copy
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream sm = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(sm, Encoding.GetEncoding("utf-8")))
        {
            return sr.ReadToEnd();
        }
    }

    /// <summary>Adds the URL typed in newURL to the seed-site list.</summary>
    private void AddCompany_Click(object sender, EventArgs e)
    {
        String strURL = newURL.Text.Trim();
        if (strURL != String.Empty)
        {
            lstSite.Items.Add(strURL);
            newURL.Text = "";
            AddCompany.Enabled = false; // the box is now empty again
        }
    }

    /// <summary>Enables the add button only when the URL box is non-blank.</summary>
    private void newURL_TextChanged(object sender, EventArgs e)
    {
        AddCompany.Enabled = newURL.Text.Trim() != String.Empty;
    }

    /// <summary>
    /// Removes the selected seed URLs.
    /// FIX: the original removed items while walking SelectedItems forward by
    /// index, which skips every other selected item; draining index 0 removes
    /// every selected entry.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        while (lstSite.SelectedItems.Count > 0)
        {
            lstSite.Items.Remove(lstSite.SelectedItems[0]);
        }
    }

    /// <summary>Saves the result list to a text file chosen by the user.</summary>
    private void button2_Click(object sender, EventArgs e)
    {
        if (saveFileDialog1.ShowDialog() != DialogResult.OK) return;
        try
        {
            // FIX: using-blocks guarantee the file handle is released even when a
            // write throws; the original leaked it on exceptions.
            using (System.IO.FileStream fs = (System.IO.FileStream)saveFileDialog1.OpenFile())
            using (StreamWriter sw = new StreamWriter(fs))
            {
                for (int i = 0; i < lstCompany.Items.Count; i++)
                {
                    sw.WriteLine(lstCompany.Items[i].ToString());
                }
            }
            MessageBox.Show("文件保存成功");
        }
        catch (Exception ex)
        {
            // FIX: the original passed ex.Message as the message-box *caption*,
            // so the user saw the literal text "异常: {0}"; format it instead.
            MessageBox.Show(String.Format("异常: {0}", ex.Message));
        }
    }

    /// <summary>
    /// Copies the double-clicked company name to the clipboard and shows a
    /// transient confirmation for 3 seconds.
    /// </summary>
    private void lstCompany_DoubleClick(object sender, EventArgs e)
    {
        // FIX: double-clicking an empty area used to throw — guard first.
        if (lstCompany.SelectedItems.Count == 0) return;
        Clipboard.SetText(lstCompany.SelectedItems[0].ToString());
        lblCopy.Text = "复制成功...";
        // FIX: unsubscribe first so rapid double-clicks don't stack handlers.
        timer1.Tick -= lblCopyClear;
        timer1.Tick += lblCopyClear;
        timer1.Interval = 3000;
        timer1.Start();
    }

    /// <summary>Timer callback: clears the copy hint and stops the timer.</summary>
    public void lblCopyClear(object sender, EventArgs e)
    {
        lblCopy.Text = "";
        timer1.Tick -= lblCopyClear;
        // FIX: the original left the timer running forever (the stray
        // timer1.ToString() call was a no-op); stop it once the hint is cleared.
        timer1.Stop();
    }

}
241: }