• 公司搜索--关于搜索引擎结果的析出


    名字有点大,其实就是我女朋友现在所在的这家公司的工作,要求她必须拥有一个很大的南京公司名单,而且现状就是,她知道的公司屈指可数,更别说是她同事们不知道而她知道的公司了。但是她被要求每天至少找到一个他们公司数据里面没有的公司,所以她只能借助搜索引擎。

    上面是做这个东西的原因,起初我是想用网络爬虫来写的,后来操作了个开头,发现,网络爬虫是个大项目,而且涉及到大数据的操作和数据是否最新等特点,觉得没必要。加上时间紧急,她生怕完成不了任务被辞,哎,就快速的用业余时间写了一个小程序。

     

    程序的思路很简单,就是通过搜索引擎(以百度为例)构造get型的url,然后在C#里面生成一个request请求,去取得Response,然后从Response里面取得dom,再用正则表达式从dom里面取得自己想要的数据:1、与公司名称匹配度高的字符串;2、跳转到搜索结果下一页的地址。将取得的所有公司名称放进list中,然后遍历去重,再与自己预定义的列表匹配,将不合法的公司名去掉。最后显示出来,加上另存为txt、双击复制等功能。当然,为了可扩展性,我将取公司名称的正则表达式开放给用户,让用户(我女朋友?肯定不是啊!是我自己)可以尽可能的自定义。

    好了,二话不说,上代码:

     
    截图1
     
     
       1:  using System;
       2:  using System.Collections.Generic;
       3:  using System.ComponentModel;
       4:  using System.Data;
       5:  using System.Drawing;
       6:  using System.IO;
       7:  using System.Linq;
       8:  using System.Net;
       9:  using System.Text;
      10:  using System.Text.RegularExpressions;
      11:  using System.Windows.Forms;
      12:  using System.Threading;
      13:  using System.Collections;
      14:   
      15:   
      namespace SearchCompany
      {
          /// <summary>
          /// Main window of the company-name harvester. For every start URL in
          /// <c>lstSite</c> it downloads the page, follows the "next page" links found
          /// with <see cref="baiduNextPageRx"/>, runs the user-supplied regular
          /// expression from <c>textBox1</c> over all downloaded text and lists the
          /// distinct matches in <c>lstCompany</c>.
          /// </summary>
          public partial class Form1 : Form
          {
              // Prefix added to the relative "next page" links extracted from a result page.
              private const string BaiduBaseUrl = "http://www.baidu.com";

              // Per-request timeout, in milliseconds.
              private const int RequestTimeoutMs = 3000;

              // How long the "copied" notice stays visible, in milliseconds.
              private const int CopyNoticeMs = 3000;

              // Company-name pattern compiled from textBox1 when a search starts.
              Regex rx;

              // Background worker that downloads and parses the pages.
              Thread getCompany;

              /// <summary>Signature used to marshal <see cref="setFromThread"/> onto the UI thread.</summary>
              public delegate void MyInvoke(string str, int type);

              // Raw (possibly duplicated) matches collected by the current search.
              List<String> lstCom = new List<String>();

              /// <summary>Matches that are too generic to be a real company name and are dropped.</summary>
              public List<String> lstFackName = new List<string>() { "公司", "有限公司", "广告公司", "保险公司", "猎头公司", "新公司", "旧公司" };

              /// <summary>"Next page" pattern for Google result pages (currently empty and unused here).</summary>
              public string googleNextPageRx;

              /// <summary>Pattern that captures the relative URL of the "next page" link in a Baidu result page.</summary>
              public string baiduNextPageRx;

              /// <summary>Number of pages downloaded by the current search.</summary>
              public long pagenum;

              /// <summary>
              /// Builds the form and installs the default patterns.
              /// </summary>
              public Form1()
              {
                  InitializeComponent();
                  // Verbatim string: every "" below is one literal double quote in the pattern.
                  baiduNextPageRx = @"(?<=</a><a href="")/s[^""]*(?=""\sclass=""n"">下一页&gt;</a><span class=""nums"" style=""margin-left:120px"">)";
                  googleNextPageRx = "";
                  // NOTE(review): the published listing shows this pattern with its
                  // backslashes stripped ("((w)|((w+?)))+..."), which can never match text
                  // ending in 公司. Reconstructed as "word characters or a parenthesised
                  // group, ending in 公司" - confirm against the original project.
                  textBox1.Text = @"((\w)|(\(\w+?\)))+(?<=公司)";
              }

              /// <summary>
              /// Starts a new search on a background thread.
              /// </summary>
              private void Search_Click(object sender, EventArgs e)
              {
                  lblShow.Text = "";
                  lblBug.Text = "";

                  // The pattern is user input: report a malformed one instead of crashing.
                  try
                  {
                      rx = new Regex(textBox1.Text.Trim());
                  }
                  catch (ArgumentException ex)
                  {
                      lblBug.Text = ex.Message;
                      return;
                  }

                  // Reset everything left over from the previous search.
                  pagenum = 0;
                  lstCom.Clear();
                  lstCompany.Items.Clear();

                  getCompany = new Thread(new ThreadStart(MakeCompany));
                  getCompany.Name = "getPageAndComputeData";
                  getCompany.IsBackground = true;
                  getCompany.Start();
              }

              /// <summary>
              /// Updates the UI on behalf of the worker thread; re-invokes itself on the
              /// UI thread when called from another thread.
              /// </summary>
              /// <param name="lblstr">Text to show (used by types 1, 2 and 5).</param>
              /// <param name="type">
              /// 1 = work started: disable the buttons and show <paramref name="lblstr"/>;
              /// 2 = progress text; 3 = finished: re-enable the buttons;
              /// 4 = fill <c>lstCompany</c> with the distinct results and show a summary;
              /// 5 = append an error message to <c>lblBug</c>.
              /// </param>
              public void setFromThread(string lblstr, int type)
              {
                  if (lblShow.InvokeRequired)
                  {
                      MyInvoke _myInvoke = new MyInvoke(setFromThread);
                      this.Invoke(_myInvoke, new object[] { lblstr, type });
                      return;
                  }

                  switch (type)
                  {
                      case 1:
                          this.Search.Enabled = false;
                          this.AddCompany.Enabled = false;
                          this.lblShow.Text = lblstr;
                          break;

                      case 2:
                          this.lblShow.Text = lblstr;
                          break;

                      case 3:
                          this.Search.Enabled = true;
                          this.AddCompany.Enabled = true;
                          break;

                      case 4:
                          // lstCom holds the raw matches; show each distinct one once.
                          lblShow.Text = "正在处理数据,请稍后";
                          foreach (String str in lstCom.Distinct<String>())
                          {
                              lstCompany.Items.Add(str);
                          }
                          lblShow.Text = String.Format("数据已经处理完成\n共处理{0}个页面\n找到名称不重复的公司:{1}家", pagenum, lstCompany.Items.Count);
                          this.Search.Enabled = true;
                          this.AddCompany.Enabled = true;
                          break;

                      case 5:
                          lblBug.Text += lblstr;
                          break;
                  }
              }

              /// <summary>
              /// Worker-thread entry point: downloads every start URL plus all pages
              /// reachable through "next page" links, extracts the company names and
              /// hands the result to the UI. Any exception is shown in <c>lblBug</c>.
              /// </summary>
              public void MakeCompany()
              {
                  try
                  {
                      setFromThread("开始获取页面", 1);

                      // Controls must only be touched on the UI thread, so copy the URL
                      // list there before the download loop starts.
                      string[] urls = (string[])this.Invoke(new Func<string[]>(SnapshotSiteUrls));

                      Regex rxNextUrl = new Regex(baiduNextPageRx);
                      StringBuilder sbPageString = new StringBuilder();

                      foreach (string url in urls)
                      {
                          // One cookie jar per start URL, shared by its follow-up pages.
                          CookieContainer cookies = new CookieContainer();
                          string currentUrl = url;

                          while (true)
                          {
                              string page = DownloadPage(currentUrl, cookies);
                              pagenum++;

                              // Drop the <em> highlighting so it cannot split a company name.
                              sbPageString.Append(page.Replace("<em>", "").Replace("</em>", ""));
                              setFromThread(String.Format("已获取{0}个页面,正在尝试获取下一个页面", pagenum), 2);

                              Match mcNextUrl = rxNextUrl.Match(page);
                              if (!mcNextUrl.Success)
                              {
                                  break;
                              }

                              string nextUrl = BaiduBaseUrl + mcNextUrl.Value;
                              // A page that links to itself would loop forever.
                              if (currentUrl.Equals(nextUrl))
                              {
                                  break;
                              }
                              currentUrl = nextUrl;
                          }
                      }

                      setFromThread(String.Format("共获取{0}个页面的数据,正在对数据进行处理", pagenum), 2);

                      string strPage = sbPageString.ToString().ToLower();
                      foreach (Match tmc in rx.Matches(strPage))
                      {
                          string name = tmc.Value.Trim();
                          if (!lstFackName.Contains(name))
                          {
                              lstCom.Add(name);
                          }
                      }

                      setFromThread("", 4);
                  }
                  catch (Exception ex)
                  {
                      setFromThread(ex.Message, 5);
                      // Re-enable the buttons; otherwise a failed search locks the form.
                      setFromThread("", 3);
                  }
              }

              // Returns a copy of the start URLs in lstSite; must run on the UI thread.
              private string[] SnapshotSiteUrls()
              {
                  return lstSite.Items.Cast<string>().ToArray();
              }

              // Downloads one page as UTF-8 text and releases the connection afterwards.
              private static string DownloadPage(string url, CookieContainer cookies)
              {
                  HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                  request.MaximumAutomaticRedirections = 500;
                  request.CookieContainer = cookies;
                  request.Timeout = RequestTimeoutMs;
                  request.Headers.Set("Pragma", "no-cache");

                  using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                  using (Stream sm = response.GetResponseStream())
                  using (StreamReader sr = new StreamReader(sm, Encoding.GetEncoding("utf-8")))
                  {
                      return sr.ReadToEnd();
                  }
              }

              /// <summary>
              /// Adds the URL typed into <c>newURL</c> to the list of start URLs.
              /// </summary>
              private void AddCompany_Click(object sender, EventArgs e)
              {
                  String strURL = newURL.Text.Trim();
                  if (strURL != String.Empty)
                  {
                      lstSite.Items.Add(strURL);
                      newURL.Text = "";
                      AddCompany.Enabled = false;
                  }
              }

              /// <summary>
              /// Enables the add button only while <c>newURL</c> contains non-blank text.
              /// </summary>
              private void newURL_TextChanged(object sender, EventArgs e)
              {
                  AddCompany.Enabled = newURL.Text.Trim() != String.Empty;
              }

              /// <summary>
              /// Removes every selected start URL from <c>lstSite</c>.
              /// </summary>
              private void button1_Click(object sender, EventArgs e)
              {
                  // Walk the selection backwards and remove by index: the selection
                  // collections shrink as items are removed, so a forward loop would skip
                  // every other selected entry.
                  for (int i = lstSite.SelectedIndices.Count - 1; i >= 0; i--)
                  {
                      lstSite.Items.RemoveAt(lstSite.SelectedIndices[i]);
                  }
              }

              /// <summary>
              /// Saves the company list, one name per line, to a file chosen by the user.
              /// </summary>
              private void button2_Click(object sender, EventArgs e)
              {
                  if (saveFileDialog1.ShowDialog() != DialogResult.OK)
                  {
                      return;
                  }

                  try
                  {
                      // using-blocks flush and close both objects even if writing fails.
                      using (Stream fs = saveFileDialog1.OpenFile())
                      using (StreamWriter sw = new StreamWriter(fs))
                      {
                          for (int i = 0; i < lstCompany.Items.Count; i++)
                          {
                              sw.WriteLine(lstCompany.Items[i].ToString());
                          }
                      }
                      MessageBox.Show("文件保存成功");
                  }
                  catch (Exception ex)
                  {
                      // Format explicitly: MessageBox.Show(string, string) would treat the
                      // second argument as the window caption, not as a format argument.
                      MessageBox.Show(String.Format("异常:\n{0}", ex.Message));
                  }
              }

              /// <summary>
              /// Copies the double-clicked company name to the clipboard and shows a
              /// notice that <see cref="lblCopyClear"/> removes after a few seconds.
              /// </summary>
              private void lstCompany_DoubleClick(object sender, EventArgs e)
              {
                  // A double-click on the empty area of the list selects nothing.
                  if (lstCompany.SelectedItems.Count == 0)
                  {
                      return;
                  }

                  string name = lstCompany.SelectedItems[0].ToString();
                  // Clipboard.SetText rejects null and empty strings.
                  if (String.IsNullOrEmpty(name))
                  {
                      return;
                  }

                  Clipboard.SetText(name);
                  lblCopy.Text = "复制成功...";

                  // Restart the countdown and keep exactly one subscription, even when
                  // the user double-clicks again before the notice has been cleared.
                  timer1.Stop();
                  timer1.Tick -= lblCopyClear;
                  timer1.Tick += lblCopyClear;
                  timer1.Interval = CopyNoticeMs;
                  timer1.Start();
              }

              /// <summary>
              /// Timer callback: clears the "copied" notice and stops the timer.
              /// </summary>
              public void lblCopyClear(object sender, EventArgs e)
              {
                  lblCopy.Text = "";
                  timer1.Tick -= lblCopyClear;
                  timer1.Stop();
              }

          }
      }
  • 相关阅读:
    css去掉iPhone、iPad默认按钮样式
    STL~Deque简介
    Centos 7 ssh登录速度慢
    C++ delete 两次
    编译gdb 报错 No module named gdb.frames
    gdb 脚本
    转载: CentOS/Linux 解决 SSH 连接慢
    百度经验:Win10查看已存储WiFi密码的两种方法
    git 操作
    Avoiding memory leaks in POSIX thread programming, 多线程避免内存泄漏
  • 原文地址:https://www.cnblogs.com/ensleep/p/3337336.html
Copyright © 2020-2023  润新知