• First Taste of Engineering Code


    Influenced by my senior Tiantian, I got into Python. After slacking for more than half a day plus an evening, I finally got a crawler working. My NOIP prospects feel doomed qaq

    I learned it from a MOOC course. I originally wanted to crawl the Baidu Baike entries about OI, but I was naive and my target rule was too simple, so the crawl drifted off topic right from the start...

    The output page turned out ugly too; I'll build a nicer one next time OvO

    I picked up a lot of experience. The most useful tip: in the main method, write the except clause for failed crawls as

    except Exception as f:
        print 'craw failed: ', f

    so you can tell roughly where a crawl went wrong.

    Some things in Python can be mistyped without raising any error until that line actually runs.

    I still keep making the classic newbie mistake: forgetting self or mistyping it.
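    For example (a minimal sketch I'm adding, not from the MOOC template; Python 2 like the rest of the post): a mistyped attribute gets past the parser just fine and only raises at runtime, where a broad except reduces it to a printed message.

class Demo(object):
    def __init__(self):
        self.urls = set()

    def craw(self):
        try:
            self.ulrs.add('x')        # typo for self.urls -- no complaint until this line runs
        except Exception as f:
            print 'craw failed: ', f  # prints the AttributeError instead of crashing

Demo().craw()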

    Here is the crawler code (almost entirely copied from the MOOC template 一,一):

Main

import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    def __init__(self):
        # wire together the URL manager, downloader, parser and outputer
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
            except Exception as f:  # printing the exception shows roughly why a crawl failed
                print 'craw failed: ', f

            if count == 100:  # stop after 100 pages
                break

            count = count + 1

        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/item/oi/74020"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

Outputer

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')

        fout.write("<html>")
        fout.write("<meta charset='utf-8'>")  # declare the encoding before any content
        fout.write("<body>")
        fout.write("<table>")

        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))  # old error: forgot .encode
            fout.write("</tr>")

        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")

        fout.close()

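Why the .encode('utf-8') matters (a minimal sketch I'm adding, assuming Python 2 as in the rest of the post; demo.html and the sample string are made up): writing a unicode string containing non-ASCII characters to a plain file object makes Python try the implicit ascii codec, which fails.

title = u'\u7f16\u7a0b'                  # a unicode string with non-ASCII text
fout = open('demo.html', 'w')
try:
    fout.write("<td>%s</td>" % title)    # '%' promotes the result to unicode,
                                         # so write() hits the ascii codec and fails
except UnicodeEncodeError as e:
    print 'write failed:', e
fout.write("<td>%s</td>" % title.encode('utf-8'))  # encode first: plain bytes, works
fout.close()
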
Manager

class UrlManager(object):

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

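A quick sanity check of the two-set deduplication (hypothetical usage I'm adding; the entry id is made up):

manager = UrlManager()
manager.add_new_url('http://baike.baidu.com/view/21087.htm')
manager.add_new_url('http://baike.baidu.com/view/21087.htm')  # duplicate, ignored
url = manager.get_new_url()   # pops it and records it in old_urls
manager.add_new_url(url)      # already crawled, ignored
print manager.has_new_url()   # False
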
Parser

from bs4 import BeautifulSoup
import re
import urlparse

class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):  # old error: forgot the self parameter
        new_urls = set()

        # entry links look like /view/123.htm (old error: typed re.complie)
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)  # make relative hrefs absolute
            new_urls.add(new_full_url)

        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)  # old error: assigned this to new_urls too
        return new_urls, new_data

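For reference (my own minimal example, not from the post; the entry id is made up): urlparse.urljoin is what turns the relative hrefs found on a page into absolute URLs the downloader can fetch.

import urlparse
page_url = 'http://baike.baidu.com/item/oi/74020'
print urlparse.urljoin(page_url, '/view/21087.htm')
# -> http://baike.baidu.com/view/21087.htm
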
Downloader

import urllib2

class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = urllib2.urlopen(url)

        # anything other than HTTP 200 counts as a failed download
        if response.getcode() != 200:
            return None

        return response.read()
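
The downloader can be tried on its own (a sketch I'm adding; assumes network access and the Python 2 urllib2 used above):

downloader = HtmlDownloader()
html = downloader.download('http://baike.baidu.com/item/oi/74020')
if html is not None:
    print 'fetched %d bytes' % len(html)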