• Python 爬虫学习 urllib


    1. 网页抓取
      # -*-coding: utf-8 -*-

      import urllib

      # Open the page and dump its raw HTML to stdout.
      target = "http://www.cndzz.com/"
      page = urllib.urlopen(target)
      print(page.read())
      

        对于网页编码为gb2312等格式的网页,使用如下方法

      # -*-coding: utf-8 -*-

      import urllib

      # Pages served as gb2312/gbk must be transcoded to UTF-8 before printing.
      response = urllib.urlopen("http://www.sina.com.cn/")
      body = response.read()
      print(body.decode("gbk").encode("utf-8"))
      

        如果有多种编码,可以使用如下方法

      # -*-coding: utf-8 -*-
      # Author:Evilxr

      import urllib

      # "ignore" drops any bytes that are not valid GBK, which keeps the
      # conversion from raising on pages that mix several encodings.
      page = urllib.urlopen("http://www.sina.com.cn/")
      print(page.read().decode("gbk", "ignore").encode("utf-8"))
      

        

    2. 获取Web服务器头部信息
      # -*-coding: utf-8 -*-
      # Author:Evilxr

      import urllib

      # Print the HTTP response headers the web server sent back.
      site = "http://www.sina.com.cn/"
      conn = urllib.urlopen(site)
      print(conn.info())
      

        返回信息:

      Server: nginx
      Date: Mon, 10 Nov 2014 12:54:50 GMT
      Content-Type: text/html
      Last-Modified: Mon, 10 Nov 2014 12:54:11 GMT
      Vary: Accept-Encoding
      Expires: Mon, 10 Nov 2014 12:55:50 GMT
      Cache-Control: max-age=60
      X-Powered-By: schi_v1.03
      Age: 27
      Content-Length: 563513
      X-Cache: HIT from cd31-151.sina.com.cn
      Connection: close
      
      
      [Finished in 0.2s]
      

        

    3. 获取网页状态码
      # -*-coding: utf-8 -*-
      # Author:Evilxr

      import urllib

      url = "http://www.sina.com.cn/"

      html = urllib.urlopen(url)

      # HTTP status code: 200 OK, 301 redirect, 403 forbidden,
      # 404 not found, 500 server busy / no response.
      print(html.getcode())

      # The URL that was actually retrieved -- may differ from `url`
      # if the server redirected the request.
      print(html.geturl())

      # Bug fix: the original `html.close` only referenced the bound method
      # and never called it, so the connection was left open.
      html.close()
      

        

    4. 保存网页内容
      # -*-coding: utf-8 -*-
      # Author:Evilxr

      import urllib

      url = "http://www.cdnzz.com/"

      # Download the page straight into a local file.
      # Fix: use a raw string for the Windows path. "d:\evilxr.html" only
      # worked because Python passes unrecognized escapes through; a path
      # such as "d:\temp" would silently turn "\t" into a tab character.
      urllib.urlretrieve(url, r"d:\evilxr.html")
      

        

    5. 获取网站编码类型
      # coding:utf8
      # Author:Evilxr

      import urllib

      # Read the declared charset out of the Content-Type response header.
      page = urllib.urlopen("http://www.163.com")
      print(page.info().getparam('charset'))
      page.close()
      

        返回:

      GBK
      [Finished in 0.6s]
      

        

      # coding:utf8
      # Author:Evilxr

      import urllib

      # Same idea as above: pull the charset parameter from the
      # Content-Type header of the response.
      page = urllib.urlopen("http://www.cnblogs.com/Evilxr")
      print(page.info().getparam('charset'))
      page.close()
      

        返回:

      utf-8
      [Finished in 0.3s]
      

        

    6. 自动获取网站编码 chardet[字符集检测]
      #先安装chardet
      #pip install chardet
      # coding:utf8
      
      import urllib 
      import chardet
      
      def automatic_detect(url):
      	"""" doc """
      	content = urllib.urlopen(url).read()
      	result= chardet.detect(content)
      	encoding = result['encoding']
      	return encoding
      
      url_list = ["http://www.sina.com.cn/", 
      			 "http://www.cnblogs.com/evilxr",
      			  "http://bbs.hackav.com/",
      			  "http://www.baidu.com/",
      			  "http://fuli.ba/"]
      for url in url_list:
      	print url, automatic_detect(url)
      http://www.sina.com.cn/ GB2312
      http://www.cnblogs.com/evilxr utf-8
      http://bbs.hackav.com/ GB2312
      http://www.baidu.com/ utf-8
      http://fuli.ba/ utf-8
      [Finished in 17.1s]
      

        

  • 相关阅读:
    filter过滤组件Grok 正则捕获
    Logstash 最佳实践配置语法
    【小记】Ubuntu 工具链升级 gcc 流程
    图数据集cora详解
    HGNN超图神经网络代码
    modelnet40数据集
    GCN代码解读(版本1)
    GCN代码解读(版本2:github版本)
    企业微信代开发自建应用开发
    企业微信服务商工具实现
  • 原文地址:https://www.cnblogs.com/evilxr/p/4036697.html
Copyright © 2020-2023  润新知