• Web Scraping Basics (Part 2)


    Parsing data with XPath

    Environment setup:

    pip install lxml

    How the parsing works:

      1. Fetch the page's HTML source.

      2. Instantiate an etree object and load the page source into it.

      3. Call the object's xpath method to locate the target tags.

      Note: the xpath method must be given an XPath expression to locate tags and capture their content; see the minimal sketch below.
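
    A minimal sketch of these three steps, run against an inline HTML string (the snippet and tag names are made up for illustration):

    from lxml import etree

    # stand-in for step 1: normally this would come from requests.get(...).text
    page_text = '<html><body><ul><li><a href="/a">first</a></li><li><a href="/b">second</a></li></ul></body></html>'

    tree = etree.HTML(page_text)              # step 2: load the source into an etree object
    links = tree.xpath('//ul/li/a/@href')     # step 3: locate tags with an XPath expression
    texts = tree.xpath('//ul/li/a/text()')    # text() captures the tag content
    print(links)  # ['/a', '/b']
    print(texts)  # ['first', 'second']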

    Examples:

    # Project goal: scrape second-hand housing data from 58.com
    import requests
    from lxml import etree

    url = 'https://bj.58.com/shahe/ershoufang/?PGTID=0d30000c-0047-e1d9-5baf-47db5897c065&ClickID=1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    page_text = requests.get(url=url, headers=headers).text

    tree = etree.HTML(page_text)
    li_list = tree.xpath("//ul[@class='house-list-wrap']/li")
    fp = open('58.csv', 'w', encoding='utf8')
    for li in li_list:
        title = li.xpath('./div[2]/h2/a/text()')[0]    # listing title
        price = li.xpath('./div[3]//text()')           # price text fragments, joined below
        price = "".join(price)
        fp.write(title + ':' + price + '\n')
    fp.close()
    print("over!")
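
    A small variant worth noting: the loop above writes plain "title:price" lines into a .csv file. If titles can contain commas or quotes, Python's csv module handles the quoting; a sketch:

    import csv
    with open('58.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'price'])      # header row
        # inside the scraping loop: writer.writerow([title, price])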
    # Parse image data from: http://pic.netbian.com/4kmeinv/
    import requests
    import urllib.request
    import os
    from lxml import etree

    url = 'http://pic.netbian.com/4kmeinv/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # response.encoding = 'utf-8'  # does not help here: the page is GBK, not UTF-8
    page_text = response.text

    tree = etree.HTML(page_text)

    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    if not os.path.exists('./imgs'):
        os.mkdir('./imgs')

    for li in li_list:
        title = li.xpath('./a/b/text()')[0]
        # requests mis-decoded the GBK page as ISO-8859-1; round-trip to recover the Chinese title
        img_name = title.encode('iso-8859-1').decode('gbk')
        img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_path = './imgs/' + img_name + '.jpg'
        urllib.request.urlretrieve(url=img_url, filename=img_path)
        print(img_name, "downloaded")
    print("over!!!")
    Parsing image data
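
    Why the encode/decode round-trip in that loop works, demonstrated on a throwaway string (an alternative fix is simply response.encoding = 'gbk' before reading response.text):

    # simulate requests guessing ISO-8859-1 for a GBK page:
    garbled = '中文'.encode('gbk').decode('iso-8859-1')   # mojibake, e.g. 'ÖÐÎÄ'
    print(garbled.encode('iso-8859-1').decode('gbk'))     # -> 中文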
    # [Key example] Download images from jandan.net: http://jandan.net/ooxx
    # The image URLs are obfuscated (an anti-scraping measure)
    import requests
    import urllib.request
    import os
    import base64
    from lxml import etree

    url = 'http://jandan.net/ooxx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    page_text = requests.get(url=url, headers=headers).text

    tree = etree.HTML(page_text)

    if not os.path.exists('./jiandan'):
        os.mkdir('./jiandan')

    img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')
    for img_hash in img_hash_list:
        # the "hash" is just the Base64-encoded image URL (minus the scheme)
        img_url = 'http:' + base64.b64decode(img_hash).decode()
        img_name = img_url.split('/')[-1]
        img_path = './jiandan/' + img_name
        urllib.request.urlretrieve(url=img_url, filename=img_path)
    print('over!')
    Downloading images from jandan.net
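
    A quick standalone check of the decoding step, using a made-up hash (just the Base64 of a plain URL):

    import base64
    print(base64.b64decode('aHR0cDovL2V4YW1wbGUuY29t').decode())  # -> http://example.com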
    # Scrape resume templates from sc.chinaz.com
    import requests
    import random
    import os
    from lxml import etree


    url = 'http://sc.chinaz.com/jianli/free_%s.html'

    headers = {
        'Connection': 'close',  # drop the connection as soon as the request finishes (frees pool resources promptly)
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    if not os.path.exists('./jianli'):
        os.mkdir('./jianli')

    for i in range(1, 4):
        if i == 1:
            new_url = 'http://sc.chinaz.com/jianli/free.html'  # the first page has no page number
        else:
            new_url = url % i

        response = requests.get(url=new_url, headers=headers)
        response.encoding = 'utf8'
        page_text = response.text

        tree = etree.HTML(page_text)

        div_list = tree.xpath('//div[@id="container"]/div')
        for div in div_list:
            name = div.xpath('./p//text()')[0]        # template name
            detail_url = div.xpath('./a/@href')[0]    # detail page for this template

            # the detail page lists several mirror download links; pick one at random
            detail_page_text = requests.get(url=detail_url, headers=headers).text
            tree = etree.HTML(detail_page_text)
            download_list = tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
            download_url = random.choice(download_list)
            data = requests.get(url=download_url, headers=headers).content
            file_name = name + '.rar'
            file_path = './jianli/' + file_name

            with open(file_path, 'wb') as fp:
                fp.write(data)
                print(file_name, "downloaded")

    print('over!')
    Scraping resume templates from sc.chinaz.com
    import requests
    from lxml import etree


    url = 'https://www.aqistudy.cn/historydata/'
    headers = {
        'Connection': 'close',  # drop the connection as soon as the request finishes (frees pool resources promptly)
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@class="col-lg-9 col-md-8 col-sm-8 col-xs-12"]/div')
    print(div_list)

    for div in div_list:
        title = div.xpath('./div[1]//text()')[0]   # section title (hot cities / all cities)
        print(title)
        ul_list = div.xpath('./div[2]/ul')

        for ul in ul_list:
            fl = ul.xpath('./div[1]//text()')      # optional group label (e.g. an initial letter)
            if fl:
                print(fl[0])
            # the "|" union matches li tags in either of the two layouts the page uses
            li_list = ul.xpath('./li | ./div[2]/li')

            for li in li_list:
                city_name = li.xpath('./a/text()')[0]
                print(city_name)
    Parsing all city names
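
    The "|" used above is XPath's union operator: one expression can cover several alternative layouts in a single pass. A tiny standalone demo:

    from lxml import etree

    tree = etree.HTML('<div><ul><li>a</li></ul><ol><li>b</li></ol></div>')
    print(tree.xpath('//ul/li/text() | //ol/li/text()'))  # -> ['a', 'b']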

    Image lazy loading: target the right attribute. Sites that lazy-load images put a placeholder in src and keep the real URL in a pseudo-attribute (commonly src2 or original), which JavaScript swaps in as you scroll; a scraper sees only the raw HTML, so it must read the pseudo-attribute directly.
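
    A hedged sketch of the idea; the HTML snippet and the src2 attribute name are illustrative assumptions, so inspect the raw page source to find the name a given site actually uses:

    from lxml import etree

    # made-up snippet mimicking a lazy-loading gallery
    page_text = '<div class="pics"><img src="loading.gif" src2="http://example.com/real.jpg"></div>'
    tree = etree.HTML(page_text)
    for img in tree.xpath('//div[@class="pics"]/img'):
        src = img.xpath('./@src2') or img.xpath('./@src')  # prefer the pseudo-attribute, fall back to src
        print(src[0])  # -> http://example.com/real.jpg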

    Basic use of proxy IPs

    # Set a proxy IP for the request; free lists: www.goubanjia.com, kuaidaili, xicidaili
    # The proxy type (http/https) must match the scheme of the request URL
    import requests

    url = 'https://www.baidu.com/s?wd=ip'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    page_text = requests.get(url=url, headers=headers, proxies={'https': '114.88.53.19:53281'}).text

    with open('./ip.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
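
    Free proxies die quickly, so a slightly more defensive variant (a sketch reusing url and headers from above; the proxy address is a placeholder, not a live server) supplies both schemes and falls back to a direct request on failure:

    proxies = {
        'http': 'http://114.88.53.19:53281',
        'https': 'http://114.88.53.19:53281',
    }
    try:
        page_text = requests.get(url=url, headers=headers, proxies=proxies, timeout=5).text
    except requests.exceptions.RequestException:
        page_text = requests.get(url=url, headers=headers, timeout=5).text  # direct, no proxy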
• Original article: https://www.cnblogs.com/qq849784670/p/10446073.html