• XPath数据解析


    好段子网内容爬取

     from lxml import etree
     import requests

     # Collect the image URLs shown on the haoduanzi.com front page.
     url = 'http://www.haoduanzi.com'
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
     }

     # Fetch the page source and build an element tree from it.
     url_content = requests.get(url=url, headers=headers).text
     tree = etree.HTML(url_content)

     # xpath() returns a list; the slice drops the first two and the last two
     # child <div>s of #main (presumably non-content wrappers -- verify
     # against the live page).
     div_list = tree.xpath('//div[@id="main"]/div')[2:-2]

     # Keep the first image src found inside each remaining <div>.
     ur_list = [div.xpath('./div/img/@src')[0] for div in div_list]
     print(ur_list)
    爬虫代码

    煎蛋网图片爬取

     from lxml import etree
     import requests
     import base64

     # Collect image URLs from jandan.net/ooxx. The page stores each image
     # address base64-encoded in the text of a <span class="img-hash"> element.
     url = "http://jandan.net/ooxx"
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
     }
     url_content = requests.get(url=url, headers=headers).text
     tree = etree.HTML(url_content)

     # One encoded string per image.
     img_list = tree.xpath('//span[@class="img-hash"]/text()')

     # The decoded value has no scheme, so "http:" is prepended to form the
     # full address.
     img_so_list = []
     for imgcode in img_list:
         img_url = "http:" + base64.b64decode(imgcode).decode()
         img_so_list.append(img_url)

     # Print explicitly: a bare expression is only echoed in an interactive
     # session or notebook and shows nothing when this runs as a script.
     print(img_so_list)
    爬虫代码

    站长素材中进行免费简历模板的下载

     from lxml import etree
     import requests
     import random

     # Download free resume templates from sc.chinaz.com: walk the list pages,
     # open each template's detail page, pick one of its download links at
     # random and save the archive next to this script as "<title>.rar".
     url = 'http://sc.chinaz.com/jianli/free_%d.html'
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
         # The comma after the User-Agent entry is required; without it the
         # dict literal is a syntax error.
         'Connection': 'close',
     }

     for pageNum in range(1, 3):
         # The first list page has no numeric suffix in its URL.
         if pageNum == 1:
             pageUrl = 'http://sc.chinaz.com/jianli/free.html'
         else:
             pageUrl = url % pageNum

         response = requests.get(url=pageUrl, headers=headers)
         # Decode the page source as UTF-8 before reading .text.
         response.encoding = 'utf-8'
         page_text = response.text

         # Parse each template's name and the URL of its detail page.
         tree = etree.HTML(page_text)
         div_list = tree.xpath('//div[@id="container"]/div')
         for div in div_list:
             detail_urls = div.xpath('./p/a/@href')
             titles = div.xpath('./p/a/text()')
             # Skip container children that are not template entries instead
             # of failing with IndexError on the [0] lookups.
             if not detail_urls or not titles:
                 continue
             detail_url = detail_urls[0]
             title = titles[0]

             # Fetch the detail page. A separate tree is used because each
             # etree object holds a single page's source.
             detail_text = requests.get(url=detail_url, headers=headers).text
             detail_tree = etree.HTML(detail_text)

             # Download links offered for this template: the first link of
             # every <li> in the download list.
             li_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul[@class="clearfix"]/li')
             download_list = []
             for li in li_list:
                 hrefs = li.xpath('./a/@href')
                 if hrefs:
                     download_list.append(hrefs[0])
             # random.choice raises IndexError on an empty list, so skip
             # templates that expose no download link.
             if not download_list:
                 continue

             # Pick one download link at random and fetch the archive bytes.
             download_data_url = random.choice(download_list)
             data = requests.get(url=download_data_url, headers=headers).content

             data_path = title + '.rar'
             with open(data_path, 'wb') as fp:
                 fp.write(data)
             print(data_path+'下载成功')
    爬虫代码
  • 相关阅读:
    数据库压力测试的参考地址
    Infopath表单部署到Farm的方法
    oracle 的几个开发工具比较
    智能Web算法/NLP 参考图书
    Wireshark & Ethereal包分析工具【图书节选】
    Sharepoint内置的”翻译管理库”体验
    开发相关“视频公开课webcast”资源地址
    读书:架构师的12项技能 12 ESSENTIAL SKILLS FOR SOFTWARE ARCHITECTS
    Linux 下Oracle Client JAVA JDBC 集成点滴
    MOS2010的界面介绍和定制方法简介【资料汇集】
  • 原文地址:https://www.cnblogs.com/duanhaoxin/p/10110867.html
Copyright © 2020-2023  润新知