Crawler Notes 3: bs4, XPath & JSONPath


    1. bs4 rules

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(open('test_bs4.html', encoding='utf-8'), 'lxml')

    '''1. Tag lookup'''
    # print(soup.a)

    '''2. Getting attributes'''
    # print(soup.a['title'])        # get a single attribute
    # print(soup.a.attrs)           # get all attributes as a dict
    # print(soup.a.attrs['title'])  # get a single attribute via the dict

    '''3. Getting text content'''
    # print(soup.a.text)
    # print(soup.a.string)          # only the tag's own text; cannot reach content in child tags
    # print(soup.a.get_text())

    '''4. The find method'''
    # print(soup.find('a'))                 # first <a> tag
    # print(soup.find('a', title='qin'))    # filter by attribute
    # print(soup.find('a', class_='nu'))    # class is a Python keyword, so add the trailing underscore
    # print(soup.find('a', id='feng'))

    '''5. Finding child tags under a given tag'''
    # div = soup.find('div', class_='tang')
    # print(div.find('a'))          # first <a> inside the div with class="tang"

    '''6. The find_all method'''
    # div = soup.find('div', class_='tang')
    # print(div.find_all('a'))
    # print(div.find_all('a', limit=2))   # only the first two matches
    # print(soup.find_all(['a', 'b']))    # match several tag types at once

    '''select -- find via CSS selectors'''
    # print(soup.select('.tang .nu'))
    # print(soup.select('#feng'))
    # print(soup.select('.tang .nu')[0].text)
    # print(soup.select('.tang .nu')[0]['href'])
    '''select always returns a list'''

    div = soup.find('div', class_='tang')
    print(div.select('#feng'))
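
    The snippets above read a local test_bs4.html, which is not shown. Here is a self-contained sketch with inline markup instead of the file; the HTML below is an assumption reconstructed from the selectors used above:

    from bs4 import BeautifulSoup

    # hypothetical markup matching the selectors above (.tang, .nu, #feng);
    # the real test_bs4.html is not part of these notes
    html = '''
    <div class="tang">
        <ul>
            <li><a href="http://example.com/1" class="nu" title="qin">first link</a></li>
            <li><a href="http://example.com/2" id="feng">second link</a></li>
        </ul>
    </div>
    '''

    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('.tang .nu')[0]['href'])   # http://example.com/1
    print(soup.find('a', id='feng').text)        # second link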

    2. bs4 example

    import urllib.request
    import urllib.parse
    from bs4 import BeautifulSoup
    import json

    class ZhiLianSpider(object):

        url = 'https://ty.fang.anjuke.com/loupan/'

        def __init__(self, qu, num, start_page, end_page):
            self.qu = qu                 # district
            self.num = num               # number of bedrooms
            self.start_page = start_page
            self.end_page = end_page
            self.items = []

        # build the url and create the request object
        def handle_request(self, page):
            '''assemble the url'''
            url = self.url + self.qu + '/h' + self.num + '/'
            # the original never used `page`; appending a 'pN/' segment is an
            # assumption about the site's pagination scheme
            if page > 1:
                url += 'p' + str(page) + '/'
            # print(url)

            '''create the request object'''
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                                    ' Chrome/73.0.3683.86 Safari/537.36',
                      }
            req = urllib.request.Request(url=url, headers=header)
            return req

        # parse the page content
        def parse_content(self, content):
            soup = BeautifulSoup(content, 'lxml')
            '''idea: first find the box for every listing, then pull the concrete details out of each box'''
            # find the boxes
            title_list = soup.select('.key-list > .item-mod')[1:]
            print(len(title_list))
            # print(title_list)

            # walk the boxes
            for box in title_list:
                # defaults, in case a box is missing a field
                title = address = price = huxing = ''

                # title
                if box.select('.infos .lp-name h3 span'):
                    title = box.select('.infos .lp-name h3 span')[0].text

                # address
                if box.select('.infos .address span'):
                    address = box.select('.infos .address span')[0].text

                # price
                if box.select('.favor-pos p span'):
                    price = box.select('.favor-pos p span')[0].text

                # floor plan
                if box.select('.huxing span'):
                    huxing = box.select('.huxing span')[0].text

                # collect into a dict
                item = {'title': title,
                        'address': address,
                        'price': price,
                        'huxing': huxing}

                # then append to the list
                self.items.append(item)

        def run(self):
            # crawl every page in the range
            for page in range(self.start_page, self.end_page + 1):

                # create the request
                request = self.handle_request(page)
                # print(request)

                # send the request and fetch the content
                content = urllib.request.urlopen(request).read().decode()
                # print(content)
                # with open('fang.html', 'w', encoding='utf8') as fp:
                #     fp.write(content)

                # parse the content; results accumulate in self.items
                self.parse_content(content)

            # write to file (the data is a JSON string, so use a .json name rather than .csv)
            string = json.dumps(self.items, ensure_ascii=False)
            with open('fang.json', 'w', encoding='utf8') as fp:
                fp.write(string)

    def main():

        # read the search parameters
        qu = input('District: ')
        num = input('Number of bedrooms: ')
        start_page = int(input('Start page: '))
        end_page = int(input('End page: '))

        # create the spider and start crawling
        spider = ZhiLianSpider(qu, num, start_page, end_page)
        spider.run()


    if __name__ == '__main__':
        main()
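
    The scraped items end up as one JSON string. If an actual CSV is wanted (the original wrote the JSON into fang.csv), the standard library's csv.DictWriter produces one; a minimal sketch, assuming the English item keys used above:

    import csv

    # items shaped like those produced by parse_content above
    items = [{'title': 'Example Estate', 'address': 'Somewhere Rd.',
              'price': '12000', 'huxing': '3BR'}]

    with open('fang.csv', 'w', encoding='utf8', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=['title', 'address', 'price', 'huxing'])
        writer.writeheader()
        writer.writerows(items)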

    3. XPath on a local file

    from lxml import etree

    # build the element tree (for HTML that is not well-formed XML,
    # pass etree.HTMLParser() as the second argument)
    tree = etree.parse('xpath.html')
    print(tree)

    ret = tree.xpath('//div[@class="song"]/p[2]')
    print(ret)

    ret = tree.xpath('//div[@class="tang"]/ul/li[2]/a/text()')[0]
    print(ret)
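
    xpath.html itself is not shown. Here is a self-contained sketch with inline markup (an assumption reconstructed from the two queries above), using etree.HTML to parse a string instead of a file:

    from lxml import etree

    # hypothetical markup matching the queries above
    html = '''
    <div class="song"><p>first</p><p>second</p></div>
    <div class="tang">
        <ul>
            <li><a href="#">one</a></li>
            <li><a href="#">two</a></li>
        </ul>
    </div>
    '''

    tree = etree.HTML(html)
    print(tree.xpath('//div[@class="song"]/p[2]/text()')[0])        # second
    print(tree.xpath('//div[@class="tang"]/ul/li[2]/a/text()')[0])  # two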

    4. XPath example

    import urllib.request
    import urllib.parse
    from lxml import etree
    import time
    import os

    '''Scrape architecture photos and parse them with XPath.
       The site lazy-loads its images.'''

    def create_request(url, page):
        # assemble the url (page 1 has no suffix)
        if page == 1:
            req_url = url.format('')
        else:
            req_url = url.format('_' + str(page))

        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

        # build the request object
        req = urllib.request.Request(url=req_url, headers=header)
        return req

    def download_img(img_src):

        # create the folder
        dirname = 'jianzhu'
        if not os.path.exists(dirname):
            os.mkdir(dirname)

        # derive the file name
        img_name = os.path.basename(img_src)

        # build the file path
        filepath = dirname + '/' + img_name

        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

        req = urllib.request.Request(url=img_src, headers=header)

        rep = urllib.request.urlopen(req)

        with open(filepath, 'wb') as fp:
            fp.write(rep.read())

    def parse_content(content):
        tree = etree.HTML(content)
        # the page lazy-loads images: the real URL sits in src2, not src
        img_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
        # print(img_list)
        # print(len(img_list))

        for img_src in img_list:
            download_img(img_src)

    def main():
        url = 'http://sc.chinaz.com/tupian/tesejianzhutupian{}.html'

        start_page = int(input('Start page: '))
        end_page = int(input('End page: '))

        for page in range(start_page, end_page + 1):
            print('Page %s: downloading...' % page)
            # create the request
            req = create_request(url, page)

            # send the request and fetch the content
            rep = urllib.request.urlopen(req).read().decode()

            # parse the content and download the images
            parse_content(rep)
            print('Page %s: done.' % page)
            time.sleep(2)

    if __name__ == '__main__':
        main()
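
    Lazy-load attribute names differ from site to site (src2 here; others use data-src or similar). A defensive variant that falls back to src when the lazy attribute is missing (a sketch; every attribute name other than src2 is an assumption):

    from lxml import etree

    def extract_img_urls(content):
        # try the lazy-load attributes first, then fall back to plain src
        tree = etree.HTML(content)
        urls = []
        for img in tree.xpath('//div[@id="container"]//img'):
            src = img.get('src2') or img.get('data-src') or img.get('src')  # data-src is a guess
            if src:
                urls.append(src)
        return urls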

    5. The json functions

    import json

    lt = [{'name': '王宝强', 'age': '30'},
          {'name': '王保墙', 'age': '32'},
          {'name': '王饱蔷', 'age': '35'},
          {'name': '王煲樯', 'age': '33'},
          ]

    '''dumps(): python object -> json string'''
    json_str = json.dumps(lt)      # don't shadow the builtin str
    print(json_str)
    print(type(json_str))

    '''loads(): json string -> python object'''
    r = json.loads(json_str)
    print(r)
    print(type(r))

    '''dump(): serialize straight into a file'''
    json.dump(lt, open('book1.txt', 'w', encoding='utf8'))   # the original passed an undefined tt

    '''load(): deserialize straight from a file'''
    obj = json.load(open('xx.txt', 'r', encoding='utf8'))
    print(type(obj))
    print(obj)
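
    One detail worth knowing: dumps() escapes non-ASCII characters by default, which is why the spider in section 2 passes ensure_ascii=False. A quick comparison:

    import json

    print(json.dumps({'name': '王宝强'}))                      # {"name": "\u738b\u5b9d\u5f3a"}
    print(json.dumps({'name': '王宝强'}, ensure_ascii=False))  # {"name": "王宝强"}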

    6. jsonpath rules

    import jsonpath
    import json

    '''load the json document into a python object'''
    obj = json.load(open('book1.txt', 'r', encoding='utf8'))
    # print(type(obj))

    '''the author of the third book'''
    # ret = jsonpath.jsonpath(obj, '$.store.book[2].author')
    # print(ret)

    '''all authors'''
    # ret = jsonpath.jsonpath(obj, '$..author')
    # print(ret)

    '''all nodes under store'''
    # ret = jsonpath.jsonpath(obj, '$.store.*')
    # print(ret)

    '''the third book under store'''
    # ret = jsonpath.jsonpath(obj, '$..book[2]')
    # print(ret)

    '''the last book under store'''
    # ret = jsonpath.jsonpath(obj, '$..book[(@.length-1)]')
    # print(ret)

    '''the first two books'''
    # ret = jsonpath.jsonpath(obj, '$..book[:2]')
    # print(ret)

    '''the books that have a status key'''
    ret = jsonpath.jsonpath(obj, '$..book[?(@.status)]')
    print(ret)
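
    The queries above assume a document with the classic store/book structure (not the name list dumped into book1.txt in section 5). A minimal document they would run against, modeled on the well-known Goessner store example with a status key added; the exact fields are assumptions:

    import json

    store = {
        'store': {
            'book': [
                {'author': 'A', 'title': 'first'},
                {'author': 'B', 'title': 'second'},
                {'author': 'C', 'title': 'third', 'status': 'sold out'},
            ]
        }
    }
    json.dump(store, open('book1.txt', 'w', encoding='utf8'))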