• 第十一节 使用lxml库和XPath解析HTML


    from lxml import etree
    
    
    '''
    需求:
    1、获取所有的tr标签
    2、获取第二个tr标签
    3、获取所有class等于even的标签
    4、获取所有a标签的href属性
    5、获取所有的职位信息(纯文本)
    '''
    def parse_tengxun(path="tengxun.html"):
        """Extract job postings from a saved recruitment HTML page.

        The page (presumably a saved Tencent careers listing -- confirm against
        the actual file) is parsed with lxml's HTML parser, and every ``<a>``
        directly under a ``<div class="recruit-list">`` is turned into one
        record.

        Args:
            path: Location of the HTML file to parse. Defaults to
                ``"tengxun.html"`` so existing no-argument calls keep working.

        Returns:
            A list of dicts, one per posting, with the keys ``'职位'`` (title),
            ``'地点'`` (location), ``'职位类型'`` (category), ``'发布日期'``
            (publish date) and ``'职位描述'`` (description). A field that is
            missing from the markup is returned as an empty string instead of
            aborting the whole parse.
        """

        def pick(items, index):
            """Return ``items[index]``, or ``""`` when the list is too short."""
            return items[index] if len(items) > index else ""

        parser = etree.HTMLParser(encoding='utf-8')
        html = etree.parse(path, parser=parser)

        # Other queries that work on the same tree, kept for reference:
        #   html.xpath(r'//h4[@class="recruit-title"]')  -> list of <h4> elements
        #   html.xpath(r'//div/a/@href')                 -> list of href strings

        # This query yields <a> elements (not strings); each one wraps a posting.
        links = html.xpath(r'//*/div[@class="recruit-list"]/a')

        positions = []
        for link in links:
            # The heading text is split on "-" and the second segment is used
            # as the job title; fall back to the full heading when there is no
            # "-" to split on.
            title = pick(link.xpath("./h4/text()"), 0)
            segments = title.split("-")
            job = segments[1] if len(segments) > 1 else title

            # Text nodes of the <span>s under <p>; indices 1..3 hold location,
            # category and publish date respectively.
            details = link.xpath("./p//span//text()")

            # normalize-space() returns a single string with whitespace
            # collapsed (empty when the paragraph is absent).
            description = link.xpath(
                "normalize-space(./p[@class='recruit-text']/text())"
            )

            positions.append({
                '职位': job,
                '地点': pick(details, 1),
                '职位类型': pick(details, 2),
                '发布日期': pick(details, 3),
                '职位描述': description,
            })
        return positions
    if __name__ == '__main__':
        # Script entry point: parse the saved page and print one record per line.
        for position in parse_tengxun():
            print(position)
  • 相关阅读:
    axis
    LRU
    apk 反编译
    android 设置 button 不同状态的图片
    resin
    scrum 项目管理
    android 国外广告平台
    JNI
    java 通信
    google网站分析
  • 原文地址:https://www.cnblogs.com/kogmaw/p/12506961.html
Copyright © 2020-2023  润新知