"""Practice script: scrape job postings from a saved Tencent careers page.

Exercise requirements:
1. Get all ``tr`` tags.
2. Get the second ``tr`` tag.
3. Get all tags whose ``class`` equals ``even``.
4. Get the ``href`` attribute of every ``a`` tag.
5. Get all job posting information (plain text).
"""
from lxml import etree


def _item_at(items, index, default=""):
    """Return ``items[index]``, or ``default`` when the list is too short."""
    return items[index] if index < len(items) else default


def _position_name(title_nodes):
    """Extract the position name from the text nodes of a posting's ``<h4>``.

    The second hyphen-separated field of the first text node is returned.
    When the title contains no hyphen the whole title is returned, and when
    there is no title text at all an empty string is returned, so a single
    malformed posting cannot abort the scrape with an ``IndexError``.
    """
    if not title_nodes:
        return ""
    parts = title_nodes[0].split("-")
    return parts[1] if len(parts) > 1 else parts[0]


def parse_tengxun(path="tengxun.html"):
    """Parse a saved recruiting page and return its job postings.

    Args:
        path: HTML file to parse. Defaults to ``"tengxun.html"`` in the
            current working directory.

    Returns:
        A list with one dict per ``<a>`` found directly under a
        ``<div class="recruit-list">``. Each dict has the keys ``'职位'``
        (position), ``'地点'`` (location), ``'职位类型'`` (job type),
        ``'发布日期'`` (publish date) and ``'职位描述'`` (description).
        A field whose source node is missing is an empty string.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse(path, parser=parser)

    # This XPath yields a list of <a> elements (not strings); each one wraps
    # a single posting.
    anchors = html.xpath(r'//*/div[@class="recruit-list"]/a')

    positions = []
    for anchor in anchors:
        # All text nodes of every <span> below the posting's <p> elements.
        # Index 0 is deliberately unused -- presumably the business group;
        # TODO confirm against the page markup.
        details = anchor.xpath("./p//span//text()")
        positions.append({
            '职位': _position_name(anchor.xpath("./h4/text()")),
            '地点': _item_at(details, 1),
            '职位类型': _item_at(details, 2),
            '发布日期': _item_at(details, 3),
            # normalize-space() trims the text and collapses whitespace runs;
            # it evaluates to "" when the description paragraph is absent.
            '职位描述': anchor.xpath(
                "normalize-space(./p[@class='recruit-text']/text())"
            ),
        })
    return positions


if __name__ == '__main__':
    for position in parse_tengxun():
        print(position)