• xpath


    存html

    etree.tounicode(etree_html.xpath('//*[@id="prodDetailCotentDiv"]/textarea/table/tbody')[0], method="xml")
    

    requests

    from lxml import etree
    html = requests.get(url=url, headers=self.headers).content.decode('utf-8')
    etree_html = etree.HTML(html)
    data= etree_html.xpath('')[0].strip().replace(':', '')
    data = etree.tounicode(etree_html.xpath('//*[@id="J_ParameterTable"]')[0], method="xml")  # 获取带格式的html内容
    data = etree.tostring(etree_html.xpath('//*[@id="J_ParameterTable"]')[0], encoding="utf-8").decode("utf-8")
    data = etree_html.xpath('//*[@id="classicont"]/div[@class="els-doc-h4"]//text() | //div[@class="els-doc-con-left"]//text()')  # 两个规则放在一起
    pymysql.escape_string(data)  # 存数据库

    html = self.driver.page_source
    html = etree.HTML(html)
    html = etree.fromstring(html.encode('utf-8'))
    html.xpath('')[0].strip()

    # etree.fromstring(html.encode('utf-8')) 遇到内容有标签时会报错,需要把标签替换掉
    # ['�', '�', '�', '�', '�', '�'](原文这 6 个字符已乱码,推测为下面正则匹配到的 &#…; 字符实体)
    biaoqin_list = re.findall(r'(&#.+?;)', html)
    print(biaoqin_list)
    for biaoqin in biaoqin_list:
        html = html.replace(biaoqin, '')
    html = etree.HTML(html.encode('utf-8'))

      

    scrapy

    response.xpath('').extract()[0].strip().replace("'","‘")
    response.xpath('').extract_first().strip().replace("'","‘")
    

    mysql 存html转义

    pymysql.escape_string(item['content'])
    

    循环

    # scrapy
    
    for data in response.xpath('//div[@class="newslist"]/div[@class="item"]'):
        print(data.xpath('span[@class="ui-img"]/a[@class="tag tag-blue"]/text()').extract_first())
    
    
    # requests
    html = json.loads(response.body.decode('utf-8'))['html']
    etree_html = etree.HTML(html)
    print('len:{}'.format(len(etree_html.xpath('//div[@class="review_box  "]'))))
    for data in etree_html.xpath('//div[@class="review_box  "]'):
        data = etree.tostring(data, pretty_print=True).decode('utf-8')   # 按照字符串序列化html文档
        data = etree.HTML(data)
        print(data.xpath('div/div[1]/div[1]/a/@href'))
        print('*' * 100)
    
    
    # requests
    html = json.loads(response.body.decode('utf-8'))['html']
    etree_html = etree.HTML(html)
    print('len:{}'.format(len(etree_html.xpath('//div[@class="review_box  "]'))))
    for data in etree_html.xpath('//div[@class="review_box  "]'):
        print(data.xpath('div/div[1]/div[1]/a/@href'))
        print('*' * 100)

    循环

    for i, data in enumerate(response.xpath('//*[@id="js_content"]/p')):
        html = data.extract()
        etree_html = etree.HTML(html)
        url = etree_html.xpath('//a/@href')
        title = html
        for i in re.findall(r'(<.*?>)', html):
            title = title.replace(i, '')

    取值

    print(etree_html.xpath('//div[@class="review_boxccc  "]'))   # 没有取到是空列表  []
    response.xpath('//*[@id="highlight_player_areaddd"]/div').extract()  # 没有取到是  []
    response.xpath('//*[@id="highlight_player_areaddd"]/div').extract_first()  # 没有取到是  None
    

      

    biaoqin
  • 相关阅读:
    WCF Security系列(1)Security概述
    转:如何修复Team Foundation Server Workgroup Edition 不小心删除了所有Team Foundation Licensed Users组内用户问题
    转:最真实的2006年应届毕业生真实薪水
    如果为网站生成自签名SSL证书
    转 :TFS(Team Foundation Server)使用经验
    The sequence 2 序列2 攻略 (第4049关)
    力扣 223. 矩形面积
    The sequence 2 序列2 攻略 (第5059关)
    The sequence 2攻略 序列2攻略(第3039关)
    题解 P1147 【连续自然数和】
  • 原文地址:https://www.cnblogs.com/yoyo1216/p/10450643.html
Copyright © 2020-2023  润新知