存html
# Serialize the first matched <tbody> (inside the textarea under
# #prodDetailCotentDiv) back to markup text, preserving the HTML formatting.
etree.tounicode(etree_html.xpath('//*[@id="prodDetailCotentDiv"]/textarea/table/tbody')[0], method="xml")
requests
# requests + lxml: fetch a page, parse it, and extract values via XPath.
from lxml import etree

html = requests.get(url=url, headers=self.headers).content.decode('utf-8')
etree_html = etree.HTML(html)
# NOTE(review): the XPath below is a placeholder — fill in a real expression
# before use; an empty XPath raises at runtime.
data = etree_html.xpath('')[0].strip().replace(':', '')
# Serialize a matched element back to markup to keep the formatted HTML content.
data = etree.tounicode(etree_html.xpath('//*[@id="J_ParameterTable"]')[0], method="xml")
# Serialize a matched element to a UTF-8 string (byte-based alternative to
# etree.tounicode).  Fixed typo: original referenced undefined `stree_html`.
data = etree.tostring(etree_html.xpath('//*[@id="J_ParameterTable"]')[0], encoding="utf-8").decode("utf-8")
# Two XPath rules combined with `|` return both node sets in a single query.
data = etree_html.xpath('//*[@id="classicont"]/div[@class="els-doc-h4"]//text() | //div[@class="els-doc-con-left"]//text()')
# Escape before storing HTML into MySQL.
pymysql.escape_string(data)

# Parse a page rendered by Selenium — two equivalent options; use ONE of them.
# (The original reassigned `html` to the parsed Element and then called
# `.encode` on it, which would raise AttributeError.)
html = self.driver.page_source
etree_html = etree.HTML(html)                          # option 1: HTML parser
etree_html = etree.fromstring(html.encode('utf-8'))    # option 2: strict XML parser
etree_html.xpath('')[0].strip()

# etree.fromstring(...) raises when the content contains numeric character
# entities (e.g. surrogate/emoji references) — strip those entities first.
# Fixed typo: original defined `biaoqin_list` but iterated `biaoq_list`.
biaoqin_list = re.findall(r'(&#.+?;)', html)
print(biaoqin_list)
for biaoqin in biaoqin_list:
    html = html.replace(biaoqin, '')
html = etree.HTML(html.encode('utf-8'))
scrapy
# scrapy: replace ASCII single quotes with a full-width quote before building
# SQL text.  Two equivalent extraction styles:
# .extract()[0] raises IndexError when nothing matches:
response.xpath('').extract()[0].strip().replace("'","‘")
# .extract_first() returns None when nothing matches (then .strip() raises
# AttributeError — guard if the match may be empty):
response.xpath('').extract_first().strip().replace("'","‘")
mysql 存html转义（escape）
# Escape quotes/backslashes so the HTML fragment can be embedded in a SQL
# statement.  NOTE(review): newer PyMySQL releases may not expose
# escape_string — prefer parameterized queries (cursor.execute with
# placeholders) instead of string-built SQL.
pymysql.escape_string(item['content'])
循环
# scrapy: iterate matched nodes and run relative XPath on each one.
for data in response.xpath('//div[@class="newslist"]/div[@class="item"]'):
    print(data.xpath('span[@class="ui-img"]/a[@class="tag tag-blue"]/text()').extract_first())

# requests + lxml (variant 1): serialize each node back to a string and
# re-parse it as its own document before querying.
html = json.loads(response.body.decode('utf-8'))['html']
etree_html = etree.HTML(html)
print('len:{}'.format(len(etree_html.xpath('//div[@class="review_box "]'))))
for data in etree_html.xpath('//div[@class="review_box "]'):
    # tostring -> HTML round-trip yields a standalone document per node.
    data = etree.tostring(data, pretty_print=True).decode('utf-8')
    data = etree.HTML(data)
    print(data.xpath('div/div[1]/div[1]/a/@href'))
    print('*' * 100)

# requests + lxml (variant 2): each matched node already supports relative
# XPath directly — no serialize/re-parse round-trip needed.
html = json.loads(response.body.decode('utf-8'))['html']
etree_html = etree.HTML(html)
print('len:{}'.format(len(etree_html.xpath('//div[@class="review_box "]'))))
for data in etree_html.xpath('//div[@class="review_box "]'):
    print(data.xpath('div/div[1]/div[1]/a/@href'))
    print('*' * 100)
循环
# scrapy: walk each <p> under #js_content, collecting link hrefs and the
# paragraph text with markup tags stripped.
for i, data in enumerate(response.xpath('//*[@id="js_content"]/p')):
    html = data.extract()
    etree_html = etree.HTML(html)
    url = etree_html.xpath('//a/@href')
    title = html
    # Use a distinct loop variable here — the original reused `i`, silently
    # clobbering the outer enumerate index.
    for tag in re.findall(r'(<.*?>)', html):
        title = title.replace(tag, '')
取值
# Empty-match behavior of the extraction APIs:
print(etree_html.xpath('//div[@class="review_boxccc "]'))  # lxml xpath: no match -> empty list []
response.xpath('//*[@id="highlight_player_areaddd"]/div').extract()  # scrapy .extract(): no match -> []
response.xpath('//*[@id="highlight_player_areaddd"]/div').extract_first()  # scrapy .extract_first(): no match -> None
表情（emoji / HTML 数字实体）处理