2018的最后一天了,感觉今年有得有失,这里就不再浪费时间了,愿2019万事如意
之前的爬虫信息下载后只写入txt文档,想到以后工作中可能有特殊需求,趁放假有时间将这些数据写入excel表格
以吾爱破解精品软件区为例,代码如下:
# -*- coding: utf-8 -*-
import json,xlwt
import os
import requests
from lxml import etree
class Wuai(object):
    """Scrape thread listings (type, title, time) from the 52pojie
    "精品软件" forum and save them into a single Excel sheet (1.xls)."""

    # Column captions for row 0 of the sheet.
    COLUMNS = ['类型', '标题', '时间']

    def __init__(self):
        self.url = "https://www.52pojie.cn/forum-16-{}.html"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.5.0.17997'}
        self.start = 0
        # Build the workbook ONCE. The original created a fresh Workbook and
        # re-saved "1.xls" inside xml_data, so every page overwrote the
        # previous page's rows. (The original also opened "wuai.txt" in "wb"
        # mode and never wrote to or closed it — that leak is removed.)
        self.workbook = xlwt.Workbook()
        self.sheet = self.workbook.add_sheet("无爱", cell_overwrite_ok=True)
        for col, caption in enumerate(self.COLUMNS):
            self.sheet.write(0, col, caption)
        # Next free row; row 0 holds the header captions.
        self.row = 1

    def get_data(self, url):
        """Fetch *url* and return the response body as text.

        The original built self.headers but never sent it; pass it so the
        request carries the browser User-Agent.
        """
        response = requests.get(url, headers=self.headers)
        return response.text

    def xml_data(self, data):
        """Parse one forum page's HTML and append its rows to the sheet."""
        html = etree.HTML(data)
        for table in html.xpath("//table[@summary='forum_16']"):
            kinds = table.xpath(".//tr/th/em/a/text()")
            if not kinds:
                # Table without a type link (e.g. the header table): skip it.
                continue
            # Empty lists are written as zero rows; the original converted
            # empty matches to None and then crashed on len(None).
            titles = table.xpath(".//tr/th/a[@class='s xst']/text()")
            times = table.xpath(".//tr/td[@class='by']/em/span/text()")
            base = self.row
            for offset, value in enumerate(kinds):
                self.sheet.write(base + offset, 0, value)
            for offset, value in enumerate(titles):
                self.sheet.write(base + offset, 1, value)
            for offset, value in enumerate(times):
                self.sheet.write(base + offset, 2, value)
            # Advance past the longest column so pages never overlap.
            self.row = base + max(len(kinds), len(titles), len(times))

    def run(self):
        """Crawl pages 0 through 5 and save everything to 1.xls."""
        while True:
            url = self.url.format(self.start)
            data = self.get_data(url)
            # requests.Response.text is a str; the original compared it to []
            # which is never true, so the guard was dead. Check falsiness.
            if not data:
                break
            self.xml_data(data)
            if self.start == 5:
                break
            self.start += 1
        # Save once, after all pages have been written.
        self.workbook.save("1.xls")
if __name__ == '__main__':
    # Entry point: construct the scraper and start the crawl.
    Wuai().run()