2018的最后一天了,感觉今年有得有失,这里就不再浪费时间了,愿2019万事如意
之前的爬虫信息下载后只写入txt文档,想到以后工作中可能有特殊需求,趁放假有时间将这些数据写入excel表格
以吾爱破解精品软件区为例,代码如下:
# -*- coding: utf-8 -*-
import json,xlwt
import os
import requests
from lxml import etree
class Wuai(object):
    """Scrape thread listings (type, title, time) from the 52pojie
    "精品软件" forum and save them into a single Excel sheet (1.xls)."""

    # Column captions for row 0 of the sheet.
    COLUMNS = ['类型', '标题', '时间']

    def __init__(self):
        self.url = "https://www.52pojie.cn/forum-16-{}.html"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.5.0.17997'}
        self.start = 0
        # Build the workbook ONCE. The original created a fresh Workbook and
        # re-saved "1.xls" inside xml_data, so every page overwrote the
        # previous page's rows. (The original also opened "wuai.txt" in "wb"
        # mode and never wrote to or closed it — that leak is removed.)
        self.workbook = xlwt.Workbook()
        self.sheet = self.workbook.add_sheet("无爱", cell_overwrite_ok=True)
        for col, caption in enumerate(self.COLUMNS):
            self.sheet.write(0, col, caption)
        # Next free row; row 0 holds the header captions.
        self.row = 1

    def get_data(self, url):
        """Fetch *url* and return the response body as text.

        The original built self.headers but never sent it; pass it so the
        request carries the browser User-Agent.
        """
        response = requests.get(url, headers=self.headers)
        return response.text

    def xml_data(self, data):
        """Parse one forum page's HTML and append its rows to the sheet."""
        html = etree.HTML(data)
        for table in html.xpath("//table[@summary='forum_16']"):
            kinds = table.xpath(".//tr/th/em/a/text()")
            if not kinds:
                # Table without a type link (e.g. the header table): skip it.
                continue
            # Empty lists are written as zero rows; the original converted
            # empty matches to None and then crashed on len(None).
            titles = table.xpath(".//tr/th/a[@class='s xst']/text()")
            times = table.xpath(".//tr/td[@class='by']/em/span/text()")
            base = self.row
            for offset, value in enumerate(kinds):
                self.sheet.write(base + offset, 0, value)
            for offset, value in enumerate(titles):
                self.sheet.write(base + offset, 1, value)
            for offset, value in enumerate(times):
                self.sheet.write(base + offset, 2, value)
            # Advance past the longest column so pages never overlap.
            self.row = base + max(len(kinds), len(titles), len(times))

    def run(self):
        """Crawl pages 0 through 5 and save everything to 1.xls."""
        while True:
            url = self.url.format(self.start)
            data = self.get_data(url)
            # requests.Response.text is a str; the original compared it to []
            # which is never true, so the guard was dead. Check falsiness.
            if not data:
                break
            self.xml_data(data)
            if self.start == 5:
                break
            self.start += 1
        # Save once, after all pages have been written.
        self.workbook.save("1.xls")
if __name__ == '__main__':
    # Entry point: construct the scraper and start the crawl.
    Wuai().run()