A crawler for Taobao transaction records


    A few days ago my boss asked me to crawl some Taobao transaction records to have a look at, so I wrote a crawler in Python. I split the job into two steps. The first step crawls the product links; the code is as follows:


    #-*- coding:utf-8 -*-
    
    import BeautifulSoup
    import urllib2
    import json
    import cookielib
    
    class MyParser:
        def __init__(self,seedurl,destpath,stop_file_path):
            self.seedurl=seedurl
            self.stop_file_path=stop_file_path
            stop_file=open(stop_file_path,"rb")
            splits=stop_file.readline().split("\t")
            stop_file.close()
            self.no_0=splits[0]   # value from the stop file: the s= page offset, initially 0
            self.no_1=splits[1]   # which item on the current page
            self.no_2=splits[2]   # which record of the current item
            self.destpath=destpath
        def run(self):
            print self.no_0
            while int(self.no_0)<5*44:   # first 5 result pages, 44 items per page
                
                # build the page URL; s= is the result offset, stepped by 44 per page
                url=self.seedurl+str(self.no_0)
                headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                req=urllib2.Request(url=url,headers=headers)
                    
                content=urllib2.urlopen(req).read()
                contentsoup=BeautifulSoup.BeautifulSoup(content)
                
                items=contentsoup.findAll("div",{"class":"col title"})
                out_file=open(self.destpath,"a+")
                for item in items:
                    print item.find("a")["href"]
                    out_file.write(item.find("a")["href"]+"\n")
                    out_file.flush()
                out_file.close()
                self.no_0=int(self.no_0)+44
    
            print "ok"
    def run():
        seedurl="http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s="
        item_stop_file="e://item_stop_file"
        record_stop_file="s://record_stop_file"   # not used in this step
        outFile="e://out"
        myParser=MyParser(seedurl,outFile,item_stop_file)
        myParser.run()
    if __name__=="__main__":
        run()
        print "done!"

    This produces the output file e://out, where each line is the link of one product.
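
    The link file may contain duplicate links (for example if an item shows up on more than one result page). Below is a minimal sketch of de-duplicating it before step two; the output path e://out_dedup is just an illustrative name, not something the original code uses:

    # Drop duplicate links from e://out before feeding them to the second crawler.
    seen=set()
    in_file=open("e://out","rb")
    out_file=open("e://out_dedup","wb")   # hypothetical output path
    for line in in_file.readlines():
        url=line.strip()
        if url and url not in seen:
            seen.add(url)
            out_file.write(url+"\n")
    in_file.close()
    out_file.close()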


    Next, based on the file crawled above, we crawl the transaction records of each product; the code is as follows:

    #-*- coding:utf-8 -*-
    '''
    Created on 2014-07-23
    
    @author: sj
    '''
    import re
    import BeautifulSoup
    import os
    import urllib2
    
     
    class MyParser:
        def __init__(self,item_path_file,stop_file,out_file):
            self.item_path_file=item_path_file
            self.stop_file=stop_file
            self.out_file=out_file
            stop_object=open(self.stop_file,"rb")
            splits=stop_object.readline().split("\t")
            stop_object.close()
            self.item=splits[0]     # line number in the link file (which item)
            self.page=splits[1]     # current bid_page for that item
            self.record=splits[2]   # record index within that page
            self.tag=0              # set to 1 once bid_page has been advanced at least once
        def run(self):
    
            
                print self.item
                print self.page
                print self.record
                item_object=open(self.item_path_file,"rb")
                num_items=len(item_object.readlines())
                item_object.close()
                item_object=open(self.item_path_file,"rb")
                for line in item_object.readlines()[int(self.item):num_items]:
                    try:
                        # Tmall product pages use a different transaction-record layout, so skip them
                        if re.search("tmall",line):
                            stop_object=open(self.stop_file,"rb")
                            item_new=stop_object.readline().split("\t")[0]
                            stop_object.close()
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(item_new+"\t"+"0"+"\t"+"0"+"\n")
                            stop_object.flush()
                            stop_object.close()
                            continue
                        print line
                        headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                        req=urllib2.Request(url=line.strip(),headers=headers)   # strip the trailing newline from the link
                        
                        content=urllib2.urlopen(req,timeout=3).read()
                        contentSoup=BeautifulSoup.BeautifulSoup(content)
                        
                        data_api=contentSoup.find("button",{"id":"J_listBuyerOnView"})["data-api"]           
                        parameters=data_api.split("?")[1]
                        stop_object=open(self.stop_file,"rb")
                        bid_page=stop_object.readline().split("\t")[1]
                        stop_object.close()
                        page_size=int(parameters.split("&")[2].split("=")[1])
            
                        while int(bid_page)<int(page_size):
                            print "没有超过pagesize的大小..."
                            print bid_page
                            if self.tag==1:
                                data_api=data_api.replace("bid_page="+str(bid_page),"bid_page="+str(int(bid_page)+1))
                            else:
                                data_api=data_api.replace("bid_page=1","bid_page="+str(int(bid_page)+1))
                            data_url=data_api+"&ua=006tpOWUuXBidH1MRWQZ0InIldyJ0J3AibxJg%3D%3D%7CtaBkcTQxVFHEsbQxBFEEIfY%3D%7CtJFV4sbweFGpcSkNye3Y7ckNKV7GLmae5976Lfo%3D%7Cs6aDR2N2MzZTVsO2szYjpsOmAwbil4KX4tei15LXgpeSh%2FLHQmax%7Csqcy9kFUkBUANfF0sJQ9VOM7Y%2BeTZUGWQQ%3D%3D%7CsSTgxOA3%7CsIVB9vM3Mvbj1pPGAmcSJ0KGk6bDxgJ3EpdTRnMWE9eihwLVAg%3D%3D%7Cv%2Fo%2Bia0L%2FGqyyuwU7KUtCc3o3Vic%2BZzJDVhtOA3aDQ%3D%3D%7CvusvmLyYXOuOy%2B4qrzpfm85L3jpvq767rmp%2Fau8rbjvsKC3pzektWB04vWq9%7Cvfj9%2BDw5%2FdgcCUxZnaj9iEw5XJitafw4LViP&t=1406097091097&callback=Hub.data.records_reload"
                
                            req=urllib2.Request(url=data_url,headers=headers)
                            datacontent=urllib2.urlopen(req,timeout=3).read()
                            datacontent=datacontent.decode("gbk").encode("utf-8")
                            self.deal(datacontent)
                            
                            bid_page=int(bid_page)+1
                            
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(str(self.item)+"\t"+str(bid_page)+"\t"+"0")
                            stop_object.flush()
                            stop_object.close()
                            self.tag=1
                            print self.item
                        if int(bid_page)>=page_size:
                            print "超过page_size大小,保存下一个物品的行数          0    0"
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(str(int(self.item)+1)+"	0	0
    ")
                            stop_object.close()
                            self.item=int(self.item)+1
                    except Exception as e:
                        # a timeout simply skips this link so the crawl keeps going
                        if "timed out" in str(e):
                            continue
                
        def deal(self,content):
            ls=[m.start() for m in re.finditer("\"",content)]
            # slice out the html fragment embedded between quotes in the JSONP response
            content=content[(ls[0]+1):ls[-3]]
            contentSoup=BeautifulSoup.BeautifulSoup(content)
            recordshtml=contentSoup.find("tbody")
            if recordshtml==None:
                return 
            recordshtml=recordshtml.findAll("tr")
            for record in recordshtml:
                cols=record.findAll("td")
                if len(cols)!=5:
                    continue
                name=cols[0].text
                price_em=cols[1].findAll("em")
                price=price_em[-1].text
                num=cols[2].text
                time=cols[3].text
                type=cols[4].text
                line=name+"\t"+price+"\t"+num+"\t"+time+"\t"+type+"\n"
                print line
                out_object=open(self.out_file,"a+")
                out_object.write(line)
                out_object.flush()
                out_object.close()
            print "ok"
    def run():
        item_path_file="e:/item_path_file"
        stop_file="e://stop_file"
        out_file="e://records_file"
        parser=MyParser(item_path_file,stop_file,out_file)
        parser.run()
    if __name__=="__main__":
        run()
        print "done~"


    Here item_path_file is the product-link file produced in step one, and stop_file records how far the crawl has got (strictly speaking you could do without it). The program above does not keep a separate file for data that failed to crawl.
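
    For reference, the stop file is just one line of three tab-separated counters: the item's line number in the link file, the current bid_page, and the record index. Below is a minimal sketch of reading and writing it; read_stop and write_stop are hypothetical helper names, not functions from the code above:

    # Hypothetical helpers around the stop-file format used above:
    # a single line with three tab-separated fields -> item index, bid_page, record index.
    def read_stop(path):
        f=open(path,"rb")
        item,page,record=f.readline().strip().split("\t")
        f.close()
        return int(item),int(page),int(record)

    def write_stop(path,item,page,record):
        f=open(path,"wb")
        f.write(str(item)+"\t"+str(page)+"\t"+str(record)+"\n")
        f.close()

    # e.g. create the file before the first run:
    # write_stop("e://stop_file",0,0,0)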

    Note that some of the links point to Tmall items. Tmall transaction records have a different page format from Taobao's, so Tmall links are simply filtered out.
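
    The code above spots Tmall items with a simple substring search on the URL. A slightly stricter alternative checks the hostname instead; this is only a sketch, assuming the links look like http://item.taobao.com/... or http://detail.tmall.com/...:

    # Alternative Tmall check based on the hostname rather than a substring match.
    import urlparse

    def is_tmall(url):
        host=urlparse.urlparse(url.strip()).netloc
        return host.endswith("tmall.com")

    print is_tmall("http://detail.tmall.com/item.htm?id=123")   # True
    print is_tmall("http://item.taobao.com/item.htm?id=123")    # False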

    What improved in this crawl compared with my earlier attempts:

    Using try/except. I had not used it before, so every time a request timed out I had to stop the program by hand, restart it, and resume from the breakpoint. With try/except, a timeout simply skips the current link, which removes a lot of manual work.
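
    The timeout handling can also be wrapped in a small retry helper so that a flaky link gets a couple of attempts before being skipped. Below is a minimal sketch with the same urllib2 setup; fetch_with_retry is a hypothetical helper, not part of the code above:

    import urllib2

    def fetch_with_retry(url,headers,tries=3,timeout=3):
        # Try the request a few times; return None if every attempt fails,
        # so the caller can simply skip this link and move on.
        for attempt in range(tries):
            try:
                req=urllib2.Request(url=url,headers=headers)
                return urllib2.urlopen(req,timeout=timeout).read()
            except Exception as e:
                print "attempt %d failed: %s"%(attempt+1,e)
        return None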


    I later realised I had been doing all of this by hand; there is also the Scrapy framework, which makes this kind of crawling somewhat simpler.
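
    For comparison, here is a rough sketch of what step one might look like as a Scrapy spider. The CSS selector mirrors the div class="col title" lookup in the first script, but both the selector and the trimmed-down seed URL are assumptions that would need checking against the real page; it could be run with something like scrapy runspider spider.py -o links.json.

    # Rough Scrapy sketch of step one (collecting product links); selector and URL are assumptions.
    import scrapy

    class TaobaoLinkSpider(scrapy.Spider):
        name="taobao_links"
        # trimmed-down version of the seed URL above, paged with s=0,44,...,176
        start_urls=["http://s.taobao.com/search?q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&s=%d"%s
                    for s in range(0,5*44,44)]

        def parse(self,response):
            # assumed selector, mirroring the div class="col title" lookup in the first script
            for href in response.css("div.col.title a::attr(href)").extract():
                yield {"link":href}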

