• A crawler for Taobao transaction records


    A few days ago, shortly after I joined, my boss asked me to crawl some Taobao transaction records as a first look, so I wrote a crawler in Python. I split the crawl into two steps. The first step crawls the product links; the code is as follows:


    #-*- coding:utf-8 -*-
    
    import BeautifulSoup
    import urllib2
    
    class MyParser:
        def __init__(self,seedurl,destpath,stop_file_path):
            self.seedurl=seedurl
            self.stop_file_path=stop_file_path
            stop_file=open(stop_file_path,"rb")
            splits=stop_file.readline().split("\t")
            stop_file.close()
            self.no_0=splits[0]   # offset stored in the stop file: starts at 0
            self.no_1=splits[1]   # index of the current item on the page
            self.no_2=splits[2]   # index of the current record of that item
            self.destpath=destpath
        def run(self):
            print self.no_0
            while int(self.no_0)<5*44:   # crawl the first five result pages (44 items per page)
                # build this page's URL from the seed plus the current offset
                url=self.seedurl+str(self.no_0)
                headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                req=urllib2.Request(url=url,headers=headers)

                content=urllib2.urlopen(req).read()
                contentsoup=BeautifulSoup.BeautifulSoup(content)

                # each search result title sits in a <div class="col title">
                items=contentsoup.findAll("div",{"class":"col title"})
                out_file=open(self.destpath,"a+")
                for item in items:
                    print item.find("a")["href"]
                    out_file.write(item.find("a")["href"]+"\n")
                    out_file.flush()
                out_file.close()
                self.no_0=int(self.no_0)+44   # step to the next page
    
            print "ok"
    def run():
        seedurl="http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s="
        item_stop_file="e://item_stop_file"
        record_stop_file="s://record_stop_file"
        outFile="e://out"
        myParser=MyParser(seedurl,outFile,item_stop_file)
        myParser.run()
    if __name__=="__main__":
        run()
        print "done!"

    This produces the output file e://out, with one product link per line.
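    For clarity, the paging logic above amounts to the minimal sketch below: the s= query parameter is the result offset and each page holds 44 items, which is what the 5*44 bound and the +44 step encode (the query string is abbreviated here):

    # Sketch of the pagination used by MyParser.run above: "s=" is the result
    # offset, 44 items per page, first five pages only. The query is abbreviated.
    base = "http://s.taobao.com/search?q=...&style=list&s="
    for offset in range(0, 5 * 44, 44):
        page_url = base + str(offset)
        print(page_url)   # each of these pages is fetched and parsed for item links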


    Next, using the file crawled above, crawl the transaction records of each product. The code is as follows:

    #-*- coding:utf-8 -*-
    '''
    Created on 2014-07-23
    
    @author: sj
    '''
    import re
    import BeautifulSoup
    import urllib2
    
     
    class MyParser:
        def __init__(self,item_path_file,stop_file,out_file):
            self.item_path_file=item_path_file
            self.stop_file=stop_file
            self.out_file=out_file
            stop_object=open(self.stop_file,"rb")
            splits=stop_object.readline().split("\t")
            stop_object.close()
            self.item=splits[0]     # line index of the current item in the links file
            self.page=splits[1]     # current bid_page for that item
            self.record=splits[2]   # current record on that page
            self.tag=0
        def run(self):
            print self.item
            print self.page
            print self.record
            # read all product links once, then resume from the item index
            # recorded in the stop file
            item_object=open(self.item_path_file,"rb")
            all_lines=item_object.readlines()
            item_object.close()
            num_items=len(all_lines)
            for line in all_lines[int(self.item):num_items]:
                    try:
                        if re.search("tmall",line):
                            # Tmall items use a different record format, so skip them
                            # and reset the page/record fields in the stop file
                            stop_object=open(self.stop_file,"rb")
                            item_new=stop_object.readline().split("\t")[0]
                            stop_object.close()
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(item_new+"\t"+"0"+"\t"+"0"+"\n")
                            stop_object.flush()
                            stop_object.close()
                            continue
                        print line
                        headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                        req=urllib2.Request(url=line.strip(),headers=headers)
                        
                        content=urllib2.urlopen(req,timeout=3).read()
                        contentSoup=BeautifulSoup.BeautifulSoup(content)
                        
                        # the "view buyers" button carries a data-api URL whose query
                        # string contains the paging parameters (including page_size)
                        data_api=contentSoup.find("button",{"id":"J_listBuyerOnView"})["data-api"]
                        parameters=data_api.split("?")[1]
                        stop_object=open(self.stop_file,"rb")
                        bid_page=stop_object.readline().split("\t")[1]
                        stop_object.close()
                        page_size=int(parameters.split("&")[2].split("=")[1])
            
                        while int(bid_page)<int(page_size):
                            print "bid_page is still below page_size..."
                            print bid_page
                            # bump the bid_page parameter in the data-api URL to request the next page
                            if self.tag==1:
                                data_api=data_api.replace("bid_page="+str(bid_page),"bid_page="+str(int(bid_page)+1))
                            else:
                                data_api=data_api.replace("bid_page=1","bid_page="+str(int(bid_page)+1))
                            data_url=data_api+"&ua=006tpOWUuXBidH1MRWQZ0InIldyJ0J3AibxJg%3D%3D%7CtaBkcTQxVFHEsbQxBFEEIfY%3D%7CtJFV4sbweFGpcSkNye3Y7ckNKV7GLmae5976Lfo%3D%7Cs6aDR2N2MzZTVsO2szYjpsOmAwbil4KX4tei15LXgpeSh%2FLHQmax%7Csqcy9kFUkBUANfF0sJQ9VOM7Y%2BeTZUGWQQ%3D%3D%7CsSTgxOA3%7CsIVB9vM3Mvbj1pPGAmcSJ0KGk6bDxgJ3EpdTRnMWE9eihwLVAg%3D%3D%7Cv%2Fo%2Bia0L%2FGqyyuwU7KUtCc3o3Vic%2BZzJDVhtOA3aDQ%3D%3D%7CvusvmLyYXOuOy%2B4qrzpfm85L3jpvq767rmp%2Fau8rbjvsKC3pzektWB04vWq9%7Cvfj9%2BDw5%2FdgcCUxZnaj9iEw5XJitafw4LViP&t=1406097091097&callback=Hub.data.records_reload"
                
                            req=urllib2.Request(url=data_url,headers=headers)
                            datacontent=urllib2.urlopen(req,timeout=3).read()
                            datacontent=datacontent.decode("gbk").encode("utf-8")   # the JSONP response is GBK-encoded
                            self.deal(datacontent)

                            bid_page=int(bid_page)+1

                            # checkpoint: current item, next page to fetch, record reset to 0
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(str(self.item)+"\t"+str(bid_page)+"\t"+"0")
                            stop_object.flush()
                            stop_object.close()
                            self.tag=1
                            print self.item
                        if int(bid_page)>=page_size:
                            print "reached page_size; saving the next item's line number with page 0 and record 0"
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(str(int(self.item)+1)+"\t0\t0\n")
                            stop_object.close()
                            self.item=int(self.item)+1
                    except Exception as e:
                        if "timed out" in str(e):
                            # skip this link on timeout and move on to the next one
                            continue
                
        def deal(self,content):
            # locate the double quotes in the JSONP response and slice out the HTML
            # fragment between the first one and the third-from-last one
            ls=[m.start() for m in re.finditer("\"",content)]
            content=content[(ls[0]+1):ls[-3]]
            contentSoup=BeautifulSoup.BeautifulSoup(content)
            recordshtml=contentSoup.find("tbody")
            if recordshtml is None:
                return
            recordshtml=recordshtml.findAll("tr")
            for record in recordshtml:
                cols=record.findAll("td")
                if len(cols)!=5:
                    continue
                # columns: buyer name, price, quantity, time, deal type
                name=cols[0].text
                price_em=cols[1].findAll("em")
                price=price_em[-1].text
                num=cols[2].text
                time=cols[3].text
                deal_type=cols[4].text
                line=name+"\t"+price+"\t"+num+"\t"+time+"\t"+deal_type+"\n"
                print line
                out_object=open(self.out_file,"a+")
                out_object.write(line)
                out_object.flush()
                out_object.close()
            print "ok"
    def run():
        item_path_file="e:/item_path_file"
        stop_file="e://stop_file"
        out_file="e://records_file"
        parser=MyParser(item_path_file,stop_file,out_file)
        parser.run()
    if __name__=="__main__":
        run()
        print "done~"


    Here item_path_file is the product-link file produced in step one, and stop_file records the position the crawl has reached (strictly speaking you could skip recording it). The program above does not keep a separate file of the links that failed to crawl.
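    The stop file itself is just one line with three tab-separated fields: the line index of the current item in the links file, the current bid_page, and the current record. A minimal sketch of reading and writing that checkpoint (the helper names are mine, not part of the scripts above):

    def read_checkpoint(path):
        # returns (item index, bid_page, record) from the single tab-separated line
        f = open(path, "rb")
        item, page, record = f.readline().split("\t")
        f.close()
        return int(item), int(page), int(record)

    def write_checkpoint(path, item, page, record):
        # overwrite the checkpoint so a restarted crawl can resume from here
        f = open(path, "wb")
        f.write(str(item) + "\t" + str(page) + "\t" + str(record) + "\n")
        f.close()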

    Note that the crawl may hit items hosted on Tmall, but Tmall's transaction-record pages use a different format from Taobao's, so Tmall links are simply filtered out here.
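    If the Tmall links were needed later, one possible variation (not in the original scripts) is to split the link file into two buckets up front instead of skipping Tmall inside the crawl loop:

    # Possible variation (not in the original scripts): separate Tmall links
    # from Taobao links before crawling.
    import re

    taobao_links, tmall_links = [], []
    for link in open("e://out"):
        if re.search("tmall", link):
            tmall_links.append(link.strip())
        else:
            taobao_links.append(link.strip())
    print(len(taobao_links))
    print(len(tmall_links))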

    Improvements in this crawl compared with my earlier attempts:

    Using try/except. I didn't use it before, so every timeout meant manually stopping the program, restarting it, and resuming from the breakpoint. With try/except, a timeout simply skips the current link, which removes a lot of that manual work.
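    A minimal sketch of that skip-on-timeout pattern, using the same urllib2-style fetching and the same paths and headers as the scripts above:

    import urllib2

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
    for url in open("e://out").readlines():
        try:
            req = urllib2.Request(url=url.strip(), headers=headers)
            content = urllib2.urlopen(req, timeout=3).read()
        except Exception as e:
            print(e)   # a timeout (or any other error) just skips this link
            continue
        # ...parse and save content here...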


    I later learned that I had been doing all of this by hand, and that there is also the Scrapy framework, which is somewhat simpler.
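    For comparison, the link-collection step as a Scrapy spider might look roughly like the sketch below. This is my illustration against a recent Scrapy API, not something from the original post; the CSS selector mirrors the "col title" lookup above and is an assumption about the page markup, and the query string is abbreviated.

    import scrapy

    class TaobaoLinkSpider(scrapy.Spider):
        name = "taobao_links"
        # same five result pages as the first script, 44 items per page
        start_urls = [
            "http://s.taobao.com/search?q=...&style=list&s=" + str(offset)
            for offset in range(0, 5 * 44, 44)
        ]

        def parse(self, response):
            # assumed selector mirroring the <div class="col title"> lookup above
            for href in response.css("div.col.title a::attr(href)").getall():
                yield {"url": href}

    Scrapy then handles request scheduling, retries, and output (for example via scrapy runspider with an output file), which is the part the hand-written scripts manage manually with the stop file.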

