A crawler for Taobao transaction records


    A few days ago my boss asked me to crawl some Taobao transaction records to have a look at, so I wrote a crawler in Python. I split the job into two steps. The first step crawls the product links; the code is as follows:


    #-*- coding:utf-8 -*-
    
    import BeautifulSoup
    import urllib2
    import json
    import cookielib
    
    class MyParser:
        def __init__(self,seedurl,destpath,stop_file_path):
            self.seedurl=seedurl
            self.stop_file_path=stop_file_path
            stop_file=open(stop_file_path,"rb")
            splits=stop_file.readline().split("\t")
            stop_file.close()
            self.no_0=splits[0]   # value from the stop file: the s= page offset, initially 0
            self.no_1=splits[1]   # which item on the current page
            self.no_2=splits[2]   # which record of the current item
            self.destpath=destpath
        def run(self):
            print self.no_0
            while int(self.no_0)<5*44:   # first 5 result pages, 44 items per page
                
                # build the page URL; s= is the result offset, stepped by 44 per page
                url=self.seedurl+str(self.no_0)
                headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                req=urllib2.Request(url=url,headers=headers)
                    
                content=urllib2.urlopen(req).read()
                contentsoup=BeautifulSoup.BeautifulSoup(content)
                
                items=contentsoup.findAll("div",{"class":"col title"})
                out_file=open(self.destpath,"a+")
                for item in items:
                    print item.find("a")["href"]
                    out_file.write(item.find("a")["href"]+"\n")
                    out_file.flush()
                out_file.close()
                self.no_0=int(self.no_0)+44
    
            print "ok"
    def run():
        seedurl="http://s.taobao.com/search?spm=a230r.1.8.15.5n02zF&refpid=420461_1006&tab=all&q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&bcoffset=-4&s="
        item_stop_file="e://item_stop_file"
        record_stop_file="s://record_stop_file"   # not used in this step
        outFile="e://out"
        myParser=MyParser(seedurl,outFile,item_stop_file)
        myParser.run()
    if __name__=="__main__":
        run()
        print "done!"

    This produces the output file e://out, where each line is the link of one product.
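
    The link file may contain duplicate links (for example if an item shows up on more than one result page). Below is a minimal sketch of de-duplicating it before step two; the output path e://out_dedup is just an illustrative name, not something the original code uses:

    # Drop duplicate links from e://out before feeding them to the second crawler.
    seen=set()
    in_file=open("e://out","rb")
    out_file=open("e://out_dedup","wb")   # hypothetical output path
    for line in in_file.readlines():
        url=line.strip()
        if url and url not in seen:
            seen.add(url)
            out_file.write(url+"\n")
    in_file.close()
    out_file.close()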


    Next, based on the file crawled above, we crawl the transaction records of each product; the code is as follows:

    #-*- coding:utf-8 -*-
    '''
    Created on 2014-07-23
    
    @author: sj
    '''
    import re
    import BeautifulSoup
    import os
    import urllib2
    
     
    class MyParser:
        def __init__(self,item_path_file,stop_file,out_file):
            self.item_path_file=item_path_file
            self.stop_file=stop_file
            self.out_file=out_file
            stop_object=open(self.stop_file,"rb")
            splits=stop_object.readline().split("\t")
            stop_object.close()
            self.item=splits[0]     # line number in the link file (which item)
            self.page=splits[1]     # current bid_page for that item
            self.record=splits[2]   # record index within that page
            self.tag=0              # set to 1 once bid_page has been advanced at least once
        def run(self):
    
            
                print self.item
                print self.page
                print self.record
                item_object=open(self.item_path_file,"rb")
                num_items=len(item_object.readlines())
                item_object.close()
                item_object=open(self.item_path_file,"rb")
                for line in item_object.readlines()[int(self.item):num_items]:
                    try:
                        # Tmall product pages use a different transaction-record layout, so skip them
                        if re.search("tmall",line):
                            stop_object=open(self.stop_file,"rb")
                            item_new=stop_object.readline().split("\t")[0]
                            stop_object.close()
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(item_new+"\t"+"0"+"\t"+"0"+"\n")
                            stop_object.flush()
                            stop_object.close()
                            continue
                        print line
                        headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36"}
                        req=urllib2.Request(url=line.strip(),headers=headers)   # strip the trailing newline from the link
                        
                        content=urllib2.urlopen(req,timeout=3).read()
                        contentSoup=BeautifulSoup.BeautifulSoup(content)
                        
                        data_api=contentSoup.find("button",{"id":"J_listBuyerOnView"})["data-api"]           
                        parameters=data_api.split("?")[1]
                        stop_object=open(self.stop_file,"rb")
                        bid_page=stop_object.readline().split("\t")[1]
                        stop_object.close()
                        page_size=int(parameters.split("&")[2].split("=")[1])
            
                        while int(bid_page)<int(page_size):
                            print "没有超过pagesize的大小..."
                            print bid_page
                            if self.tag==1:
                                data_api=data_api.replace("bid_page="+str(bid_page),"bid_page="+str(int(bid_page)+1))
                            else:
                                data_api=data_api.replace("bid_page=1","bid_page="+str(int(bid_page)+1))
                            data_url=data_api+"&ua=006tpOWUuXBidH1MRWQZ0InIldyJ0J3AibxJg%3D%3D%7CtaBkcTQxVFHEsbQxBFEEIfY%3D%7CtJFV4sbweFGpcSkNye3Y7ckNKV7GLmae5976Lfo%3D%7Cs6aDR2N2MzZTVsO2szYjpsOmAwbil4KX4tei15LXgpeSh%2FLHQmax%7Csqcy9kFUkBUANfF0sJQ9VOM7Y%2BeTZUGWQQ%3D%3D%7CsSTgxOA3%7CsIVB9vM3Mvbj1pPGAmcSJ0KGk6bDxgJ3EpdTRnMWE9eihwLVAg%3D%3D%7Cv%2Fo%2Bia0L%2FGqyyuwU7KUtCc3o3Vic%2BZzJDVhtOA3aDQ%3D%3D%7CvusvmLyYXOuOy%2B4qrzpfm85L3jpvq767rmp%2Fau8rbjvsKC3pzektWB04vWq9%7Cvfj9%2BDw5%2FdgcCUxZnaj9iEw5XJitafw4LViP&t=1406097091097&callback=Hub.data.records_reload"
                
                            req=urllib2.Request(url=data_url,headers=headers)
                            datacontent=urllib2.urlopen(req,timeout=3).read()
                            datacontent=datacontent.decode("gbk").encode("utf-8")
                            self.deal(datacontent)
                            
                            bid_page=int(bid_page)+1
                            
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(str(self.item)+"\t"+str(bid_page)+"\t"+"0")
                            stop_object.flush()
                            stop_object.close()
                            self.tag=1
                            print self.item
                        if int(bid_page)>=page_size:
                            print "超过page_size大小,保存下一个物品的行数          0    0"
                            stop_object=open(self.stop_file,"wb")
                            stop_object.write(str(int(self.item)+1)+"	0	0
    ")
                            stop_object.close()
                            self.item=int(self.item)+1
                    except Exception as e:
                        # a timeout simply skips this link so the crawl keeps going
                        if "timed out" in str(e):
                            continue
                
        def deal(self,content):
            ls=[m.start() for m in re.finditer("\"",content)]
            # slice out the html fragment embedded between quotes in the JSONP response
            content=content[(ls[0]+1):ls[-3]]
            contentSoup=BeautifulSoup.BeautifulSoup(content)
            recordshtml=contentSoup.find("tbody")
            if recordshtml==None:
                return 
            recordshtml=recordshtml.findAll("tr")
            for record in recordshtml:
                cols=record.findAll("td")
                if len(cols)!=5:
                    continue
                name=cols[0].text
                price_em=cols[1].findAll("em")
                price=price_em[-1].text
                num=cols[2].text
                time=cols[3].text
                type=cols[4].text
                line=name+"\t"+price+"\t"+num+"\t"+time+"\t"+type+"\n"
                print line
                out_object=open(self.out_file,"a+")
                out_object.write(line)
                out_object.flush()
                out_object.close()
            print "ok"
    def run():
        item_path_file="e:/item_path_file"
        stop_file="e://stop_file"
        out_file="e://records_file"
        parser=MyParser(item_path_file,stop_file,out_file)
        parser.run()
    if __name__=="__main__":
        run()
        print "done~"


    Here item_path_file is the product-link file produced in step one, and stop_file records how far the crawl has got (strictly speaking you could do without it). The program above does not keep a separate file for data that failed to crawl.
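
    For reference, the stop file is just one line of three tab-separated counters: the item's line number in the link file, the current bid_page, and the record index. Below is a minimal sketch of reading and writing it; read_stop and write_stop are hypothetical helper names, not functions from the code above:

    # Hypothetical helpers around the stop-file format used above:
    # a single line with three tab-separated fields -> item index, bid_page, record index.
    def read_stop(path):
        f=open(path,"rb")
        item,page,record=f.readline().strip().split("\t")
        f.close()
        return int(item),int(page),int(record)

    def write_stop(path,item,page,record):
        f=open(path,"wb")
        f.write(str(item)+"\t"+str(page)+"\t"+str(record)+"\n")
        f.close()

    # e.g. create the file before the first run:
    # write_stop("e://stop_file",0,0,0)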

    Note that some of the links point to Tmall items. Tmall transaction records have a different page format from Taobao's, so Tmall links are simply filtered out.
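
    The code above spots Tmall items with a simple substring search on the URL. A slightly stricter alternative checks the hostname instead; this is only a sketch, assuming the links look like http://item.taobao.com/... or http://detail.tmall.com/...:

    # Alternative Tmall check based on the hostname rather than a substring match.
    import urlparse

    def is_tmall(url):
        host=urlparse.urlparse(url.strip()).netloc
        return host.endswith("tmall.com")

    print is_tmall("http://detail.tmall.com/item.htm?id=123")   # True
    print is_tmall("http://item.taobao.com/item.htm?id=123")    # False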

    What improved in this crawl compared with my earlier attempts:

    Using try/except. I had not used it before, so every time a request timed out I had to stop the program by hand, restart it, and resume from the breakpoint. With try/except, a timeout simply skips the current link, which removes a lot of manual work.
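
    The timeout handling can also be wrapped in a small retry helper so that a flaky link gets a couple of attempts before being skipped. Below is a minimal sketch with the same urllib2 setup; fetch_with_retry is a hypothetical helper, not part of the code above:

    import urllib2

    def fetch_with_retry(url,headers,tries=3,timeout=3):
        # Try the request a few times; return None if every attempt fails,
        # so the caller can simply skip this link and move on.
        for attempt in range(tries):
            try:
                req=urllib2.Request(url=url,headers=headers)
                return urllib2.urlopen(req,timeout=timeout).read()
            except Exception as e:
                print "attempt %d failed: %s"%(attempt+1,e)
        return None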


    I later realised I had been doing all of this by hand; there is also the Scrapy framework, which makes this kind of crawling somewhat simpler.
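
    For comparison, here is a rough sketch of what step one might look like as a Scrapy spider. The CSS selector mirrors the div class="col title" lookup in the first script, but both the selector and the trimmed-down seed URL are assumptions that would need checking against the real page; it could be run with something like scrapy runspider spider.py -o links.json.

    # Rough Scrapy sketch of step one (collecting product links); selector and URL are assumptions.
    import scrapy

    class TaobaoLinkSpider(scrapy.Spider):
        name="taobao_links"
        # trimmed-down version of the seed URL above, paged with s=0,44,...,176
        start_urls=["http://s.taobao.com/search?q=%C5%AE%D1%A9%B7%C4%C9%C0&style=list&s=%d"%s
                    for s in range(0,5*44,44)]

        def parse(self,response):
            # assumed selector, mirroring the div class="col title" lookup in the first script
            for href in response.css("div.col.title a::attr(href)").extract():
                yield {"link":href}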

