• Assignment 4


    Assignment 1

    (1) Become familiar with serializing and outputting data through Item and Pipeline in Scrapy; use the Scrapy + XPath + MySQL storage approach to crawl book data from the Dangdang website.

    graph TD
        A[Parse the Dangdang search URL] -->|print| B(Check that anything was actually scraped)
        B --> C{Extract the needed item fields with XPath}
        C -->|connect to MySQL| D[Try inserting into the database]
        C -->|all OK| F[run]
    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/27
    import scrapy
    from ..items import DangdangItem
    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    
    class spider_dangdang(scrapy.Spider):
        name = "spiderdangdang"
        # key = 'python'
        # source_url='http://www.dangdang.com/'
        # start_urls=["http://search.dangdang.com/?key=python&act=input&page_index=2"]
        def start_requests(self):
            url="http://search.dangdang.com/?key=python&act=input"
            print(url)
            yield scrapy.Request(url=url,callback=self.parse)
    
        def parse(self, response):
            try:
                dammit = UnicodeDammit(response.body,["utf-8",'gbk'])
                data = dammit.unicode_markup
                selector=scrapy.Selector(text=data)
                lis = selector.xpath("//ul[@class='bigimg']/li")
                for li in lis:
                    title = li.xpath("./p[@class='name']/a/@title").extract_first()
                    author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                    price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                    date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                    publisher = li.xpath("./p[@class='search_book_author']/span[position()=3]/a/@title").extract_first()
                    detail = li.xpath("./p[@class='detail']/text()").extract_first()
                    # print(title)
                    item=DangdangItem()
                    item["title"]=title.strip() if title else ""
                    item["author"]=author.strip() if author else ""
                    item['price']=price.strip() if price else ""
                item['date']=date.strip()[1:] if date else ""
                    item['publisher']=publisher.strip() if publisher else ""
                    item['detail']=detail.strip() if detail else ""
                    yield item
            except Exception as err:
                print(err)
    
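    # items.py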
    import scrapy
    class DangdangItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        author = scrapy.Field()
        publisher = scrapy.Field()
        date = scrapy.Field()
        price = scrapy.Field()
        detail = scrapy.Field()
    
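    # pipelines.py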
    import pymysql
    class DangdangPipeline:
        def open_spider(self,spider):
            print("opened")
            try:
                self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",
                                         passwd="031804114.hao",db="mydb",charset="utf8")
                self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
                self.cursor.execute("delete from books")
                self.opened=True
                self.count=0
            except Exception as err:
                print(err)
                self.opened=False
        def close_spider(self,spider):
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened=False
            print("closed")
            print("Crawled", self.count, "books in total")
        def process_item(self, item, spider):
            try:
                if self.opened:
                    self.cursor.execute("insert into books(bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values (%s,%s,%s,%s,%s,%s)",(item["title"],item["author"],item["publisher"],item["date"],item["price"],item["detail"]))
                    self.count+=1
            except Exception as err:
                print(err)
            return item
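
    For the pipeline to actually run, ITEM_PIPELINES must be enabled in the project's settings.py. A minimal sketch, assuming the project package is named dangdang (the real package name may differ):

    # settings.py -- enable the MySQL pipeline (the package name "dangdang" is an assumption)
    ITEM_PIPELINES = {
        "dangdang.pipelines.DangdangPipeline": 300,
    }

    The spider is then started from the project directory with scrapy crawl spiderdangdang.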
    

    (2) Reflections

    The XPath expressions need to be specific enough to pick out the right tags. Also worth remembering: in Python 3, str.strip() removes whitespace from both ends of a string; and when creating the MySQL table, add character set = utf8, because the default character set is not utf8 and the inserts will misbehave on Chinese text otherwise.
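
    A possible DDL for the books table the pipeline writes to (column names are taken from the INSERT statement above; the column types and lengths are assumptions):

    CREATE TABLE books (
        bTitle VARCHAR(512),
        bAuthor VARCHAR(256),
        bPublisher VARCHAR(256),
        bDate VARCHAR(32),
        bPrice VARCHAR(32),
        bDetail TEXT
    ) CHARACTER SET = utf8;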

    Assignment 2

    (1) Become familiar with serializing and outputting data through Item and Pipeline in Scrapy; use the Scrapy + XPath + MySQL storage approach to crawl stock information.

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/28
    import scrapy
    import json
    import re
    import math
    from ..items import GupiaomysqlItem
    class spider_gupiao(scrapy.Spider):
        name = "spidergupiao"
        start_urls=["http://11.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1603874731034"]
        def parse(self, response):
            try:
                sites = json.loads(response.text)
                data = sites["data"]
                diff = data["diff"]
                print(diff)
                print(len(diff))
                for i in range(len(diff)):
                    item=GupiaomysqlItem()
                    item["mount"]=str(i)
                    item["code"]=str(diff[i]["f12"])
                    item["name"]=str(diff[i]["f14"])
                    item["lately"]=str(diff[i]["f2"])
                    item["zhangdiefu"]=str(diff[i]["f3"])
                    item["zhangdiee"]=str(diff[i]["f4"])
                    item["chengjiaoliang"]=str(diff[i]["f5"])
                    item["chengjiaoe"]=str(diff[i]["f6"])
                    item["zhenfu"]=str(diff[i]["f7"])
                    item["zuigao"]=str(diff[i]["f15"])
                    item["zuidi"]=str(diff[i]["f16"])
                    item["jinkai"]=str(diff[i]["f17"])
                    item["zuoshou"]=str(diff[i]["f18"])
                    yield item
                # all_page = math.ceil(eval(re.findall(r'"total":(\d+)', response.body.decode())[0]) / 20)
                page = re.findall(r"pn=(\d+)", response.url)[0]  # current page number
                if int(page) < 5:  # crawl up to 5 pages
                    url = response.url.replace("pn=" + page, "pn=" + str(int(page) + 1))  # move to the next page
                    yield scrapy.Request(url=url, callback=self.parse)
            except Exception as err:
                print(err)
    
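    # items.py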
    import scrapy
    class GupiaomysqlItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        mount = scrapy.Field()
        code = scrapy.Field()
        name = scrapy.Field()
        lately = scrapy.Field()
        zhangdiefu = scrapy.Field()
        zhangdiee = scrapy.Field()
        chengjiaoliang = scrapy.Field()
        chengjiaoe = scrapy.Field()
        zhenfu = scrapy.Field()
        zuigao = scrapy.Field()
        zuidi = scrapy.Field()
        jinkai = scrapy.Field()
        zuoshou = scrapy.Field()
    
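    # pipelines.py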
    from itemadapter import ItemAdapter
    import pymysql
    class GupiaomysqlPipeline:
        def open_spider(self,spider):
            print("opened")
            try:
                self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",
                                         passwd="031804114.hao",db="gupiao",charset="utf8")
                self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
                self.cursor.execute("delete from information")
                self.opened=True
                self.count=0
            except Exception as err:
                print(err)
                self.opened=False
        def close_spider(self,spider):
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened=False
            print("closed")
            print("Crawled", self.count, "records in total")
        def process_item(self, item, spider):
            try:
                print(item["mount"])
                print()
                if self.opened:
                    self.cursor.execute("insert into information(id,bno,bname,bLatestPrice,bZhangDieFu,bZhangDieE,bChengJiaoLiang,bChengJioaE,bZhenFu,bZuiGao,bZuiDi,bJinKai,bZuoShou) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(item["mount"],item["code"],item["name"],item["lately"],item["zhangdiefu"],item["zhangdiee"],item["chengjiaoliang"],item["chengjiaoe"],item["zhenfu"],item["zuigao"],item["zuidi"],item["jinkai"],item["zuoshou"]))
                    self.count+=1
            except Exception as err:
                print(err)
            return item
    

    (2) Reflections

    This one mostly reuses the earlier stock-crawling exercise, just crawling 5 pages of results; the database handling is the same as in Assignment 1.
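
    The information table can be created along the same lines (column names are copied from the INSERT statement, including its bChengJioaE spelling; the types and lengths are assumptions):

    CREATE TABLE information (
        id VARCHAR(16),
        bno VARCHAR(16),
        bname VARCHAR(64),
        bLatestPrice VARCHAR(32),
        bZhangDieFu VARCHAR(32),
        bZhangDieE VARCHAR(32),
        bChengJiaoLiang VARCHAR(32),
        bChengJioaE VARCHAR(32),
        bZhenFu VARCHAR(32),
        bZuiGao VARCHAR(32),
        bZuiDi VARCHAR(32),
        bJinKai VARCHAR(32),
        bZuoShou VARCHAR(32)
    ) CHARACTER SET = utf8;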

    Assignment 3

    (1) Become familiar with serializing and outputting data through Item and Pipeline in Scrapy; use the Scrapy framework + XPath + MySQL storage approach to crawl data from a foreign-exchange website.

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/30
    import scrapy
    from bs4 import UnicodeDammit
    from ..items import WaihuimysqlItem
    class spider_waihui(scrapy.Spider):
        name = "spiderwaihui"
        start_urls=["http://fx.cmbchina.com/hq/"]
        def parse(self, response):
            try:
                dammit = UnicodeDammit(response.body, ["utf-8", 'gbk'])
                data = dammit.unicode_markup
                # print(data)
                selector = scrapy.Selector(text=data)
                trs = selector.xpath("//div[@id='realRateInfo']/table/tr")
                # print(trs)
                for tr in trs[1:]:
                    item=WaihuimysqlItem()
                    a =tr.xpath("./td[position()=1][@class='fontbold']/text()").extract_first()
                    item["type"] = str(a).strip()
                    item["tsp"] = str(tr.xpath("./td[position()=4][@class='numberright']/text()").extract_first()).strip()
                    item["csp"] = str(tr.xpath("./td[position()=5][@class='numberright']/text()").extract_first()).strip()
                    item["tbp"] = str(tr.xpath("./td[position()=6][@class='numberright']/text()").extract_first()).strip()
                    item["cbp"] = str(tr.xpath("./td[position()=7][@class='numberright']/text()").extract_first()).strip()
                    item["time"] = str(tr.xpath("./td[position()=8][@align='center']/text()").extract_first()).strip()
                    yield item
            except Exception as err:
                print(err)
    
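    # items.py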
    import scrapy
    class WaihuimysqlItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        type = scrapy.Field()
        tsp = scrapy.Field()
        csp = scrapy.Field()
        tbp = scrapy.Field()
        cbp = scrapy.Field()
        time = scrapy.Field()
    
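    # pipelines.py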
    import pymysql
    class WaihuimysqlPipeline:
        def open_spider(self,spider):
            print("opened")
            try:
                self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",
                                         passwd="031804114.hao",db="mydb",charset="utf8")
                self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
                self.cursor.execute("delete from waihui")
                self.opened=True
                self.count=0
            except Exception as err:
                print(err)
                self.opened=False
        def close_spider(self,spider):
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened=False
            print("closed")
            print("Crawled", self.count, "records in total")
        def process_item(self, item, spider):
            try:
                if self.opened:
                    self.cursor.execute("insert into waihui(btype,btsp,bcsp,btbp,bcbp,btime) values (%s,%s,%s,%s,%s,%s)",(item["type"],item["tsp"],item["csp"],item["tbp"],item["cbp"],item["time"]))
                    self.count+=1
            except Exception as err:
                print(err)
            return item
    

    (2) Reflections

    After the two assignments above this one is simple; the only extra step is stripping the whitespace from the extracted fields.
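
    Likewise, the waihui table only has to exist before the spider runs (column names from the INSERT statement; the types and lengths are assumptions):

    CREATE TABLE waihui (
        btype VARCHAR(32),
        btsp VARCHAR(32),
        bcsp VARCHAR(32),
        btbp VARCHAR(32),
        bcbp VARCHAR(32),
        btime VARCHAR(32)
    ) CHARACTER SET = utf8;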
