Assignment 3


    Task ①

    (1): Pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn). Crawl it with both a single-threaded and a multi-threaded approach.

    Single-threaded crawl

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/14
    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    def imageSpider(start_url):
        try:
            urls=[]
            req=urllib.request.Request(start_url,headers=headers)
            data=urllib.request.urlopen(req)
            data=data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            images = soup.select("img")
            for image in images:
                try:
                    src = image["src"]
                    url = urllib.request.urljoin(start_url, src)
                    if url not in urls:
    
                        urls.append(url)
                        print(url)
                        download(url)
                except Exception as err: print(err)
        except Exception as err:
            print(err)
    
    def download(url):
        global count
        try:
            count=count+1
            # extract the file extension (the last 4 characters if they start with ".")
            if (url[len(url) - 4] == "."):
                ext = url[len(url) - 4:]
            else:
                ext = ""
            req = urllib.request.Request(url,headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            fobj = open("images\" + str(count) + ext, "wb")
            fobj.write(data)
            fobj.close()
            print("downloaded " + str(count) + ext)
        except Exception as err:
            print(err)
    
    start_url="http://www.weather.com.cn/weather/101280601.shtml"
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    count=0
    imageSpider(start_url)
    


    Multi-threaded crawl

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/15
    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import threading
    import time
    
    def imageSpider(start_url):
        global threads
        global count
        try:
            urls = []
            req = urllib.request.Request(start_url, headers=headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            images1 = soup.select("img")
            for image in images1:
                try:
                    src = image["src"]
                    url = urllib.request.urljoin(start_url, src)
                    if url not in urls:
                        urls.append(url)
                        print(url)
                        count = count + 1
                        T = threading.Thread(target=download, args=(url, count))
                        T.daemon = False  # non-daemon, so the main program can join it later
                        T.start()
                        threads.append(T)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)
    
    
    def download(url, count):
        try:
            if (url[len(url)-4] == "."):
                ext = url[len(url)-4:]
            else:
                ext = ""
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            fobj = open("images\" + str(count) + ext, "wb")
            fobj.write(data)
            fobj.close()
            print("downloaded " + str(count) + ext)
        except Exception as err:
            print(err)
    
    
    print("More Threads Craw JPG Images")
    start_url = "http://www.weather.com.cn/weather/101280601.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    count = 0
    threads = []
    
    time_start = time.time()
    
    imageSpider(start_url)
    for t in threads:
        t.join()
    print("the End")
    time_end = time.time()
    time_using = time_end - time_start
    print("More Threads Craw JPG Images Time Using:", time_using, 's')
    


    (2): Reflections

    I tried out the multi-threaded code in practice. A few small problems came up along the way, mostly around how to pass the target function and its arguments; I also had to look up how threading.Thread is used. Overall the process went fairly smoothly.
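
    The main point was that threading.Thread takes a target callable plus an args tuple. A minimal sketch of that pattern (fetch_one is a hypothetical stand-in for the real download function):

    import threading

    def fetch_one(url, index):
        # hypothetical stand-in for download(url, count)
        print("worker", index, "handling", url)

    threads = []
    for i, u in enumerate(["http://example.com/a.jpg", "http://example.com/b.jpg"]):
        t = threading.Thread(target=fetch_one, args=(u, i))  # args must be a tuple
        t.daemon = False   # non-daemon, so it can be joined before the program exits
        t.start()
        threads.append(t)
    for t in threads:
        t.join()           # wait for every worker to finish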

    Task ②

    (1): Reproduce Task ① using the scrapy framework

    Approach:

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/16
    import scrapy
    from ..items import WeatherPhotoItem
    from scrapy.selector import Selector
    class Spider_weatherphoto(scrapy.Spider):
    
        name = "spiderweatherphoto"#给定爬虫的名字
        start_urls=["http://www.weather.com.cn/"]
    #执行爬虫的方法
        def parse(self, response):
    
            try:
    
                data = response.body.decode()
                selector = Selector(text=data)
                s = selector.xpath("//img/@src").extract()  # list of the src attribute values of every <img> tag
                for e in s:  # loop over the extracted image URLs
                    item = WeatherPhotoItem()
                    item["photo"] = [e]  # "photo" is the field defined in items.py; the pipeline expects a list of URLs
                    yield item  # hand the item over to the pipeline
            except Exception as err:
                print(err)
    
    
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class WeatherPhotoItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        photo = scrapy.Field()  # the image URL field to extract
        # pass
    
    
    ITEM_PIPELINES = {
        #'weather_photo.pipelines.WeatherPhotoPipeline': 300,
        'scrapy.pipelines.images.ImagesPipeline': 1  # enable scrapy's built-in ImagesPipeline image downloader
    }
    IMAGES_STORE = r'D:\anaconda\example\data_acquisition\down_images'
    IMAGES_URLS_FIELD = 'photo'
    
    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/16
    from scrapy import cmdline
    cmdline.execute("scrapy crawl spiderweatherphoto -s LOG_ENABLED=False".split())
    

    (2): Reflections

    Using the scrapy framework for the first time was indeed a bit tricky, but xpath is very handy: searching the whole document for tags is convenient. In settings.py you can enable scrapy's
    built-in image downloader, so there is no need to write your own download function, which saves some effort; the names of the data fields you want to extract are declared in items.py.
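
    As a small illustration of the xpath point above, scrapy's Selector can also be used on its own, outside a spider (a sketch with a made-up HTML snippet):

    from scrapy.selector import Selector

    html = '<div><a href="/x"><img src="/i/day.png"></a><img src="/i/night.png"></div>'
    srcs = Selector(text=html).xpath("//img/@src").extract()  # all src values in the document
    print(srcs)  # ['/i/day.png', '/i/night.png']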

    Task ③

    (1): Use the scrapy framework to crawl stock information.

    Eastmoney: https://www.eastmoney.com/  Sina Finance stocks: http://finance.sina.com.cn/stock/
    Approach:
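
    A quick way to see what the eastmoney interface returns before writing the spider is to request it directly and print one record. This is only a sketch: it reuses the query URL from the spider below with a shortened field list, and the interface may have changed since then.

    import json
    import urllib.request

    url = ("http://75.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1"
           "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3"
           "&fs=m:1+t:2,m:1+t:23&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18")
    with urllib.request.urlopen(url) as resp:
        sites = json.loads(resp.read().decode())
    print(sites["data"]["diff"][0])  # one stock record, e.g. f12 = code, f14 = name, f2 = latest price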

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/17
    import scrapy
    import json
    import re
    from ..items import GupiaodataItem
    class spider_gupiao(scrapy.Spider):
    
        name = "spidergupiao"
        # for j in range(1,10):
        
        
        start_urls=["http://75.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1602901412583%20Request%20Method:%20GET"]
    
        def parse(self, response):
            try:
                sites = json.loads(response.body_as_unicode())
                data = sites["data"]
                diff = data["diff"]  # list of stock records returned by the eastmoney API
                for i in range(len(diff)):
                    item = GupiaodataItem()
                    item["mount"] = str(i)
                    item["code"] = str(diff[i]["f12"])
                    item["name"] = str(diff[i]["f14"])
                    item["lately"] = str(diff[i]["f2"])
                    item["zhangdiefu"] = str(diff[i]["f3"])
                    item["zhangdiee"] = str(diff[i]["f4"])
                    item["chengjiaoliang"] = str(diff[i]["f5"])
                    item["chengjiaoe"] = str(diff[i]["f6"])
                    item["zhenfu"] = str(diff[i]["f7"])
                    item["zuigao"] = str(diff[i]["f15"])
                    item["zuidi"] = str(diff[i]["f16"])
                    item["jinkai"] = str(diff[i]["f17"])
                    item["zuoshou"] = str(diff[i]["f18"])
                    yield item
            except Exception as err:
                print(err)
    
    
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class GupiaodataItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        mount = scrapy.Field()
        code = scrapy.Field()
        name = scrapy.Field()
        lately = scrapy.Field()
        zhangdiefu = scrapy.Field()
        zhangdiee = scrapy.Field()
        chengjiaoliang = scrapy.Field()
        chengjiaoe = scrapy.Field()
        zhenfu = scrapy.Field()
        zuigao = scrapy.Field()
        zuidi = scrapy.Field()
        jinkai = scrapy.Field()
        zuoshou = scrapy.Field()
        # pass
    
    
    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    
    class GupiaodataPipeline:
        count = 0
        def process_item(self, item, spider):
            GupiaodataPipeline.count += 1
            # control column alignment of the output; argument 13, chr(12288), is the fill character for the name column
            tplt = "{0:^2}\t{1:^1}\t{2:{13}^4}\t{3:^5}\t{4:^6}\t{5:^6}\t{6:^6}\t{7:^10}\t{8:^10}\t{9:^10}\t{10:^10}\t{11:^10}\t{12:^10}"
            try:
                if GupiaodataPipeline.count == 1:  # first call: create a new txt file and write the header row
                    fobj = open("data.txt", "wt")  # write into data.txt
                    fobj.write("序号" + "        股票代码" + "    股票名称  " + " 最新报价  " + " 涨跌幅  " + "  涨跌额  " +
                               "      成交量   " + "       成交额         " + "            振幅   " + "      最高   " + "      最低   " + "      今开    " + "       昨收  " + "\n")
                else:  # later calls: open the existing file and append the item data
                    fobj = open("data.txt", "at")
                fobj.write(
                    tplt.format(item["mount"], item["code"], item["name"], item['lately'], item['zhangdiefu'],
                                item['zhangdiee'], item['chengjiaoliang'],item['chengjiaoe'],item['zhenfu'],
                                item['zuigao'],item['zuidi'],item['jinkai'],item['zuoshou'],chr(12288)))
                fobj.write("
    ")
                
                fobj.close()
            except Exception as err:
                print(err)
            return item
    
    ITEM_PIPELINES = {
       'gupiaodata.pipelines.GupiaodataPipeline': 300,
    }
    
    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/10/17
    from scrapy import cmdline
    cmdline.execute("scrapy crawl spidergupiao -s LOG_ENABLED=False".split())
    

    Partial screenshots of the results:

    (2): Reflections

    This was broadly similar to Task ②, but because the data comes back as JSON, xpath could not be used; I switched to json.loads() and read the values with dictionary access such as
    diff[i]["f12"], which also works nicely. The next issue was output formatting when saving: plain string concatenation needs str() everywhere and is hard to align, so I used format()
    instead, with chr(12288) (the full-width space) as the fill character for the Chinese text, and wrote the data out through the pipeline class. This step was slightly more difficult.
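
    A small illustration of the alignment trick: chr(12288) is the full-width (CJK) space, and passing it as a nested fill character to format() pads the Chinese name column to a consistent visual width (a sketch with made-up rows):

    tplt = "{0:^4}\t{1:^10}\t{2:{3}^10}"   # column 2 takes argument 3 (chr(12288)) as its fill character
    print(tplt.format("序号", "股票代码", "股票名称", chr(12288)))
    print(tplt.format("1", "600000", "浦发银行", chr(12288)))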
