• Assignment 6


    Task ①:

    • Requirements:

      • Use the requests and BeautifulSoup libraries to scrape the Douban Top 250 movie data.
      • Download each movie's poster image using multiple threads, naming each image file after the movie.
      • Understand how regular expressions are used (a small regex sketch follows the sample output below).
    • Candidate site: Douban Movies: https://movie.douban.com/top250

    • Output:

      • Rank  Title  Director  Lead actors  Release year  Country  Genre  Rating  Number of ratings  Quote  File path
        1 肖申克的救赎 弗兰克·德拉邦特 蒂姆·罗宾斯 1994 美国 犯罪 剧情 9.7 2192734 希望让人自由。 肖申克的救赎.jpg
        2......
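
      The parsing step is where the regex requirement comes in: the director, lead actors, year, country and genre all sit in one <p> block and have to be pulled apart after scraping. The code below does this with str.split; the following is a minimal regex sketch of the same idea, where the sample string and the group names are illustrative rather than copied from the real page:

        import re

        # hypothetical text of one movie's info block, roughly what get_text() would return
        text = "导演: 弗兰克·德拉邦特\xa0\xa0\xa0主演: 蒂姆·罗宾斯\n1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情"

        # named groups pull out director, lead actors, year, country and genre in one pass
        pattern = re.compile(
            r"导演:\s*(?P<director>.*?)\xa0+主演:\s*(?P<role>.*?)\n"
            r"\s*(?P<year>\d{4})\xa0/\xa0(?P<country>.*?)\xa0/\xa0(?P<type>.*)")
        m = pattern.search(text)
        if m:
            print(m.group("director"), m.group("role"), m.group("year"),
                  m.group("country"), m.group("type"))
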
    • Code:

      from bs4 import BeautifulSoup
      from bs4 import UnicodeDammit
      import urllib.request
      import threading
      import os
      import pymysql
      
      
      class MySpider:
      
          def startUp(self):
              try:
                  self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='523523', db="mydb",
                                             charset="utf8")
                  self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                  # self.cursor.execute("delete from movie")
                  self.opened = True
              except Exception as err:
                  print(err)
                  self.opened = False
      
          def closeUp(self):
              if self.opened:
                  self.con.commit()
                  self.con.close()
                  self.opened = False
      
          def processSpider(self, start_url):
              global threads
              global urls
              global count
              try:
                  req = urllib.request.Request(start_url, headers=headers)
                  data = urllib.request.urlopen(req)
                  data = data.read()
                  dammit = UnicodeDammit(data, ["utf-8", "gbk"])
                  data = dammit.unicode_markup
                  soup = BeautifulSoup(data, "lxml")
                  lis = soup.find('div', class_='article').find_all('li')
                  for li in lis:
                      title = li.find('div', class_='hd').find_all('span')[0].text  # title
                      score = li.find('div', class_='star').find_all('span')[1].text  # rating
                      head = li.find('div', class_='star').find_all('span')[-1].text  # number of ratings
                      quote = li.find('p', class_='quote').find('span').text  # quote
                      # extract director, lead actors, year, country and genre
                      actor_infos_html = li.find(class_='bd')
                      # strip() removes leading/trailing whitespace before the text is split into its two lines
                      actor_infos = actor_infos_html.find('p').get_text().strip().split("\n")
                      actor_infos1 = actor_infos[0].split('\xa0\xa0\xa0')
                      director = actor_infos1[0][3:]  # director
                      role = actor_infos1[1][3:] if len(actor_infos1) > 1 else ""  # lead actors
                      year_area = actor_infos[1].lstrip().split('\xa0/\xa0')
                      year = year_area[0]  # year
                      country = year_area[1]  # country
                      type = year_area[2]  # genre
                      path = title + '.png'  # image file path
      
                      # write the record into the database
                      if self.opened:
                          self.cursor.execute(
                              "insert into movie(wId, wTitle, wDirector, wRole, wYear, wCountry, wType, wScore, wHead, wQuote, wPath)"
                              "values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                              (str(count), str(title), str(director), str(role), str(year), str(country), str(type), str(score), str(head), str(quote), str(path)))
                          count = count + 1
      
                  # download the poster images
                  images = soup.select("img")
                  for image in images:
                      try:
                          url = image["src"]  # image download link
                          name = image["alt"]  # image name (the movie title)
                          print(url)
                          if url not in urls:
                              urls.append(url)
                              print(url)
                              T = threading.Thread(target=self.download, args=(url, name))
                          T.daemon = False  # non-daemon threads, so join() can wait for the downloads
                              T.start()
                              threads.append(T)
                      except Exception as err:
                          print(err)
      
              except Exception as err:
                  print(err)
      
          def download(self, url, name):
              try:
                  ext = '.png'
                  req = urllib.request.Request(url, headers=headers)
                  data = urllib.request.urlopen(req, timeout=100)
                  data = data.read()
                  path = "download\" + name + ext  # 保存为png格式
                  fobj = open(path, "wb")
                  fobj.write(data)
                  fobj.close()
                  print("download" + name + ext)
              except Exception as err:
                  print(err)
      
          def executeSpider(self, start_url):
              print("Spider starting......")
              self.startUp()
              print("Spider processing......")
              self.processSpider(start_url)
              print("Spider closing......")
              self.closeUp()
      
      
      headers = {
          "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"
      }
      
      spider = MySpider()
      threads = []  # download threads
      urls = []  # image URLs already scheduled, to avoid duplicate downloads
      count = 1  # record counter for the database

      # all 10 result pages (25 movies per page)
      for i in range(0, 10):
          start_url = "https://movie.douban.com/top250?start=" + str(i * 25) + "&filter="
          spider.executeSpider(start_url)
      
      for t in threads:
          t.join()
      
      
    • Experimental results:





    • Reflections:

      This task reviewed material from the very start of the semester; honestly I had forgotten most of it, and it took a while to get back up to speed. Much of the content to be scraped is packed into a single text block, so the scraped text has to be split apart, and I spent quite a bit of time trying different ways of splitting it before it finally worked. The multi-threaded image download part was straightforward: the example code from the textbook could be reused almost as-is.

    • Task ②:

      • Requirements:

        • Become proficient with serializing and outputting data through scrapy Item and Pipeline classes; scrape the ShanghaiRanking university ranking using the Scrapy + XPath + MySQL storage pipeline.
        • Scrape the university ranking, obtain each university's detail link, follow it to download and store the university logo, and extract the official website URL, institution profile and related information.
      • Candidate site: https://www.shanghairanking.cn/rankings/bcur/2020

      • Keywords: student's own choice

      • Output: the MySQL output information is as follows

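        The pipeline writes into a school table in the mydb database, so the table has to exist before the crawl starts. A minimal sketch of a matching schema, with column names taken from the INSERT in pipelines.py and column types that are only my guesses:

          import pymysql

          # column names follow the INSERT statement in pipelines.py; types and lengths are illustrative guesses
          con = pymysql.connect(host="localhost", port=3306, user="root",
                                passwd="523523", db="mydb", charset="utf8")
          cursor = con.cursor()
          cursor.execute("""
              create table if not exists school (
                  sNo varchar(8),
                  schoolName varchar(64),
                  city varchar(32),
                  officalUrl varchar(128),
                  info text,
                  mFile varchar(16)
              )""")
          con.commit()
          con.close()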

      • Code:

        school.py:

        import scrapy
        import urllib.request
        from School.items import SchoolItem
        from bs4 import BeautifulSoup
        from bs4 import UnicodeDammit
        
        
        class SchoolSpider(scrapy.Spider):
            name = 'school'
            start_url = 'https://www.shanghairanking.cn/rankings/bcur/2020/'
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 "
                              "Minefield/3.0.2pre"}
        
            def start_requests(self):
                self.count = 1
                url = SchoolSpider.start_url
                yield scrapy.Request(url=url, callback=self.parse)
        
            def parse(self, response):
                  try:
                      dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
                      data = dammit.unicode_markup
                      selector = scrapy.Selector(text=data)
                      trs = selector.xpath("//div[@class='rk-table-box']/table/tbody/tr")
                      for tr in trs:
                          sNo = tr.xpath("./td[position()=1]/text()").extract_first()  # rank
                          schoolName = tr.xpath("./td[position()=2]/a/text()").extract_first()  # university name
                          city = tr.xpath("./td[position()=3]/text()").extract_first()  # city
                          Url = tr.xpath("./td[position()=2]/a/@href").extract_first()
                          sUrl = 'https://www.shanghairanking.cn' + Url.strip()  # detail page link
                          # follow the detail page link
                          req = urllib.request.Request(sUrl, headers=SchoolSpider.headers)
                          data1 = urllib.request.urlopen(req).read()
                          dammit1 = UnicodeDammit(data1, ["utf-8", "gbk"])
                          data1 = dammit1.unicode_markup
                          mes = scrapy.Selector(text=data1)
                          officalUrl = mes.xpath("//div[@class='univ-website']/a/@href").extract_first()  # official website URL
                          info = mes.xpath("//div[@class='univ-introduce']/p/text()").extract_first()  # institution profile
                          imgUrl = mes.xpath("//td[@class='univ-logo']/img/@src").extract_first()  # logo image URL
                          self.download(imgUrl)  # download the logo
                          mFile = str(self.count) + ".jpg"  # local file name for the logo
                          self.count += 1
        
                          item = SchoolItem()
                          item["sNo"] = sNo.strip() if sNo else ""
                          item["schoolName"] = schoolName.strip() if schoolName else ""
                          item["city"] = city.strip() if city else ""
                          item["officalUrl"] = officalUrl.strip() if officalUrl else ""
                          item["info"] = info.strip() if info else ""
                          item["mFile"] = mFile
                          yield item
                  except Exception as err:
                         print(err)
        
            def download(self, url):
                try:
                    ext = '.jpg'  # save as .jpg
                    req = urllib.request.Request(url, headers=SchoolSpider.headers)
                    data = urllib.request.urlopen(req, timeout=100)
                    data = data.read()
                    path = "images\\" + str(self.count) + ext
                    fobj = open(path, "wb")
                    fobj.write(data)
                    fobj.close()
                except Exception as err:
                    print(err)
        

        items.py:

        import scrapy
        
        
        class SchoolItem(scrapy.Item):
            # define the fields for your item here like:
            # name = scrapy.Field()
        
            sNo = scrapy.Field()
            schoolName = scrapy.Field()
            city = scrapy.Field()
            officalUrl = scrapy.Field()
            info = scrapy.Field()
            mFile = scrapy.Field()
        

        pipelines.py:

        import pymysql
        from itemadapter import ItemAdapter
        
        
        class SchoolPipeline:
            def open_spider(self, spider):
                print("opened")
                try:
                    self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="523523", db="mydb",
                                               charset="utf8")
                    self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                    self.cursor.execute("delete from school")
                    self.opened = True
                    self.count = 0
                except Exception as err:
                    print(err)
                    self.opened = False
        
            def close_spider(self, spider):
                if self.opened:
                    self.con.commit()
                    self.con.close()
                    self.opened = False
                print("closed")
        
            def process_item(self, item, spider):
                try:
                    if self.opened:
                        self.cursor.execute(
                            "insert into school (sNo,schoolName,city,officalUrl,info,mfile) values( % s, % s, % s, % s, % s, % s)",
                            (item["sNo"], item["schoolName"], item["city"], item["officalurl"], item["info"], item["mFile"]))
                except Exception as err:
                    print(err)
        
                return item
        
        

        settings.py:

        ROBOTSTXT_OBEY = False
        
        ITEM_PIPELINES = {
            'School.pipelines.SchoolPipeline': 300,
        }
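
        With ROBOTSTXT_OBEY turned off and the pipeline registered, the crawl can be started from the project root. A small helper script of my own (the file name run.py and the assumption that the spider name is 'school', as declared in school.py, are mine):

          # run.py: start the school spider without typing the command in a shell
          from scrapy.cmdline import execute

          execute(["scrapy", "crawl", "school"])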
        
      • Experimental results:


      • Reflections:

        I had mostly forgotten the scrapy framework as well, so I spent some time reviewing what each module is responsible for. Writing the code itself was straightforward; there is not much more to say.

    • Task ③:

      • Requirements:

        • Become proficient with Selenium: locating HTML elements, scraping Ajax-loaded pages, waiting for elements to load, and navigating between pages.
        • Use the Selenium framework together with MySQL storage to simulate logging in to the MOOC site, scrape the courses enrolled in the student's own account, and save them to MySQL.
        • The simulated account-login step must be recorded as a GIF.
      • Candidate site: China MOOC: https://www.icourse163.org

      • Output: the MySQL storage and output format is as follows


      • Code:

        import pymysql
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        import time
        
        
        class MoocSpider:
            headers = {
                "User-Agent": "SMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/86.0.4240.183 Safari/537.361 "
            }
        
            def startUp(self, url):
                chrome_options = Options()
                self.driver = webdriver.Chrome(options=chrome_options)
        
                # connect to the database
                try:
                    self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='523523', db="mydb",
                                               charset="utf8")
                    self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                    self.cursor.execute("delete from mooc")
                    self.opened = True
                except Exception as err:
                    print(err)
                    self.opened = False
        
                # open the start URL
                self.driver.get(url)
                # record counter
                self.count = 1
        
            def closeUp(self):
                if self.opened:
                    self.con.commit()
                    self.con.close()
                    self.opened = False
                    self.driver.close()
                print("closed")
        
            # log in to the MOOC site
            def enter(self):
                self.driver.find_element_by_xpath("//a[@class='f-f0 navLoginBtn']").click()  # 点击登录|注册
                time.sleep(3)
                self.driver.find_element_by_xpath("//span[@class='ux-login-set-scan-code_ft_back']").click()  # 选择其他登录方式
                time.sleep(3)
                self.driver.find_elements_by_xpath("//ul[@class='ux-tabs-underline_hd']//li")[1].click()  # 选择手机号登录
                time.sleep(3)
                iframe_id = self.driver.find_elements_by_tag_name("iframe")[1].get_attribute('id')
                self.driver.switch_to.frame(iframe_id)
                self.driver.find_element_by_xpath("//input[@id='phoneipt']").send_keys('13107693360')  # 输入手机号
                time.sleep(1)
                self.driver.find_element_by_xpath("//input[@class='j-inputtext dlemail']").send_keys('********')  # 输入密码
                time.sleep(1)
                self.driver.find_element_by_xpath("//a[@class='u-loginbtn btncolor tabfocus ']").click()  # 点击登录
                time.sleep(3)
                self.driver.find_element_by_xpath("//div[@class='u-navLogin-myCourse-t']").click()  # 进入个人中心
                time.sleep(2)
                self.driver.get(self.driver.current_url)
        
            # scrape the enrolled course data
            def processSpider(self):
                time.sleep(1)
                lis = self.driver.find_elements_by_xpath("//div[@class='course-panel-body-wrapper']"
                                                         "//div[@class='course-card-wrapper']")  # 找到所有参加的课程
                for li in lis:
                    li.find_element_by_xpath(".//div[@class='img']").click()
                    # 进入课程窗口
                    last_window = self.driver.window_handles[-1]
                    self.driver.switch_to.window(last_window)
                    time.sleep(2)
                    # 进入课程介绍
                    self.driver.find_element_by_xpath(".//a[@class='f-fl']").click()
                    last_window = self.driver.window_handles[-1]
                    self.driver.switch_to.window(last_window)
                    try:
                        cCourse = self.driver.find_element_by_xpath("//span[@class='course-title f-ib f-vam']").text  # course name
                        cCollege = self.driver.find_element_by_xpath("//img[@class='u-img']").get_attribute("alt")  # university
                        cTeacher = self.driver.find_element_by_xpath("//div[@class='um-list-slider_con']/div[position("
                                                                     ")=1]//h3[@class='f-fc3']").text  # lecturer
                        k = 0
                        ls = []
                        while (True):
                            try:
                                teacher = self.driver.find_elements_by_xpath(
                                    "//div[@class='um-list-slider_con_item']//h3[@class='f-fc3']")[k].text
                                ls.append(teacher)
                                k += 1
                            except:
                                break
                        cTeam = ",".join(ls)  # 转换成字符串 教师团队
                        cCount = self.driver.find_element_by_xpath(
                            "//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text  # enrolment count
                        cProcess = self.driver.find_element_by_xpath(
                            "//div[@class='course-enroll-info_course-info_term-info_term-time']//span[position()=2]").text  # schedule
                        cBrief = self.driver.find_element_by_xpath("//div[@id='j-rectxt2']").text  # course introduction
                    except Exception as err:
                        print(err)
                    self.driver.close()  # close the introduction window
                    # back to the course window
                    old_window1 = self.driver.window_handles[-1]
                    self.driver.switch_to.window(old_window1)
                    self.driver.close()  # close the course window
                    # back to the personal center
                    old_window2 = self.driver.window_handles[0]
                    self.driver.switch_to.window(old_window2)
                    # print the scraped record
                    print(self.count, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
                    # write the record into the database
                    if self.opened:
                        self.cursor.execute(
                            "insert into mooc(wId, wCourse, wCollege, wTeacher, wTeam, wCount, wProcess, wBrief)"
                            "values(%s, %s, %s, %s, %s, %s, %s, %s)",
                            (str(self.count), cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
                        self.count += 1
        
            def executeSpider(self, url):
                print("Spider starting......")
                self.startUp(url)
                print("Spider entering......")
                self.enter()
                print("Spider processing......")
                self.processSpider()
                print("Spider closing......")
                self.closeUp()
        
        
        url = 'https://www.icourse163.org/'
        spider = MoocSpider()
        spider.executeSpider(url)
        
        
      • Experimental results:


      • Reflections:

        Task ③ felt like an advanced version of the previous assignment: it exercises clicking, user login, and switching back and forth between windows with the Selenium framework. Automated scraping really is great fun, and I will keep exploring what Selenium can do.
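
        One thing worth exploring next: the script above paces every step with fixed time.sleep() calls, while the requirement also mentions waiting for HTML elements to load. Selenium's explicit waits cover that; a minimal sketch (the XPath is the login entry used in enter() above, and the 10-second timeout is an arbitrary choice of mine):

          from selenium import webdriver
          from selenium.webdriver.common.by import By
          from selenium.webdriver.support.ui import WebDriverWait
          from selenium.webdriver.support import expected_conditions as EC

          driver = webdriver.Chrome()
          driver.get("https://www.icourse163.org/")
          # wait (at most 10 s) until the login entry is clickable instead of sleeping a fixed time
          login_btn = WebDriverWait(driver, 10).until(
              EC.element_to_be_clickable((By.XPATH, "//a[@class='f-f0 navLoginBtn']")))
          login_btn.click()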
