• Assignment 6


    • Assignment ①:
      (1) Requirements:
      Use the requests and BeautifulSoup libraries to scrape the Douban Movie Top 250 data.
      Download each movie's poster image using multiple threads, naming each image file after the movie.
      Get familiar with how regular expressions are used.

    code:
    1. Scrape the movie information

    from bs4 import BeautifulSoup
    import requests
    import re
    
    
    # Fetch the page content
    def get_html(URL):
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/79.0.3945.130 Safari/537.36'}
        res = requests.get(URL, headers=header)  # send a browser-like User-Agent; many sites reject obvious bot requests
        res.encoding = res.apparent_encoding  # set the encoding to avoid garbled text
        return res.text  # return the page HTML
    
    def ana_by_bs4(html):
        soup = BeautifulSoup(html, 'html.parser')  # parse with the html.parser backend
        lis = soup.select("ol li")  # select the <li> entries inside the ranking <ol>
        for li in lis:
            id = li.find('em').text
            title = li.find_all('span', class_='title')
            name = title[0].text
            director_actor = li.find('div', class_='bd').find('p').text.split('\n')[1].strip()  # director and cast
            strInfo = re.search("(?<=<br/>).*?(?=<)", str(li.select_one(".bd p")), re.S | re.M).group().strip()  # year / country / genre
            infos = strInfo.split('/')
            year = infos[0].strip()  # year
            area = infos[1].strip()  # country / region
            genres = infos[2].strip()  # genre
            rating = li.find('span', class_='rating_num').text  # rating
            rating_num = li.find('div', class_='star').find_all('span')[3].text[:-3]  # number of ratings (strip the trailing 人评价)
            try:
                quote = li.find('span', class_='inq').text  # one-line quote
            except AttributeError:  # the quote may be missing
                quote = ''
            print(id, name, director_actor, year, area, genres, rating, rating_num, quote)
    
    
    if __name__ == '__main__':
        for page in range(10):
            print('第{}页'.format(page + 1))
            print('排名', '电影名称', '导演和主演', '年份', '地区', '类型', '评分', '评分人数', '名言')
            url = 'https://movie.douban.com/top250?start={}&filter='.format(page * 25)  # each page holds 25 movies, so start steps by 25
            text = get_html(url)  # fetch the page
            ana_by_bs4(text)  # parse it with bs4
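
    The trickiest part of the parser above is the lookbehind/lookahead regex that isolates the "year / country / genre" line from the raw <p> markup. A minimal sketch of what it matches, run against a made-up fragment shaped like Douban's HTML:

    import re

    # hypothetical fragment, shaped like the ".bd p" block on the Top 250 page
    sample = '<p class="">\n  导演: 某导演   主演: 某演员<br/>\n  1994 / 美国 / 剧情\n</p>'
    # (?<=<br/>) anchors just after the <br/> tag, .*? captures as little as possible,
    # and (?=<) stops right before the next tag, so only the plain-text line is kept
    m = re.search("(?<=<br/>).*?(?=<)", sample, re.S | re.M)
    print(m.group().strip())             # 1994 / 美国 / 剧情
    print(m.group().strip().split('/'))  # ['1994 ', ' 美国 ', ' 剧情']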
    

    2. Scrape the poster images:

    from urllib.request import urlopen
    import re
    import urllib
    import threading
    
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Cookie':'bid=wjbgW95-3Po; douban-fav-remind=1; __gads=ID=f44317af32574b60:T=1563323120:S=ALNI_Mb4JL8QlSQPmt0MdlZqPmwzWxVvnw; __yadk_uid=hwbnNUvhSSk1g7uvfCrKmCPDbPTclx9b; ll="108288"; _vwo_uuid_v2=D5473510F988F78E248AD90E6B29E476A|f4279380144650467e3ec3c0f649921e; trc_cookie_storage=taboola%2520global%253Auser-id%3Dff1b4d9b-cc03-4cbd-bd8e-1f56bb076864-tuct427f071; viewed="26437066"; gr_user_id=7281cfee-c4d0-4c28-b233-5fc175fee92a; dbcl2="158217797:78albFFVRw4"; ck=4CNe; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583798461%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fmovie.douban.com%252Ftop250%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1583974348.1563323123.1572242065.1583798461.8; __utmb=30149280.0.10.1583798461; __utmc=30149280; __utmz=30149280.1583798461.8.7.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utma=223695111.424744929.1563344208.1572242065.1583798461.4; __utmb=223695111.0.10.1583798461; __utmc=223695111; __utmz=223695111.1583798461.4.4.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=06303e97d36c6c15.1563344208.4.1583798687.1572242284.'}
    base_url = 'https://movie.douban.com/top250?start=%d&filter='
    
    
    class spider_douban250(object):
        def __init__(self, url=None, start=0, step=25, total=250):
            self.durl = url
            self.dstart = start
            self.dstep = step
            self.dtotal = total

        def start_download(self):
            # walk through the list pages: start = 0, 25, 50, ... until total is reached
            while self.dstart < self.dtotal:
                durl = self.durl % self.dstart
                print(durl)
                self.load_page(durl)
                self.dstart += self.dstep
    
    
        def load_page(self, url):
            req = urllib.request.Request(url=url, headers=headers)
            req = urlopen(req)
            if req.code != 200:
                return
            con = req.read().decode('utf-8')
            listli = re.findall(r'<li>(.*?)</li>', con, re.S)
            if listli:
                listli = listli[1:]  # drop the first match, which is not a movie entry
            else:
                return
            for li in listli:
                imginfo = re.findall(r'<img.*?>', li)
                if imginfo:
                    imginfo = imginfo[0]
                    # take the alt (movie name) and src (image url) attributes of the <img> tag
                    info = [item.split('=')[1].strip()[1:-1] for item in imginfo.split(' ')[2:4]]
                    # pass the callable and its argument separately; target=self.load_img(info)
                    # would run the download in the current thread instead of a new one
                    T = threading.Thread(target=self.load_img, args=(info,))
                    T.setDaemon(False)
                    T.start()
                    threads.append(T)
    
    
        def load_img(self, info):
            print("callhere load img:", info)
            req = urllib.request.Request(url=info[1], headers=headers)
            imgreq = urlopen(req, timeout=100)
            img_c = imgreq.read()
            path = 'E:\\image\\' + info[0] + '.jpg'  # save the image under the movie name
            print('path:', path)
            imgf = open(path, 'wb')
            imgf.write(img_c)
            imgf.close()
    
    
    threads = []
    spider = spider_douban250(base_url, start=0, step=25, total=25)  # total=25 downloads only the first page; use total=250 for the full list
    spider.start_download()
    for t in threads:
        t.join()  # wait for every download thread to finish
    print("The End")
    
    

    Results:




    (2) Reflections:
    My grasp of regular expressions was never very solid to begin with, so by the time I needed them here I had partly forgotten the syntax and had to review it again before writing the parser.

    • Assignment ②
      (1) Requirements:
      Get comfortable with serializing and outputting Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage stack to scrape the 软科 (ShanghaiRanking) university rankings.
      Scrape the ranking list, follow each school's detail link, download and store the school logo, and collect the official website URL, school profile and other information.
      code:
      rank spider:
    from bs4 import UnicodeDammit
    import scrapy
    from ..items import RuankeItem
    import urllib.request
    import urllib.parse
    
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
    
    class RankSpider(scrapy.Spider):
        name = 'rank'
        start_url = 'https://www.shanghairanking.cn/rankings/bcur/2020'
    
        def start_requests(self):
            url = RankSpider.start_url
            yield scrapy.Request(url=url, callback=self.parse)
    
        def parse(self, response):
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            rows = selector.xpath("//div[@class='rk-table-box']/table/tbody/tr")
            cout = 0
            for row in rows:
                cout = cout + 1
                sNo = row.xpath("./td[position()=1]/text()").extract_first()
                sNo = str(sNo).strip()
                schoolName = row.xpath("./td[@class='align-left']/a/text()").extract_first()
                schoolName = str(schoolName).strip()
                city = row.xpath("./td[position()=3]/text()").extract_first()
                city = str(city).strip()
                # build the absolute URL of the school's detail page and fetch it
                href = row.xpath("./td[@class='align-left']/a/@href").extract_first()
                detail_url = urllib.parse.urljoin('https://www.shanghairanking.cn/', href)
                req = urllib.request.Request(detail_url, headers=headers)
                detail_html = urllib.request.urlopen(req, timeout=100).read()
                detail_data = UnicodeDammit(detail_html, ["utf-8", "gbk"]).unicode_markup
                selector1 = scrapy.Selector(text=detail_data)
                officalUrl = selector1.xpath("//div[@class='univ-website']/a/text()").extract_first()
                info = selector1.xpath("//div[@class='univ-introduce']//p/text()").extract_first()
                info = str(info).strip()
                # download the school logo, keeping whatever extension the src ends with
                logo_src = selector1.xpath("//td[@class='univ-logo']/img/@src").extract_first()
                logo_url = urllib.parse.urljoin(detail_url, logo_src)
                if logo_url[len(logo_url) - 4] == ".":  # e.g. ".png" or ".jpg"
                    ext = logo_url[len(logo_url) - 4:]
                else:
                    ext = ""
                mfile = str(cout) + ext
                req = urllib.request.Request(logo_url, headers=headers)
                img_data = urllib.request.urlopen(req, timeout=100).read()
                fobj = open("E:\\images\\" + mfile, "wb")
                fobj.write(img_data)  # save the logo image
                fobj.close()
                item = RuankeItem()
                item["sNo"] = sNo
                item["schoolName"] = schoolName
                item["city"] = city
                item["officalUrl"] = officalUrl
                item["info"] = info
                item["mfile"] = mfile
                yield item
    
    

    item:

    import scrapy
    
    
    class RuankeItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        sNo = scrapy.Field()
        schoolName = scrapy.Field()
        city = scrapy.Field()
        officalUrl = scrapy.Field()
        info = scrapy.Field()
        mfile = scrapy.Field()
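
    The requirement also mentions serialized output of Item data. Storage is handled by the MySQL pipeline below, but as a side note, Scrapy's built-in feed export can dump the same yielded items to JSON. A minimal sketch, assuming Scrapy >= 2.1 (where the FEEDS setting exists), added as a class attribute inside RankSpider:

    # inside RankSpider: also write every yielded item to ranks.json
    custom_settings = {
        'FEEDS': {
            'ranks.json': {'format': 'json', 'encoding': 'utf8'},
        },
    }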
    

    pipelines:

    import pymysql
    class RuankePipeline(object):
        def open_spider(self, spider):
            print("opened")
            try:
                self.con = pymysql.connect(host='localhost', port=3306, user="root", passwd="1394613257",
                                           db="MyStock", charset="utf8")
                self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                self.cursor.execute("delete from ranks")
                self.opened = True
                self.count = 0
                self.num = 1
            except Exception as err:
                print(err)
                self.opened = False
    
        def close_spider(self, spider):
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened = False
    
            print("closed")
            print("总共爬取", self.count, "大学")
    
        def process_item(self, item, spider):
    
            print(item['sNo'])
            print(item['schoolName'])
            print(item['city'])
            print(item['officalUrl'])
            print(item['info'])
            print(item['mfile'])
            print()
    
            if self.opened:
                self.cursor.execute(
                    "insert into ranks(sNo,schoolName,city,officalUrl,info,mfile) values(%s,%s,%s,%s,%s,%s)",
                    (item['sNo'], item['schoolName'], str(item['city']), str(item['officalUrl']), item['info'], item['mfile']))
                self.count += 1
    
            return item
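
    The pipeline above assumes the ranks table already exists (open_spider only clears old rows) and that the pipeline class is registered in settings.py. A rough sketch of both pieces, assuming the Scrapy project is named ruanke; the column types and sizes are my own guesses rather than the original schema:

    # settings.py: register the pipeline so Scrapy actually calls it
    ITEM_PIPELINES = {
        'ruanke.pipelines.RuankePipeline': 300,
    }

    # one-off table creation, run once before the first crawl
    import pymysql

    con = pymysql.connect(host='localhost', port=3306, user='root', passwd='1394613257',
                          db='MyStock', charset='utf8')
    cursor = con.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS ranks("
                   "sNo VARCHAR(32) PRIMARY KEY,"
                   "schoolName VARCHAR(64),"
                   "city VARCHAR(32),"
                   "officalUrl VARCHAR(256),"
                   "info TEXT,"  # the school profile can be long, so TEXT instead of VARCHAR
                   "mfile VARCHAR(32))")
    con.commit()
    con.close()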
    

    Images:



    (2) Reflections:
    Quite a few small things went wrong while scraping with the Scrapy framework this time, and I spent a long while debugging. My grasp of Scrapy is clearly not solid enough yet and I need to sort the pieces out more carefully. Also, when creating the database table, the size of the content to be stored should be taken into account as much as possible.

    • Assignment ③:
      Requirements:
      Get comfortable with Selenium: locating HTML elements, scraping Ajax-loaded pages, waiting for elements to load, and handling page navigation.
      Use the Selenium framework plus MySQL storage to simulate logging in to the icourse163 MOOC site, collect the information of the courses enrolled in my own account, and save it to MySQL.
      The simulated-login step must be recorded as a GIF.
      code:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.keys import Keys
    import time
    import pymysql
    
    
    header = {
            "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072531 Minefield/3.0.2pre"}
    url = "https://www.icourse163.org/"
    def login():
        driver = webdriver.Chrome()
        driver.get(url)
        driver.maximize_window()
        begin = driver.find_element_by_xpath("//div[@class='unlogin']//a[@class='f-f0 navLoginBtn']")
        begin.click()
        time.sleep(1)
        driver.find_element_by_xpath("//div[@class='ux-login-set-scan-code_ft']//span[@class='ux-login-set-scan-code_ft_back']").click()
        time.sleep(1)
        driver.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']//li[@class='']").click()
        time.sleep(1)
        frame1 = driver.find_element_by_xpath("//div[@class='ux-login-set-container']//iframe")
        driver.switch_to.frame(frame1)
        username = driver.find_element_by_xpath("//input[@id='phoneipt']")
        username.send_keys('1*********')
        time.sleep(1)
        driver.find_element_by_xpath("//input[@placeholder='请输入密码']").send_keys("*******")
        time.sleep(1)
        driver.find_element_by_xpath("//div[@class='f-cb loginbox']//a[@id='submitBtn']").click()
        time.sleep(2)
        # click myCourse
        driver.find_element_by_xpath("//div[@class='u-navLogin-myCourse-t']//span[@class='nav']").click()
        time.sleep(2)
        lis = driver.find_elements_by_xpath("//div[@class='course-card-wrapper']")
        No = 0
        for li in lis:
            li.click()
            time.sleep(3)
            curr_window = driver.window_handles[-1]
            driver.switch_to.window(curr_window)
            time.sleep(2)
            driver.find_element_by_xpath('//*[@id="g-body"]/div[3]/div/div[1]/div/a[1]').click()
    
            curr_window1 = driver.window_handles[-1]
            driver.switch_to.window(curr_window1)
            time.sleep(2)
            No = No + 1
            cNo = No
            try:
                Course = driver.find_element_by_xpath("//span[@class='course-title f-ib f-vam']").text
                College = driver.find_element_by_xpath("//a[@data-cate='课程介绍页']").get_attribute("data-label")
                Teacher = driver.find_element_by_xpath("//div[@class='um-list-slider_con_item']//h3[@class='f-fc3']").text
                Count = driver.find_element_by_xpath("//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text
                Process = driver.find_element_by_xpath("//div[@class='course-enroll-info_course-info_term-info_term-time']//span[position()=2]").text
                Brief = driver.find_element_by_xpath("//div[@class='course-heading-intro_intro']").text
                t = 0
                Team = ''
                while (True):
                    try:
                        Teammerber = driver.find_elements_by_xpath("//div[@class='um-list-slider_con_item']//h3[@class='f-fc3']")[t].text
                        Team += Teammerber + ' '
                        t += 1
                    except:
                        break
                time.sleep(1)
                print(cNo, Course, College, Teacher, Team, Count, Process, Brief)
            except Exception as err:
                print(err)
                Course = ""
                College = ""
                Teacher = ""
                Team = ""
                Count = ""
                Process = ""
                Brief = ""
            insertDB(cNo, Course, College, Teacher, Team, Count, Process, Brief)
            time.sleep(2)
            # close the current course page
            driver.close()
            curr_window = driver.window_handles[-1]
            driver.switch_to.window(curr_window)
            driver.close()
            time.sleep(2)
            # switch back to the original "my courses" window
            backwindow = driver.window_handles[0]
            time.sleep(2)
            driver.switch_to.window(backwindow)
            time.sleep(2)
    
    def startUp():
        # open the MySQL connection and (re)create the mooc2 table; con/cursor/opened are module-level globals
        global con, cursor, opened
        try:
            con = pymysql.connect(host="localhost", port=3306, user="root", passwd="1394613257", db="MyDB",
                                  charset="utf8")
            cursor = con.cursor(pymysql.cursors.DictCursor)
            cursor.execute("DROP TABLE IF EXISTS mooc2")
            # create the table
            cursor.execute("CREATE TABLE IF NOT EXISTS mooc2(cNo INT PRIMARY KEY,"
                           "Course VARCHAR(256),"
                           "College VARCHAR(256),"
                           "Teacher VARCHAR(256),"
                           "Team VARCHAR(256),"
                           "Count VARCHAR(256),"
                           "Process VARCHAR(256),"
                           "Brief VARCHAR(1024))")
            opened = True
        except Exception as err:
            print(err)
            opened = False
    
    def closeUp():
        global opened
        if opened:
            con.commit()
            con.close()
            opened = False
        print("closed")
    
    def insertDB(cNo, Course, College, Teacher, Team, Count, Process, Brief):
        try:
            # pymysql uses %s placeholders rather than ?
            sql = "insert into mooc2 (cNo, Course, College, Teacher, Team, Count, Process, Brief) values (%s,%s,%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql, (cNo, Course, College, Teacher, Team, Count, Process, Brief))
        except Exception as err:
            print(err)
    
    def main():
        print("Spider starting......")
        startUp()  # connect to MySQL first so insertDB can write rows while login() scrapes
        login()
        print("Spider closing......")
        closeUp()
        print("Spider completed......")
    
    if __name__ == '__main__':
        main()
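
    The script above paces itself with fixed time.sleep calls. For the "waiting for elements to load" part of the requirement, an explicit wait is usually more reliable, since it blocks only until the element is actually ready. A small sketch (the XPath in the usage note is one of the selectors already used above):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def wait_and_click(driver, xpath, timeout=10):
        # poll for up to `timeout` seconds until the element is clickable, then click it
        element = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))
        element.click()

    # usage: replaces a time.sleep(...) followed by find_element_by_xpath(...).click()
    # wait_and_click(driver, "//div[@class='u-navLogin-myCourse-t']//span[@class='nav']")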
    

    GIF:

    Results:


    (2) Reflections:
    While scraping my personal course page, the script at first kept stopping as soon as it reached that page. I assumed the XPath parsing was wrong and spent a long time on it, only to find that I had simply forgotten the follow-up click after entering the page. More careful study and practice are still needed.
