• 全国图书馆参考咨询联盟模拟登陆及爬取可爬取的图片


    一、编程思路

    1.模拟登陆:采用 selenium + PhantomJS。若采用 Chrome、Firefox 等浏览器,我的电脑无法截取验证码位置,读者可以自行尝试;验证码识别可采用 tesserocr,我采用手动输入

    2、查询,获取搜索框,用户输入关键字并查询

    3、页面信息,F12查看即可 ,若采用find_element_by_xpath()查询需注意element 返回是第一个节点信息 elements返回是一个列表

    4、书本具体信息、F12查看,后面操作很简单

    5、文献传递页面、这个地方是最难的,右键查看文献传递这个按钮,点击其中href是无法进入的,这个只是一种绑定关系,需要仔细观察进入文献传递页面前后的network中第一个文本中的信息,里面存在很多url,

    只有refer 点击可以进入,分析refer url里面的元素,在进入前的那个页面的url可找到,后面采用切片即可

    6、下载书名页...............,此处我采用的是观察图片链接之间的规律,从而获取,这个地方需要注意的是,需要不断滑动滚动条图片才能加载,否则无法下载

    7、保存图片 ,注意 'w' 和‘wb’使用即可

    8、最后需要注意爬取频率,否则很容易被发现。

    # -- standard library --
    import os
    import time
    from urllib.parse import quote

    # -- third-party --
    #import tesserocr
    import pytesseract
    import requests
    from PIL import Image
    from lxml import etree
    # from pyquery import PyQuery as pq
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import Select
    from selenium.webdriver.support.ui import WebDriverWait
    
    # cookie_bro = browser.get_cookies()
    #
    # cookie1=cookie_bro[1]['value']
    # print('获取cookie成功')
    def login():
        """Log in to the library portal using the module-global `browser`/`wait`.

        Fills in the username/password, selects the institution, crops the
        captcha <img> out of a full-page screenshot so the (headless PhantomJS)
        user can read it from disk, then submits the form with the typed code.

        Returns:
            str: the post-login URL on success, or None if the captcha/submit
            elements could not be located before the wait timed out.
        """
        # -- username --
        browser.find_element_by_id("userName").send_keys("ckho")
        time.sleep(2)

        # -- password --
        browser.find_element_by_id("passWord").send_keys("chen13560034370")
        time.sleep(2)

        # -- institution drop-down: pick value "7320" --
        Select(browser.find_element_by_id("gid")).select_by_value("7320")

        # Screenshot the whole page, then crop out just the captcha image.
        browser.save_screenshot("screen.png")
        code_element = browser.find_element_by_id("vimg")
        print(code_element.location)  # e.g. {'x': 548, 'y': 523}
        left = code_element.location['x']
        top = code_element.location['y']
        right = left + code_element.size['width']
        bottom = top + code_element.size['height']  # was misnamed `height`
        im = Image.open("screen.png")
        im.crop((left, top, right, bottom)).save("screen4.png")

        # Captcha is typed by hand (see screen4.png); then submit.
        try:
            input_verify = wait.until(
                EC.element_to_be_clickable((By.ID, "verifyCode"))
            )
            result = input("请输入验证码")
            input_verify.send_keys(result)
            enter = wait.until(
                EC.element_to_be_clickable((By.ID, "submit"))
            )
            enter.click()
            print("登录成功")
            browser.save_screenshot("screen6.png")
            return browser.current_url
        except Exception:
            # Narrowed from BaseException so Ctrl-C/SystemExit still propagate.
            print(" Enter Error")
    
    #查询书籍信息并且用户可选择页数
    #查询书籍信息并且用户可选择页数
    def index_page(url):
        """Search for a book by name, list the hits, and let the user jump pages.

        Args:
            url: search-portal URL, reused when retrying after a timeout.

        Returns:
            str: the URL of the (possibly re-paged) result-list page.
        """
        book_name = input("请输入查找的书名")
        browser.find_element_by_id("sw").send_keys(book_name)
        browser.find_element_by_xpath('//*[@id="f2"]/div[2]/input[1]').click()
        print("当前页数为第一页")
        all_book_information()
        page = input("请输入想看页数:")
        print("...正在爬取第" + str(page) + "页")  # message previously ended mid-sentence
        current_url = browser.current_url  # URL of the first result page
        try:
            if int(page) > 1:
                browser.get(current_url)
                print(current_url)
                # page-number input box
                input_page = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#jpage"))
                )
                # the "jump" button beside it
                submit = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#pageinfo > input[type=button]:nth-child(13)'))
                )
                input_page.clear()
                input_page.send_keys(str(page))
                submit.click()
            all_book_information()
            return browser.current_url  # URL after switching pages
        except TimeoutException:
            # wait.until() raises selenium's TimeoutException, not the builtin
            # TimeoutError the original caught; also the original retry call
            # omitted the required `url` argument (TypeError).
            return index_page(url)
    
    def all_book_information():
        """Print every book entry on the current result page, numbered from 1."""
        tables = browser.find_elements_by_xpath('//table[@class="book1"]')
        texts = [table.text for table in tables]
        for numbered_entry in enumerate(texts, start=1):
            print(numbered_entry)
    
    
    #获取每本书具体链接并且返回每本书具体信息
    def get_detail_book(url):
        number = input("请输入你想要了解书的编号:")
        browser.get(url)
        addres = '//table[{}][@class="book1"]//a[@class="px14"]'.format(number)
        book_url = browser.find_element_by_xpath(addres).get_attribute("href")
        browser.get(book_url)
        detail_book_information = browser.find_elements_by_xpath('//div[@class="tubox"]//dd')
        for book in detail_book_information:
            print(book.text)
    
        return browser.current_url
    
    #进入图书馆文献传递页面
    def sent_book_emial(url):
        bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
        all_page_name(bqy_url)
        answer = input("是否需要下载此书 是请输入是1 否2 看其他书按3 ,下载书名页4 下载前言页5 下载版权页6 下载目录页7 下载正文页8")
        if int(answer)==1:
            base_url = 'http://book.ucdrs.superlib.net/gofirstdrs.jsp?'
            browser.get(url)
            sent_href = browser.find_element_by_xpath('//*[@id="libinfo"]/div[2]//a').get_attribute("href") #页面上文献传递对应href值
            #拆分href,拼接可进入图书参考咨询页面的url 通过进入文献传递,观察network中找到可进入图书参考咨询页面url,直接点击href是不能访问
            list1 = sent_href.split("?", 1)
            list2 = list1[1].split("'", 1)
            tscx_url = base_url+list2[0]
            browser.get(tscx_url)
            browser.save_screenshot("screen5.png")
            book_download()
        elif int(answer)==2:
            print("
    ")
            print("本次查询结束,欢迎下次使用!")
    
        elif int(answer) == 4:
            browser.get(url)
            bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
            base_url = bqy_download(bqy_url)
            smy_img(base_url)
        elif int(answer)==5:
            browser.get(url)
            bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
            base_url = bqy_download(bqy_url)
            qyy_img(base_url)
        elif int(answer)==6:
            browser.get(url)
            bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
            base_url = bqy_download(bqy_url)
            bqy_url(base_url)
        elif int(answer)==7:
            browser.get(url)
            bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
            base_url = bqy_download(bqy_url)
            mly_img(base_url)
        elif int(answer)==8:
            browser.get(url)
            bqy_url = browser.find_element_by_xpath('//*[@id="libinfo"]/div[1]/div/a[1]').get_attribute("href")
            base_url = bqy_download(bqy_url)
            zwy_img(base_url)
    
        else:
            url = "http://www.ucdrs.superlib.net/"
            browser.get(url)
            all_book_url_page = index_page(url)
            detail_book_url = get_detail_book(all_book_url_page)
            sent_book_emial(detail_book_url)
    
    
    def all_page_name(url):
        """Open the preview page and show which sections of the book are viewable."""
        browser.get(url)
        # renamed local so it no longer shadows this function's own name
        jump_element = browser.find_element_by_id("pagejump")
        available = jump_element.text.replace(" ", "")  # strip spaces
        print("该书可看部分仅有:" + available + "请按照此选择下载,否则可能导致下载错误")
    
    
    #图书下载
    def book_download():
        all_page = browser.find_element_by_xpath('//*[@id="content"]/form/ul/li[3]/p[1]').text
        print(all_page)
        print("每本图书咨询每次不超过50页")
        input1 = input("请输入想看的书初始页")
        input2 = input("请输入想看的书的末页")
        input_start = browser.find_element_by_id("frompage").send_keys(input1)
        input_led = browser.find_element_by_id("endpage").send_keys(input2)
        email = input("请输入你的邮箱账号")
        input_email = browser.find_element_by_id("email").send_keys(email)
        verifycode1 = input("请输入验证码")
        input_verifycode1 = browser.find_element_by_id("verifycode").send_keys(verifycode1)
        input_enter = browser.find_element_by_xpath('//li[@class="sumbit"]').click()
    
    #返回图片的url共同部分
    def bqy_download(url):
        browser.get(url)
        print(url)
        time.sleep(4) #注意 需要留个页面加载时间,模仿人阅读时候网页加载速度  否则加载不出来想要的图片链接
        browser.save_screenshot("screen8.png")
        first_img_url = browser.find_element_by_xpath('//*[@id="reader"]/div/div[1]/input').get_attribute("src")
        print(first_img_url)
        base_url = first_img_url[0:-13]
        print(base_url)
        return base_url
    
    #下载书名页
    def smy_img(base_url):
        i=1
        print("仅下载1页")
        while i<2:
                img_url = base_url + 'bok00{}'.format(i) + '?zoom=0&f=0'
                i += 1
                response = requests.get(img_url)
                print(img_url)
                with open("D:/pycharm/实战案例/前言页/" + str(i-1) + '.png', "wb") as f:
                    f.write(response.content)
                    print("success download")
    
    
                time.sleep(2)
    
    #下载版权页
    def bqy_img(base_url):
        i=1
        print("仅下载1页")
        while i<2:
                img_url = base_url + 'leg00{}'.format(i) + '?zoom=0&f=0'
                i += 1
                response = requests.get(img_url)
                print(img_url)
                with open("D:/pycharm/实战案例/版权页/" + str(i-1) + '.png', "wb") as f:
                    f.write(response.content)
                    print("success download")
    
    
    #下载前言页
    def qyy_img(base_url):
        i=1
        print("仅下载5页")
        while i<6:
                img_url = base_url + 'fow00{}'.format(i) + '?zoom=0&f=0'
                i += 1
                response = requests.get(img_url)
                print(img_url)
                with open("D:/pycharm/实战案例/前言页/" + str(i-1) + '.png', "wb") as f:
                    f.write(response.content)
                    print("success download")
                # try:
                #     response.headers["Accept-Encoding"]
                # except:
                #     break
                time.sleep(2)
    
    #下载目录页
    def mly_img(base_url):
        i=1
        print("仅下载3页")
        while i<4:
                img_url = base_url + '!0000{}'.format(i) + '?zoom=0&f=0'
                i += 1
                response = requests.get(img_url)
                print(img_url)
                with open("D:/pycharm/实战案例/目录页/" + str(i-1) + '.png', "wb") as f:
                    f.write(response.content)
                    print("success download")
                time.sleep(2)
    
    #下载正文页
    def zwy_img(base_url):
        i=1
        print("仅下载15页")
        while i<12:
                if i<16:
                    img_url = base_url + '00000{}'.format(i) + '?zoom=0&f=0'
                else:
                    img_url = base_url + '0000{}'.format(i) + '?zoom=0&f=0'
                i += 1
                response = requests.get(img_url)
                print(img_url)
                with open("D:/pycharm/实战案例/正文页/" + str(i-1) + '.png', "wb") as f:
                    f.write(response.content)
                    print("success download")
                time.sleep(2)
    
    
    if __name__ == '__main__':
        # Entry point: log in, search, open one book, then offer delivery/download.
        url = "http://www.ucdrs.superlib.net/login/login.action"
        # NOTE(review): the commented-out headers/cookies below are leftovers
        # from a requests-based attempt; the session cookies are stale.
        # headers = {
        #      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        #     "Accept-Encoding": "gzip, deflate",
        #     "Accept-Language": "zh-CN,zh;q=0.9",
        #     "Cache-Control": "max-age=0",
        #     "Connection": "keep-alive",
        #     "Cookie": "JSESSIONID=E9B8FFC8B023F0FC12A07A3ECDE91581.jp26; __dxca=d04d4dbb-57fb-4114-b080-190507ee4cbf; route=5ead36b501ee59635125fd6ef4221d0e; UM_distinctid=170b290e53493e-0372017c841b37-4313f6a-144000-170b290e5364a3; CNZZDATA2088844=cnzz_eid%3D382770322-1583543803-%26ntime%3D1583551301",
        #     "Host": "img.duxiu.com",
        #     "Upgrade-Insecure-Requests":"1",
        #     "User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        # }
        #
        # cookies = {
        #     "Cookie": "JSESSIONID=E9B8FFC8B023F0FC12A07A3ECDE91581.jp26; __dxca=d04d4dbb-57fb-4114-b080-190507ee4cbf; route=5ead36b501ee59635125fd6ef4221d0e; UM_distinctid=170b290e53493e-0372017c841b37-4313f6a-144000-170b290e5364a3; CNZZDATA2088844=cnzz_eid%3D382770322-1583543803-%26ntime%3D1583551301",
        # }
        # NOTE(review): PhantomJS support is deprecated in newer Selenium releases.
        browser = webdriver.PhantomJS()
        browser.get(url)
        wait = WebDriverWait(browser, 8)
        print("欢迎使用图书查询小程序")
        login()  # log in; captcha is typed manually
        all_book_url_page = index_page(url)  # choose a result page and list its books
        detail_book_url = get_detail_book(all_book_url_page)  # open one book's detail page
        sent_book_emial(detail_book_url)

    若有错误,请留言告诉我,谢谢!

  • 相关阅读:
    vue项目webpack配置terser-webpack-plugin 去掉项目中多余的debugger
    difference between count(1) and count(*)
    为什么PostgreSQL WAL归档很慢
    mysql_reset_connection()
    Oracle使用audit跟踪登录失败的连接信息
    .NET Standard 版本
    ASP.NET Web API版本
    我是如何用go-zero 实现一个中台系统的
    JAVA中文件写入的6种方法
    MySql 常用语句
  • 原文地址:https://www.cnblogs.com/chka/p/12447088.html
Copyright © 2020-2023  润新知