• python自动化之PDF


    ###################################处理PDF和Word文档###################################

    '''

    PDF和Word文档是二进制文件,除了文本之外,

    它们还保存了许多字体、颜色和布局信息

    '''

    '''

    从PDF提取文本

    '''

    ###################################从PDF提取文本###################################

    import PyPDF2

    # Read the page count and the text of the first page of a PDF.
    # NOTE(review): the path literal looks like a Windows path whose
    # backslashes were lost -- confirm the intended location.
    # The `with` block closes the file even if PyPDF2 raises while parsing;
    # pdfReader/pageObj should not be used for further reads afterwards.
    with open(r'C:UsersAdministratorDesktop est.pdf','rb') as pdfFileObj:
        pdfReader=PyPDF2.PdfFileReader(pdfFileObj)

        # Keep the results in names: a bare expression only echoes a value in
        # an interactive session and is silently discarded in a script.
        numPages=pdfReader.numPages

        pageObj=pdfReader.getPage(0)

        pageText=pageObj.extractText()

    ###################################解密PDF#########################################

    import PyPDF2

    # Open a (possibly password-protected) PDF and fetch its first page.
    # NOTE(review): the path literal looks like a Windows path whose
    # backslashes were lost -- confirm the intended location.
    with open(r'C:UsersAdministratorDesktop est.pdf','rb') as encryptedFile:
        pdfReader=PyPDF2.PdfFileReader(encryptedFile)

        # Decrypt *before* touching any page: the original requested page 0
        # first, which cannot be read from a still-encrypted document.
        if pdfReader.isEncrypted:    # is the document encrypted?
            # Supply the password. Per the PyPDF2 docs decrypt() returns 0
            # when the password does not match -- the result is not checked
            # here, so a wrong password surfaces on the getPage() call below.
            pdfReader.decrypt('rosebud')

        pageObj=pdfReader.getPage(0)

    ###################################创建PDF#########################################

    '''

    PyPDF2不能将任意文本写入PDF:

    PyPDF2写入PDF的能力,仅限于从其他PDF中拷贝页面、旋转页面、重叠页面和加密文件

    '''

    '''

    一般方式:

    1、打开一个或多个已有的PDF(源PDF),得到PdfFileReader对象

    2、创建一个新的PdfFileWriter对象

    3、将页面从PdfFileReader对象拷贝到PdfFileWriter对象中

    4、利用PdfFileWriter对象写入输出的PDF

    '''

    #####################################################拷贝页面###########################################################

    def merge(pdf_one, pdf_two, filename='my.pdf', output_dir=r'C:UsersAdministratorDesktop'):
        """Concatenate two PDF files into one output PDF.

        All pages of ``pdf_one`` are copied first, followed by all pages of
        ``pdf_two``. Progress messages are printed while the pages are copied.

        Args:
            pdf_one: Path of the first source PDF.
            pdf_two: Path of the second source PDF.
            filename: Name of the PDF file to create.
            output_dir: Directory in which the output file is created.
                NOTE(review): the default looks like a Windows path whose
                backslashes were lost -- confirm the intended location.

        Returns:
            None. The merged document is written to disk.
        """
        import os  # local import so the function does not rely on file-level imports

        # Join with a real path separator; plain string concatenation glued
        # the file name directly onto the last directory component.
        pdf_name = os.path.join(output_dir, filename)

        # open() instead of the Python-2-only file() builtin; the context
        # managers close both inputs even when PyPDF2 raises.
        with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
            pdf_input_one = PyPDF2.PdfFileReader(input_one)
            pdf_input_two = PyPDF2.PdfFileReader(input_two)

            numOne = pdf_input_one.getNumPages()
            numTwo = pdf_input_two.getNumPages()
            # Formatted string keeps the "N M" output identical on Python 2 and 3.
            print('%s %s' % (numOne, numTwo))

            pdf_output = PyPDF2.PdfFileWriter()

            for pageNum in range(numOne):
                print('hereo')
                pdf_output.addPage(pdf_input_one.getPage(pageNum))

            for pageNum in range(numTwo):
                print('heret')
                pdf_output.addPage(pdf_input_two.getPage(pageNum))

            print(pdf_name)

            # The sources must still be open here: the writer pulls the page
            # data from the readers when it serialises the document.
            with open(pdf_name, 'wb') as output_stream:
                pdf_output.write(output_stream)

        print('Done!')

    # Demo run: concatenate the two sample PDFs; filename and output_dir are
    # left at merge()'s defaults.
    # NOTE(review): both path literals look like Windows paths whose
    # backslashes were lost -- confirm the intended locations.
    merge(
        pdf_one=r'C:UsersAdministratorDesktopPairs_Trading_Quantitative Methods and Analysis.pdf',
        pdf_two=r'C:UsersAdministratorDesktopdeMontjoye.SM.pdf',
    )

    #####################################################旋转页面###########################################################

    '''

    利用rotateClockwise()和rotateCounterClockwise()方法

    PDF文档的页面也可以旋转90度的整数倍,向这些方法传入

    整数90、180或270

    '''

    def merge(pdf_one, pdf_two, filename='my.pdf', output_dir=r'C:UsersAdministratorDesktop'):
        """Concatenate two PDF files, rotating every page 90 degrees clockwise.

        All pages of ``pdf_one`` are copied first, followed by all pages of
        ``pdf_two``; each page is passed through ``rotateClockwise(90)`` before
        it is added to the output. Progress messages are printed along the way.

        Args:
            pdf_one: Path of the first source PDF.
            pdf_two: Path of the second source PDF.
            filename: Name of the PDF file to create.
            output_dir: Directory in which the output file is created.
                NOTE(review): the default looks like a Windows path whose
                backslashes were lost -- confirm the intended location.

        Returns:
            None. The rotated, merged document is written to disk.
        """
        import os  # local import so the function does not rely on file-level imports

        # Join with a real path separator; plain string concatenation glued
        # the file name directly onto the last directory component.
        pdf_name = os.path.join(output_dir, filename)

        # open() instead of the Python-2-only file() builtin; the context
        # managers close both inputs even when PyPDF2 raises.
        with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
            pdf_input_one = PyPDF2.PdfFileReader(input_one)
            pdf_input_two = PyPDF2.PdfFileReader(input_two)

            numOne = pdf_input_one.getNumPages()
            numTwo = pdf_input_two.getNumPages()
            # Formatted string keeps the "N M" output identical on Python 2 and 3.
            print('%s %s' % (numOne, numTwo))

            pdf_output = PyPDF2.PdfFileWriter()

            for pageNum in range(numOne):
                print('hereo')
                pageObj = pdf_input_one.getPage(pageNum)
                pageObj = pageObj.rotateClockwise(90)
                pdf_output.addPage(pageObj)

            for pageNum in range(numTwo):
                print('heret')
                pageObj = pdf_input_two.getPage(pageNum)
                pageObj = pageObj.rotateClockwise(90)
                pdf_output.addPage(pageObj)

            print(pdf_name)

            # The sources must still be open here: the writer pulls the page
            # data from the readers when it serialises the document.
            with open(pdf_name, 'wb') as output_stream:
                pdf_output.write(output_stream)

        print('Done!')

    # Demo run of the merge() defined directly above; filename and output_dir
    # are left at their defaults.
    # NOTE(review): both path literals look like Windows paths whose
    # backslashes were lost -- confirm the intended locations.
    merge(
        pdf_one=r'C:UsersAdministratorDesktopPairs_Trading_Quantitative Methods and Analysis.pdf',
        pdf_two=r'C:UsersAdministratorDesktopdeMontjoye.SM.pdf',
    )

    #####################################################叠加页面###########################################################

    import PyPDF2

    # Overlay the first page of a second PDF onto the first page of the main
    # document, then write the main document out with that merged first page.
    # NOTE(review): the path literals look like Windows paths whose
    # backslashes were lost -- confirm the intended locations.
    # Both sources are opened in one `with` so they are closed even on error;
    # the original never closed the watermark file at all.
    with open(r'C:UsersAdministratorDesktopPairs_Trading_Quantitative Methods and Analysis.pdf','rb') as minutesFile, \
            open(r'C:UsersAdministratorDesktopdeMontjoye.SM.pdf','rb') as watermarkFile:
        pdfReader=PyPDF2.PdfFileReader(minutesFile)

        minutesFirstPage=pdfReader.getPage(0)

        pdfWatermarkReader=PyPDF2.PdfFileReader(watermarkFile)

        # mergePage() modifies minutesFirstPage in place.
        minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

        pdfWriter=PyPDF2.PdfFileWriter()

        pdfWriter.addPage(minutesFirstPage)

        # Page 0 was already added (merged), so copy the rest starting at 1.
        for pageNum in range(1,pdfReader.numPages):
            pageObj=pdfReader.getPage(pageNum)

            pdfWriter.addPage(pageObj)

        # Write while the sources are still open: the writer reads the page
        # data from them when it serialises the document.
        with open(r'C:UsersAdministratorDesktopmerge.pdf','wb') as resultPdfFile:
            pdfWriter.write(resultPdfFile)

    #####################################################加密PDF###########################################################

    import PyPDF2

    # Copy every page of a PDF into a new, password-protected PDF.
    # NOTE(review): the path literals look like Windows paths whose
    # backslashes were lost -- confirm the intended locations.
    # open() replaces the Python-2-only file() builtin (matching the open()
    # calls used elsewhere in this file), and `with` closes the source handle,
    # which the original left open.
    with open(r'C:UsersAdministratorDesktopdeMontjoye.SM.pdf','rb') as pdfFile:
        pdfReader=PyPDF2.PdfFileReader(pdfFile)

        pdfWriter=PyPDF2.PdfFileWriter()

        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        # Set the password before writing so the output is encrypted.
        pdfWriter.encrypt('swordfish')

        # Write while the source is still open: the writer reads the page
        # data from it when it serialises the document.
        with open(r'C:UsersAdministratorDesktop .pdf','wb') as resultPdf:
            pdfWriter.write(resultPdf)

  • 相关阅读:
    改变Prompt默认路径,Change Default Visual Studio Command Prompt Location
    msbuild,Build failed with Error MSB3073 exited with code 1
    the filename directory name or volume label syntax is incorrect
    常用sql语句记录
    EF中多表公共字段,以及设置EntityBase使所有实体类继承自定义类
    一种在MVC3框架里面设置模板页的方法,不使用_ViewStart
    Java内存模型
    Effective Java(1)-创建和销毁对象
    WireShark 查看UDP码流的丢包率
    拖延心理学读后感
  • 原文地址:https://www.cnblogs.com/dudumiaomiao/p/7242002.html
Copyright © 2020-2023  润新知