• python_读取 doc,docx,pdf


    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import docx
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    
    from win32com import client
    import sys
    # Python 2 only: the sys module is reloaded and then
    # sys.setdefaultencoding() is used to make gb2312 the interpreter-wide
    # default string encoding.
    # NOTE(review): reload() is not a builtin on Python 3 and this setting
    # affects every module in the process -- confirm it is still required.
    reload(sys)
    sys.setdefaultencoding('gb2312')
    
    def readDocx(docxPath):
        """Return the text of a .docx file, one paragraph per line.

        Args:
            docxPath: Path of the .docx document, passed to ``docx.Document``.

        Returns:
            The ``text`` of every paragraph, stripped of surrounding
            whitespace and joined with newline characters. Paragraphs that
            are empty after stripping are kept as empty lines.
        """
        doc = docx.Document(docxPath)
        # The separator has to be the escape sequence '\n': a physical line
        # break between the quotes is an unterminated string literal.
        return '\n'.join(p.text.strip() for p in doc.paragraphs)
    def readPdf(pdfPath):
        """Extract the text of a PDF file with pdfminer.

        Args:
            pdfPath: Path of the PDF file; it is opened in binary mode.

        Returns:
            The contents of the ``StringIO`` buffer after every page yielded
            by ``PDFPage.get_pages`` has been processed by a ``TextConverter``
            configured with the utf-8 codec.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # open() replaces the Python-2-only file() builtin, and the
                # with block closes the PDF even if a page fails to process.
                with open(pdfPath, 'rb') as fp:
                    # Empty page-number set, maxpages=0 and an empty password:
                    # per the pdfminer documentation these select all pages of
                    # an unencrypted document.
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                device.close()
            # Read the buffer only after the device is closed, then release it.
            return retstr.getvalue()
        finally:
            retstr.close()
    def readDoc(docPath, tmpPath='c:/temp.txt', encoding='gbk'):
        """Return the text of a legacy .doc file, one line per row.

        Microsoft Word is driven through COM to save the document as a text
        file, which is then read back and decoded.

        Args:
            docPath: Path of the .doc file handed to ``Documents.Open``.
            tmpPath: Where Word writes the intermediate text export. The file
                is left in place afterwards.
            encoding: Codec used to decode the bytes of the exported file.

        Returns:
            The exported lines, each decoded and stripped of surrounding
            whitespace, joined with newline characters.
        """
        word = client.Dispatch('Word.Application')
        try:
            # Open the existing document.
            doc = word.Documents.Open(docPath)
            try:
                # 2 is wdFormatText (plain text) in Word's WdSaveFormat
                # enumeration.
                doc.SaveAs(tmpPath, 2)
            finally:
                doc.Close()
        finally:
            # Quit Word even when opening or saving fails, so the instance
            # obtained above is not left behind.
            word.Quit()
        # Read raw bytes and decode explicitly; this behaves the same on
        # Python 2 and Python 3. strip() removes the trailing line ending.
        with open(tmpPath, 'rb') as f:
            return '\n'.join(line.decode(encoding).strip() for line in f)
    if __name__ == '__main__':
        # Demo entry point: print the text of a sample .doc file. readDocx()
        # and readPdf() are used the same way with a .docx / .pdf path.
        docValue = readDoc('d:/2.doc')
        # The parenthesised form prints the same on Python 2 and is also
        # valid Python 3 syntax.
        print(docValue)
  • 相关阅读:
    一个简单的makefile,一次性编译本文件夹下所有的cpp文件
    c++ 最短路两种算法
    C++语言十进制数,CDecimal(未完成)
    C语言面向对象的简便方法
    C语言2048
    C图书借还示例
    Javascript 备忘
    原型与原型链
    css3动画-跳动圈
    学习css3动画
  • 原文地址:https://www.cnblogs.com/zy900406/p/6654017.html
Copyright © 2020-2023  润新知