#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract plain text from .docx, .pdf and legacy .doc files.

* ``readDocx`` uses python-docx.
* ``readPdf`` uses pdfminer.
* ``readDoc`` automates Microsoft Word through COM (pywin32), so it only
  works on a Windows machine with Word installed.
"""
import sys

import docx
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    # Python 3 has no cStringIO; presumably used together with pdfminer.six.
    from io import StringIO
from win32com import client

if sys.version_info[0] == 2:
    # Python 2 only: change the implicit str<->unicode codec to GB2312 so
    # that printing the extracted Chinese text does not raise
    # UnicodeEncodeError.  Neither ``reload`` nor ``setdefaultencoding``
    # exists on Python 3, hence the version guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('gb2312')

# Value of Word's WdSaveFormat.wdFormatText ("save as plain text").
WD_FORMAT_TEXT = 2


def readDocx(docxPath):
    """Return the text of a .docx file as a single string.

    Each paragraph is stripped of surrounding whitespace and the
    paragraphs are joined with a single space.

    :param docxPath: path of the .docx file to read.
    :return: the joined paragraph text.
    """
    doc = docx.Document(docxPath)
    return ' '.join(p.text.strip() for p in doc.paragraphs)


def readPdf(pdfPath):
    """Return the text of every page of a PDF file.

    :param pdfPath: path of the PDF file to read.
    :return: the text pdfminer wrote to the in-memory buffer (the
        converter is configured with the ``utf-8`` codec).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(pdfPath, 'rb') as fp:
                # An empty page-number set with maxpages=0 is what the
                # original code passed; no page restriction is requested.
                pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the converter even if a page fails to parse.
            device.close()
        # Read the buffer before it is closed below.
        return retstr.getvalue()
    finally:
        retstr.close()


def readDoc(docPath, tempPath='c:/temp.txt'):
    """Return the text of a legacy .doc file as a single string.

    Word is started through COM, asked to save the document as plain
    text to ``tempPath``, and shut down again.  The text file is then
    read back, every line is stripped, and the lines are joined with a
    single space.  The text file is left on disk afterwards.

    :param docPath: path of the .doc file to open in Word.
    :param tempPath: where Word writes the intermediate plain-text file.
        Defaults to the location this script has always used.
    :return: the joined line text, decoded from GBK.
    """
    word = client.Dispatch('Word.Application')
    try:
        # Open the existing document.
        doc = word.Documents.Open(docPath)
        try:
            doc.SaveAs(tempPath, WD_FORMAT_TEXT)
        finally:
            doc.Close()
    finally:
        # Always shut Word down, otherwise a failed conversion leaves a
        # WINWORD process running.
        word.Quit()

    # Read bytes and decode explicitly so the same code works on Python 2
    # and 3.  NOTE(review): GBK presumably matches the ANSI code page Word
    # used for the export on the original (Chinese-locale) machine - confirm
    # before running elsewhere.
    with open(tempPath, 'rb') as f:
        return ' '.join(line.decode('gbk').strip() for line in f)


if __name__ == '__main__':
    # readDocx('d:/1.docx') and readPdf('d:/3.pdf') can be tried the same way.
    docValue = readDoc('d:/2.doc')
    print(docValue)