最近写了一个提取文档关键词的程序,使用 TF-IDF 方法进行提取,其中需要用 Python 读取 MS Word 文档和 MS PowerPoint 演示文稿中的内容。现将部分讲解和代码贴出来,请指正。
首先介绍一下 win32com,这是一个通过 COM 接口与 Windows 应用程序交互的 Python 模块。说实话,它的功能很强大:可以用来操作 Word、PowerPoint、Excel、Access,还可以模拟浏览器行为等。下载地址:http://starship.python.net/~skippy/win32/Downloads.html 。下面介绍它的部分功能,示例代码摘自网上的其他资料。
1. Word 功能:http://my.oschina.net/duxuefeng/blog/64137 (这篇写得比较清楚)
# Demonstration of driving MS Word through COM: open a document, insert and
# style text, find/replace in the body and in the header, edit a table,
# export to HTML, print, and shut Word down.
#
# NOTE: filenamein, filenameout, OldStr and NewStr are not defined in this
# snippet; assign them before running it.
# NOTE(review): `constants` is presumably only populated when makepy support
# for the Word type library has been generated (e.g. via
# win32com.client.gencache.EnsureDispatch) -- confirm on the target machine.
import win32com
from win32com.client import Dispatch, constants

w = win32com.client.Dispatch('Word.Application')
# Alternatively, start Word in a separate, independent process:
# w = win32com.client.DispatchEx('Word.Application')

# Run in the background: no window, no alert dialogs
w.Visible = 0
w.DisplayAlerts = 0

# Open an existing file
doc = w.Documents.Open(FileName=filenamein)
# worddoc = w.Documents.Add()  # create a new document instead

# Insert text at the very beginning of the document
myRange = doc.Range(0, 0)
myRange.InsertBefore('Hello from Python!')

# Apply a style. The original assigned the result of Select() to wordSel and
# styled that; the object to style is the application's Selection, so select
# first and then fetch it.
myRange.Select()
wordSel = w.Selection
wordSel.Style = constants.wdStyleHeading1

# Find/replace in the document body
w.Selection.Find.ClearFormatting()
w.Selection.Find.Replacement.ClearFormatting()
w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

# Find/replace in the page header
w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

# Table operations
doc.Tables[0].Rows[0].Cells[0].Range.Text = '123123'
# The original called this on `worddoc`, which only exists in the
# commented-out Documents.Add() line above; the opened document is `doc`.
doc.Tables[0].Rows.Add()  # append a row

# Convert to HTML
wc = win32com.client.constants
w.ActiveDocument.WebOptions.RelyOnCSS = 1
w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
w.ActiveDocument.WebOptions.BrowserLevel = 0  # constants.wdBrowserLevelV4
w.ActiveDocument.WebOptions.OrganizeInFolder = 0
w.ActiveDocument.WebOptions.UseLongFileNames = 1
w.ActiveDocument.WebOptions.RelyOnVML = 0
w.ActiveDocument.WebOptions.AllowPNG = 1
w.ActiveDocument.SaveAs(FileName=filenameout, FileFormat=wc.wdFormatHTML)

# Print
doc.PrintOut()

# Close all documents without saving, then quit Word
# doc.Close()
w.Documents.Close(wc.wdDoNotSaveChanges)
w.Quit()
2. Excel功能:同上的网址
3. Access功能:
# Create an empty Access database (.mdb) through COM automation.
import win32com.client

oAccess = win32com.client.Dispatch('Access.Application')
try:
    # The backslashes of this Windows path were lost in the original paste
    # ('C:UsershansDocumentsNewDb.mdb'); restored here as a raw string.
    DbFile = r'C:\Users\hans\Documents\NewDb.mdb'
    dbLangGeneral = ';LANGID=0x0409;CP=1252;COUNTRY=0'
    # dbVersion40 has the value 64
    dbVersion = 64
    oAccess.DBEngine.CreateDatabase(DbFile, dbLangGeneral, dbVersion)
finally:
    # Always shut Access down, even if CreateDatabase raised,
    # so that no Access process is left running.
    oAccess.Quit()
    del oAccess
4. 模拟浏览器行为:http://www.cnblogs.com/chenzehe/archive/2010/09/01/1814397.html
#!/usr/bin/env python
# -*- coding:UTF-8 -*-
'''
Created on 2010-9-1
@author: chenzehe

Log in to cnblogs.com by automating a hidden Internet Explorer instance,
print the nickname of the logged-in user, then log out again.
'''
import win32com.client
from time import sleep

loginurl='http://passport.cnblogs.com/login.aspx'
loginouturl='http://passport.cnblogs.com/logout.aspx'
username='XXX'
password='XXX'

# ReadyState value the script waits for before touching the page.
READYSTATE_COMPLETE = 4
# Maximum number of seconds to wait for a page before giving up.
PAGE_TIMEOUT = 60


def wait_until_ready(browser, expected_url=None, timeout=PAGE_TIMEOUT, echo=False):
    '''Poll `browser` once per second until its page has finished loading.

    browser      -- the InternetExplorer.Application COM object
    expected_url -- if given, LocationURL must also equal this string
    timeout      -- seconds to wait before raising RuntimeError
    echo         -- print the ReadyState value on every poll

    Raises RuntimeError when the page is not ready within `timeout` seconds
    (the original loops had no upper bound and could spin forever, e.g. after
    a failed login that never reaches the expected URL).
    '''
    waited = 0
    while True:
        state = browser.ReadyState
        if echo:
            print(state)
        url_matches = expected_url is None or str(browser.LocationURL) == expected_url
        if state == READYSTATE_COMPLETE and url_matches:
            return
        if waited >= timeout:
            raise RuntimeError('page was not ready after %d seconds' % timeout)
        sleep(1)
        waited += 1


ie = win32com.client.Dispatch("InternetExplorer.Application")
ie.Visible = 0

try:
    ie.Navigate(loginurl)
    print("打开登陆页面")
    wait_until_ready(ie)
    print("页面载入完毕,输入用户名密码")

    # Fill in the login form and submit it
    ie.Document.getElementById("tbUserName").value=username
    ie.Document.getElementById("tbPassword").value=password
    ie.Document.getElementById("btnLogin").click()

    # A successful login ends up on the home page
    wait_until_ready(ie, expected_url="http://home.cnblogs.com/", echo=True)
    print("登陆成功")
    print('你的昵称是:')
    print(ie.Document.getElementById('lnk_current_user').title)

    # cnblogs only allows a single login session, so log out again
    print('注销!')
    ie.Navigate(loginouturl)
    # Give the navigation a moment to start, then let it finish before
    # the browser is closed below.
    sleep(1)
    wait_until_ready(ie)
finally:
    # The browser window is hidden; without Quit() the IE process would
    # keep running after the script ends.
    ie.Quit()
5.播放mp3文件:http://www.sharejs.com/codes/python/5733
# Play an audio file through the Windows Media Player COM object.
from win32com.client import Dispatch

mp = Dispatch("WMPlayer.OCX")

# Point newMedia() at an audio file that exists on your machine, e.g. an mp3 ...
#tune = mp.newMedia("C:/Program Files/Common Files/HP/Memories Disc/2.0/audio/Swing.mp3")
# ... or one copied into the working folder ...
#tune = mp.newMedia("Bier1.mp3")
# ... wma files work as well; this sound came with Windows XP.
tune = mp.newMedia("C:/WINDOWS/system32/oobe/images/title.wma")

# Queue the file on the current playlist and start playback
playlist = mp.currentPlaylist
playlist.appendItem(tune)

controls = mp.controls
controls.play()

# Block until the user presses Enter, then stop playback
raw_input("Press Enter to stop playing")
controls.stop()
真心感觉这个东西很强大!言归正传,下面给出将 Word 和 PowerPoint 文件转换为 txt 的代码:
#coding:utf-8
"""Convert MS Word and MS PowerPoint files to plain text via COM automation."""
import codecs
import os

import pythoncom
import win32com
import win32con
import win32gui
from win32com.client import Dispatch

# Numeric values passed to Word: the SaveAs file format used for text output
# and the "do not save changes" option for Document.Close.
WD_FORMAT_TEXT = 2
WD_DO_NOT_SAVE_CHANGES = 0


class MSOffice2txt(object):
    """Owns a Word and/or PowerPoint COM application and converts files to text.

    Call close() when done (or use the instance in a ``with`` statement) so
    that the Office processes are shut down.
    """

    def __init__(self, fileType=('doc', 'ppt')):
        """Start the Office applications named in `fileType`.

        fileType -- list (or tuple) containing 'doc' and/or 'ppt'; other
                    entries are ignored. Applications not started here are
                    started lazily by translate().

        Raises TypeError if `fileType` is not a list or tuple. (The original
        returned an error string from __init__, which Python itself turns
        into a TypeError; the error is now raised explicitly and before any
        COM resource is acquired.)
        """
        self.docCom = None
        self.pptCom = None
        self._comInitialized = False
        if not isinstance(fileType, (list, tuple)):
            raise TypeError('Error, please check the fileType, it must be list[]')
        pythoncom.CoInitialize()
        self._comInitialized = True
        try:
            for ft in fileType:
                if ft == 'doc':
                    self.docCom = self.docApplicationOpen()
                elif ft == 'ppt':
                    self.pptCom = self.pptApplicationOpen()
        except Exception:
            # Do not leave a half-started Office process behind.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Quit the running Office applications and release COM.

        Safe to call more than once: the stored references are cleared, so a
        second call does nothing.
        """
        try:
            self.docApplicationClose(self.docCom)
        finally:
            self.docCom = None
            try:
                self.pptApplicationClose(self.pptCom)
            finally:
                self.pptCom = None
                # Balance the CoInitialize() made in __init__, after the
                # references held by this object have been dropped.
                if self._comInitialized:
                    self._comInitialized = False
                    pythoncom.CoUninitialize()

    def _hideWindow(self, title):
        """Hide the top-level window whose caption is exactly `title`, if found."""
        hwnd = win32gui.FindWindow(None, title)
        if hwnd:
            win32gui.ShowWindow(hwnd, win32con.SW_HIDE)

    def docApplicationOpen(self):
        """Start Word, suppress its alerts, hide its window; return the COM object."""
        docCom = Dispatch('Word.Application')
        docCom.Visible = 1
        docCom.DisplayAlerts = 0
        self._hideWindow('Microsoft Word')
        return docCom

    def docApplicationClose(self, docCom):
        """Quit the given Word application; a None argument is ignored."""
        if docCom is not None:
            docCom.Quit()

    def doc2Txt(self, docCom, docFile, txtFile):
        """Save the Word document `docFile` as the text file `txtFile`.

        Both paths are made absolute first: Word resolves relative paths
        against its own working directory, not the Python process's.
        """
        doc = docCom.Documents.Open(FileName=os.path.abspath(docFile), ReadOnly=1)
        try:
            doc.SaveAs(os.path.abspath(txtFile), WD_FORMAT_TEXT)
        finally:
            # Close even when SaveAs fails so the document is not left open.
            doc.Close(WD_DO_NOT_SAVE_CHANGES)

    def pptApplicationOpen(self):
        """Start PowerPoint, suppress its alerts, hide its window; return the COM object."""
        pptCom = Dispatch('PowerPoint.Application')
        pptCom.Visible = 1
        pptCom.DisplayAlerts = 0
        self._hideWindow('Microsoft PowerPoint')
        return pptCom

    def pptApplicationClose(self, pptCom):
        """Quit the given PowerPoint application; a None argument is ignored."""
        if pptCom is not None:
            pptCom.Quit()

    def ppt2txt(self, pptCom, pptFile, txtFile):
        """Write the text of every text-bearing shape in `pptFile` to `txtFile`.

        The output is encoded as gb18030. Each shape's text is followed by a
        line break; the original wrote the texts back to back, which glued
        the last word of one shape to the first word of the next.
        """
        ppt = pptCom.Presentations.Open(os.path.abspath(pptFile),
                                        ReadOnly=1, Untitled=0, WithWindow=0)
        try:
            with codecs.open(txtFile, "w", 'gb18030') as f:
                # Slide and shape collections are indexed from 1.
                for i in range(1, ppt.Slides.Count + 1):
                    shapes = ppt.Slides(i).Shapes
                    for j in range(1, shapes.Count + 1):
                        shape = shapes(j)
                        if shape.HasTextFrame:
                            f.write(shape.TextFrame.TextRange.Text)
                            f.write(u'\r\n')
        finally:
            # Close the presentation even if reading or writing failed.
            ppt.Close()

    def translate(self, filename, txtFilename):
        """Convert `filename` to the text file `txtFilename`.

        The file type is taken from the extension, compared
        case-insensitively: .doc/.docx go through Word, .ppt/.pptx through
        PowerPoint. The required application is started on demand.

        Returns True if the file was converted, False if the extension is
        not supported.
        """
        ext = os.path.splitext(filename)[1].lower()
        if ext in ('.doc', '.docx'):
            if self.docCom is None:
                self.docCom = self.docApplicationOpen()
            self.doc2Txt(self.docCom, filename, txtFilename)
            return True
        if ext in ('.ppt', '.pptx'):
            if self.pptCom is None:
                self.pptCom = self.pptApplicationOpen()
            self.ppt2txt(self.pptCom, filename, txtFilename)
            return True
        return False


if __name__=='__main__':
    msoffice = MSOffice2txt()
    try:
        # Backslash escaped explicitly; the path value is unchanged.
        filename = u'F:\\study.docx'
        if msoffice.translate(filename, 'temp.txt'):
            print('Successed!')
        else:
            print('Failed!')
    finally:
        # Shut the Office applications down even if the conversion raised.
        msoffice.close()