motivation:
毕设有一个小部分,是抽取从各种公开课下载的PPT的内容。之前用Python的win32com弄到哭也只弄出了纯文本,而不能识别哪些是标题(理论上应该是可以的?我看到源码里有HaveTitle等等的内容,然而调用就跪,QAQ)。然后就只抽出了纯文本,放弃治疗改用维基百科做知识库了。扔在一边就去做别的了。
于是这周老师又重新让我抽标题OTL,就搜到了这个看起来很厉害的东西。学习ING。
prework:
- 下载:https://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=5124 download里的两个文件,先装v2再装tool
- 在项目里加引用:在解决方案资源管理器里右键“引用”,选择“添加引用”,在“扩展”里勾选DocumentFormat.OpenXml,在“框架”里勾选WindowsBase(System.IO.Packaging就在这个程序集里)
gao:
reference:https://msdn.microsoft.com/zh-cn/library/cc850843(v=office.14) (获取演示文稿中的所有幻灯片的标题);https://msdn.microsoft.com/zh-cn/library/cc536290.aspx (PresentationDocument 方法)
示例代码里大部分说的挺清楚的~在此基础上可以获取一个大概的框架。在main函数里call GetSlideTitles(filename)基本就可以跑起来辣
运行示例代码的时候发现挂了 = =,报的错是“The document cannot be opened because there is an invalid part with an unexpected content type. ...”。在Stack Overflow上发现了一个解决方法,亲测可用:在调用GetSlideTitles之前先调用一次FixPowerpoint(它会删掉pptx包里的打印机设置部件printerSettings1.bin以及指向它的关系),就可以让文件正常打开了。FixPowerpoint代码如下:
/// <summary>
/// Removes the printer-settings part (and the relationships pointing at it) from a
/// .pptx package so that the Open XML SDK can open the file afterwards.
/// The file is modified in place.
/// </summary>
/// <param name="fileName">Path of the .pptx file to repair.</param>
private static void FixPowerpoint(string fileName)
{
    const string PrinterSettingsRelationshipType =
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/printerSettings";

    Console.WriteLine(fileName);

    // The using statement closes (and flushes) the package; no explicit Close() is needed.
    using (Package wdPackage = Package.Open(fileName, FileMode.Open, FileAccess.ReadWrite))
    {
        // Uri of the printer settings part.
        var binPartUri = new Uri("/ppt/printerSettings/printerSettings1.bin", UriKind.Relative);
        if (!wdPackage.PartExists(binPartUri))
        {
            return;
        }

        // Part URIs must be relative. With RelativeOrAbsolute a rooted path can be parsed
        // as an absolute file URI on non-Windows runtimes, which GetPart rejects.
        var presPartUri = new Uri("/ppt/presentation.xml", UriKind.Relative);
        var presPart = wdPackage.GetPart(presPartUri);

        // Materialize the matches first: relationships cannot be deleted while the
        // relationship collection is being enumerated. Deleting every match (instead of
        // SingleOrDefault) avoids an InvalidOperationException when more than one exists.
        var printerRelationships = presPart.GetRelationships()
            .Where(r => r.RelationshipType.Equals(PrinterSettingsRelationshipType, StringComparison.OrdinalIgnoreCase))
            .ToList();
        foreach (var relationship in printerRelationships)
        {
            presPart.DeleteRelationship(relationship.Id);
        }

        // Delete the part itself.
        wdPackage.DeletePart(binPartUri);
    }
}
接下来加入提取文本的地方,DocumentFormat.OpenXml.Drawing.Paragraph可以抓出paragraph,DocumentFormat.OpenXml.Drawing.Text抓出text,核心基本是这样的。
// Collect the text of every paragraph on the slide, one entry per paragraph.
// Empty paragraphs and paragraphs identical to the title text are left out.
LinkedList<string> texts = new LinkedList<string>();
foreach (var para in slidePart.Slide.Descendants<DocumentFormat.OpenXml.Drawing.Paragraph>())
{
    // Concatenate all text runs of this paragraph.
    StringBuilder buffer = new StringBuilder();
    foreach (var run in para.Descendants<DocumentFormat.OpenXml.Drawing.Text>())
    {
        buffer.Append(run.Text);
    }

    if (buffer.Length == 0)
    {
        continue;
    }

    string paragraphText = buffer.ToString();
    if (paragraphText != titleText.ToString())
    {
        texts.AddLast(paragraphText);
    }
}
最后我有一个列出所有ppt名字的文本文件(每行一个名字,不带.pptx后缀),所以就逐行读取,把所有的ppt都处理完。
写到json格式的文件(最后写了.jl,因为懒)
全部代码:在马赛克里改自己的路径就能用了噗
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Packaging;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Presentation;
using D = DocumentFormat.OpenXml.Drawing;

namespace getTitle
{
    /// <summary>
    /// Batch tool: for every presentation named in a list file, writes one JSON line per
    /// slide (title plus the remaining paragraph texts) to a ".jl" file.
    /// </summary>
    class Program
    {
        private const string PrinterSettingsRelationshipType =
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/printerSettings";

        /// <summary>
        /// Opens a presentation file read-only and appends one JSON line per titled slide to <paramref name="store"/>.
        /// </summary>
        /// <param name="presentationFile">Path of the .pptx file to read.</param>
        /// <param name="store">Path of the output file; lines are appended.</param>
        public static void GetSlideTitles(string presentationFile, string store)
        {
            // Open the presentation as read-only.
            using (PresentationDocument presentationDocument = PresentationDocument.Open(presentationFile, false))
            {
                GetSlideTitles(presentationDocument, store);
            }
        }

        /// <summary>
        /// Walks the slides of an open presentation in slide order and appends one JSON line
        /// per titled slide to <paramref name="store"/>.
        /// </summary>
        /// <param name="presentationDocument">The open presentation.</param>
        /// <param name="store">Path of the output file; lines are appended.</param>
        /// <exception cref="ArgumentNullException"><paramref name="presentationDocument"/> is null.</exception>
        public static void GetSlideTitles(PresentationDocument presentationDocument, string store)
        {
            if (presentationDocument == null)
            {
                throw new ArgumentNullException("presentationDocument");
            }

            PresentationPart presentationPart = presentationDocument.PresentationPart;
            if (presentationPart == null || presentationPart.Presentation == null)
            {
                return;
            }

            Presentation presentation = presentationPart.Presentation;
            if (presentation.SlideIdList == null)
            {
                return;
            }

            // SlideIdList is iterated in document order, which is the slide order.
            foreach (var slideId in presentation.SlideIdList.Elements<SlideId>())
            {
                // GetSlide rejects null, so a relationship that does not resolve to a
                // SlidePart surfaces as an ArgumentNullException.
                SlidePart slidePart = presentationPart.GetPartById(slideId.RelationshipId) as SlidePart;
                GetSlide(slidePart, store);
            }
        }

        /// <summary>
        /// Appends one JSON line of the form {"Title":"...","Content":"a,b,c"} for the slide.
        /// Nothing is written when the slide has no title text or no other paragraph text.
        /// </summary>
        /// <param name="slidePart">The slide to extract text from.</param>
        /// <param name="store">Path of the output file; the line is appended.</param>
        /// <exception cref="ArgumentNullException"><paramref name="slidePart"/> is null.</exception>
        public static void GetSlide(SlidePart slidePart, string store)
        {
            if (slidePart == null)
            {
                throw new ArgumentNullException("slidePart");
            }

            if (slidePart.Slide == null)
            {
                return;
            }

            string title = GetTitleText(slidePart);
            if (title.Length == 0)
            {
                return;
            }

            List<string> texts = GetBodyTexts(slidePart, title);
            if (texts.Count == 0)
            {
                return;
            }

            // using guarantees the writer is flushed and closed even if a write throws.
            using (StreamWriter file = new StreamWriter(store, true))
            {
                file.Write("{\"Title\":\"" + EscapeJson(title) + "\",");
                file.Write("\"Content\":\"");
                file.Write(string.Join(",", texts.Select(t => EscapeJson(t))));
                file.WriteLine("\"}");
            }
        }

        /// <summary>
        /// Returns the text of all title-placeholder shapes on the slide, with the
        /// paragraphs joined by a single space.
        /// </summary>
        private static string GetTitleText(SlidePart slidePart)
        {
            StringBuilder titleText = new StringBuilder();

            // Null until the first paragraph has been appended, then a single space.
            string titleSeparator = null;

            foreach (var shape in slidePart.Slide.Descendants<Shape>().Where(IsTitleShape))
            {
                // A title placeholder without a text body has nothing to contribute.
                if (shape.TextBody == null)
                {
                    continue;
                }

                foreach (var paragraph in shape.TextBody.Descendants<D.Paragraph>())
                {
                    titleText.Append(titleSeparator);
                    foreach (var text in paragraph.Descendants<D.Text>())
                    {
                        titleText.Append(text.Text);
                    }

                    titleSeparator = " ";
                }
            }

            return titleText.ToString();
        }

        /// <summary>
        /// Returns the text of every paragraph on the slide, skipping empty paragraphs and
        /// paragraphs whose text equals <paramref name="title"/>.
        /// </summary>
        private static List<string> GetBodyTexts(SlidePart slidePart, string title)
        {
            List<string> texts = new List<string>();
            foreach (var paragraph in slidePart.Slide.Descendants<D.Paragraph>())
            {
                StringBuilder allText = new StringBuilder();
                foreach (var text in paragraph.Descendants<D.Text>())
                {
                    allText.Append(text.Text);
                }

                string paragraphText = allText.ToString();
                if (paragraphText.Length > 0 && paragraphText != title)
                {
                    texts.Add(paragraphText);
                }
            }

            return texts;
        }

        /// <summary>
        /// Escapes a string for use inside a double-quoted JSON string literal, so that
        /// quotes, backslashes and control characters in slide text cannot break the output line.
        /// </summary>
        private static string EscapeJson(string value)
        {
            StringBuilder escaped = new StringBuilder(value.Length);
            foreach (char c in value)
            {
                switch (c)
                {
                    case '"':
                        escaped.Append("\\\"");
                        break;
                    case '\\':
                        escaped.Append("\\\\");
                        break;
                    case '\n':
                        escaped.Append("\\n");
                        break;
                    case '\r':
                        escaped.Append("\\r");
                        break;
                    case '\t':
                        escaped.Append("\\t");
                        break;
                    default:
                        if (c < ' ')
                        {
                            // Remaining control characters must use the \uXXXX form.
                            escaped.Append("\\u");
                            escaped.Append(((int)c).ToString("x4", System.Globalization.CultureInfo.InvariantCulture));
                        }
                        else
                        {
                            escaped.Append(c);
                        }

                        break;
                }
            }

            return escaped.ToString();
        }

        /// <summary>
        /// Determines whether the shape is a title placeholder (Title or CenteredTitle).
        /// </summary>
        private static bool IsTitleShape(Shape shape)
        {
            // Either property may be absent; such a shape is not a title.
            if (shape.NonVisualShapeProperties == null ||
                shape.NonVisualShapeProperties.ApplicationNonVisualDrawingProperties == null)
            {
                return false;
            }

            var placeholderShape = shape.NonVisualShapeProperties.ApplicationNonVisualDrawingProperties.GetFirstChild<PlaceholderShape>();
            if (placeholderShape != null && placeholderShape.Type != null && placeholderShape.Type.HasValue)
            {
                switch ((PlaceholderValues)placeholderShape.Type)
                {
                    // Any title shape.
                    case PlaceholderValues.Title:
                    // A centered title.
                    case PlaceholderValues.CenteredTitle:
                        return true;
                    default:
                        return false;
                }
            }

            return false;
        }

        /// <summary>
        /// Reads presentation names (one per line, without extension) from the list file,
        /// repairs each .pptx and writes its slide texts to a matching .jl file.
        /// </summary>
        static void Main(string[] args)
        {
            string pptlistPath = "【马赛克】";
            using (StreamReader pptlist = new StreamReader(pptlistPath))
            {
                string ppt;
                while ((ppt = pptlist.ReadLine()) != null)
                {
                    // A blank line (e.g. a trailing newline in the list) names no presentation.
                    if (ppt.Trim().Length == 0)
                    {
                        continue;
                    }

                    Console.WriteLine(ppt);
                    string pptname = "【马赛克】" + ppt + ".pptx";
                    string storepath = "【马赛克】" + ppt + ".jl";

                    // Create/truncate the output file, because GetSlide only ever appends.
                    using (new StreamWriter(storepath, false))
                    {
                    }

                    FixPowerpoint(pptname);
                    GetSlideTitles(pptname, storepath);
                }
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Removes the printer-settings part (and the relationships pointing at it) from a
        /// .pptx package so that the Open XML SDK can open the file afterwards.
        /// The file is modified in place.
        /// </summary>
        /// <param name="fileName">Path of the .pptx file to repair.</param>
        private static void FixPowerpoint(string fileName)
        {
            Console.WriteLine(fileName);

            // The using statement closes (and flushes) the package; no explicit Close() is needed.
            using (Package wdPackage = Package.Open(fileName, FileMode.Open, FileAccess.ReadWrite))
            {
                // Uri of the printer settings part.
                var binPartUri = new Uri("/ppt/printerSettings/printerSettings1.bin", UriKind.Relative);
                if (!wdPackage.PartExists(binPartUri))
                {
                    return;
                }

                // Part URIs must be relative. With RelativeOrAbsolute a rooted path can be
                // parsed as an absolute file URI on non-Windows runtimes, which GetPart rejects.
                var presPartUri = new Uri("/ppt/presentation.xml", UriKind.Relative);
                var presPart = wdPackage.GetPart(presPartUri);

                // Materialize the matches first: relationships cannot be deleted while the
                // relationship collection is being enumerated. Deleting every match (instead
                // of SingleOrDefault) avoids an InvalidOperationException when more than one exists.
                var printerRelationships = presPart.GetRelationships()
                    .Where(r => r.RelationshipType.Equals(PrinterSettingsRelationshipType, StringComparison.OrdinalIgnoreCase))
                    .ToList();
                foreach (var relationship in printerRelationships)
                {
                    presPart.DeleteRelationship(relationship.Id);
                }

                // Delete the part itself.
                wdPackage.DeletePart(binPartUri);
            }
        }
    }
}