• 字幕文件处理(2)


    摘要

    上一篇文章我们实现了整数与时间格式的互转。常见的字幕文件格式有WebVTT、SRT、TTML:有的系统要求我们提供VTT格式,有的系统只支持TTML格式,而我们做完字幕后拿到的可能是SRT格式,所以涉及到在不同格式的字幕文件之间进行转换。

    本文介绍的示例代码实现了VTT 与SRT互转, 也可以将VTT或SRT转化到TTML。

    同样, 匹配时间格式的正则表达式是: 

    "([0-9]+:)?([0-9]+):([0-9]+)([.|,][0-9]+)? --> ([0-9]+:)?([0-9]+):([0-9]+)([.|,][0-9]+)?"

    字幕格式对象是: 

        /// <summary>
        /// A single subtitle cue: start time, end time and the caption text.
        /// </summary>
        class ClosedCaption
        {
            public string StartPoint { get; set; }
            public string EndPoint { get; set; }
            public string Transcript { get; set; }

            /// <summary>
            /// Renders the cue as a "start --> end" timing line followed by the transcript,
            /// each terminated by a line break.
            /// </summary>
            public override string ToString()
            {
                return new StringBuilder()
                    .AppendFormat("{0} --> {1}", StartPoint, EndPoint)
                    .AppendLine()
                    .AppendLine(Transcript)
                    .ToString();
            }
        }

    从文件中读取字幕格式对象: 

            /// <summary>
            /// Reads a WebVTT or SRT subtitle file and rebuilds <c>Captions</c> from the cues found in it.
            /// </summary>
            /// <remarks>
            /// Every timing line ("start --> end") opens a cue. The text between that timing line and
            /// the next one (or the end of the file) becomes the cue's transcript, after removing the
            /// numeric index that precedes the following cue.
            /// Known limitation: a caption whose last line consists only of digits cannot be told apart
            /// from a cue index and loses that line.
            /// </remarks>
            /// <param name="filePath">Path of the subtitle file to parse.</param>
            public static void ReadTranscript(string filePath)
            {
                //0:0:4.480 --> 0:0:7.430
                string timePattern = @"([0-9]+:)?([0-9]+):([0-9]+)([.|,][0-9]+)? --> ([0-9]+:)?([0-9]+):([0-9]+)([.|,][0-9]+)?";

                string fileContent;
                // Parsing only needs read access; asking for more would fail on read-only files.
                using (var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
                using (var reader = new StreamReader(stream))
                {
                    fileContent = reader.ReadToEnd();
                }

                var cues = Regex.Matches(fileContent, timePattern, RegexOptions.IgnoreCase);
                Captions = new List<ClosedCaption>();
                for (int i = 0; i < cues.Count; i++)
                {
                    Match cue = cues[i];
                    string[] timeInfo = cue.Value.Split(new string[] { "-->" }, StringSplitOptions.RemoveEmptyEntries);
                    if (timeInfo.Length != 2)
                    {
                        continue;
                    }

                    // Round-trip through TimeFormat so both endpoints are stored in the "t1" layout.
                    // NOTE(review): the exact "t1" layout is defined by TimeFormat, outside this snippet.
                    string startInfo = TimeFormat.ToHHMMSS(TimeFormat.ToDouble(timeInfo[0].Trim()), "t1");
                    string endInfo = TimeFormat.ToHHMMSS(TimeFormat.ToDouble(timeInfo[1].Trim()), "t1");

                    // Slice the transcript by match position instead of splitting on a sentinel, so the
                    // cue/text pairing holds even if the file starts with a timing line or the caption
                    // text itself contains "-->".
                    int textStart = cue.Index + cue.Length;
                    int textEnd = (i + 1 < cues.Count) ? cues[i + 1].Index : fileContent.Length;
                    string rawTranscript = fileContent.Substring(textStart, textEnd - textStart);

                    string firstTrim = rawTranscript.Trim(new char[] { '\r', '\n' });
                    string secondTrim = RemoveTrailingCueNumber(firstTrim);

                    Captions.Add(new ClosedCaption
                    {
                        StartPoint = startInfo,
                        EndPoint = endInfo,
                        Transcript = secondTrim.Trim(new char[] { '\r', '\n' }).Trim()
                    });
                }
            }

            /// <summary>
            /// Removes a trailing run of ASCII digits when it sits on a line of its own, which is how
            /// the index of the next cue appears at the end of the text sliced for the current cue.
            /// Digits that end a line of caption text (for example "Year 2020") are left untouched.
            /// </summary>
            private static string RemoveTrailingCueNumber(string text)
            {
                int digitStart = text.Length;
                while (digitStart > 0 && text[digitStart - 1] >= '0' && text[digitStart - 1] <= '9')
                {
                    digitStart--;
                }

                bool hasDigits = digitStart < text.Length;
                bool onOwnLine = digitStart > 0
                    && (text[digitStart - 1] == '\n' || text[digitStart - 1] == '\r');

                return (hasDigits && onOwnLine) ? text.Substring(0, digitStart) : text;
            }


    由字幕对象生成VTT, SRT, 和TTML:

            /// <summary>
            /// Writes the current <c>Captions</c> to <paramref name="vtt"/> as a WebVTT file.
            /// Does nothing when there are no captions.
            /// </summary>
            /// <param name="vtt">Path of the output file; an existing file is overwritten.</param>
            public static void Write2VTT(string vtt)
            {
                if (Captions.Count <= 0)
                {
                    return;
                }

                StringBuilder output = new StringBuilder();
                output.AppendLine("WEBVTT");
                output.AppendLine();
                for (int index = 0; index < Captions.Count; index++)
                {
                    // The cue text already ends with a line break; the extra one yields the blank
                    // line that separates cues.
                    output.Append(Captions[index].ToString());
                    output.AppendLine();
                }

                // Disposing the writer flushes and closes the file.
                using (StreamWriter vttWriter = new StreamWriter(vtt, false))
                {
                    vttWriter.Write(output.ToString());
                }
            }
    
            /// <summary>
            /// Writes the current <c>Captions</c> to <paramref name="srt"/> as an SRT file, numbering
            /// the cues from 1. Does nothing when there are no captions.
            /// </summary>
            /// <param name="srt">Path of the output file; an existing file is overwritten.</param>
            public static void Write2SRT(string srt)
            {
                if (Captions.Count <= 0)
                {
                    return;
                }

                StringBuilder output = new StringBuilder();
                int sequence = 0;
                foreach (var caption in Captions)
                {
                    sequence++;
                    output.AppendLine(sequence.ToString());
                    // The cue text already ends with a line break; the extra one yields the blank
                    // line that separates cues.
                    output.Append(caption.ToString());
                    output.AppendLine();
                }

                // Disposing the writer flushes and closes the file.
                using (StreamWriter srtWriter = new StreamWriter(srt))
                {
                    srtWriter.Write(output.ToString());
                }
            }
    
            /// <summary>
            /// Writes the current <c>Captions</c> to <paramref name="ttml"/> as a TTML document.
            /// The cues are rendered as <c>&lt;p&gt;</c> elements inside a <c>&lt;div&gt;</c> and
            /// inserted into the template read from "ttSample1.txt".
            /// Nothing is written when there are no captions.
            /// </summary>
            /// <param name="ttml">Path of the output file; an existing file is overwritten.</param>
            public static void Write2TTML(string ttml)
            {
                StringBuilder sbContent = new StringBuilder();
                string template;
                // NOTE(review): the template is used as a string.Format pattern below, so it is
                // assumed to contain a {0} placeholder and to double any literal braces - confirm
                // against ttSample1.txt.
                using (StreamReader sr = new StreamReader("ttSample1.txt"))
                {
                    template = sr.ReadToEnd();
                }

                if (Captions.Count > 0)
                {
                    sbContent.AppendLine("<div region=\"subtitleArea\">");
                    for (int i = 0; i < Captions.Count; i++)
                    {
                        double beginTime = TimeFormat.ToDouble(Captions[i].StartPoint);
                        double endTime = TimeFormat.ToDouble(Captions[i].EndPoint);

                        string begin = TimeFormat.ToHHMMSS(beginTime, "t1");
                        string end = TimeFormat.ToHHMMSS(endTime, "t1");
                        // Encode the caption text so characters such as '<' and '&' cannot break the markup.
                        string content = HttpUtility.HtmlEncode(Captions[i].Transcript);
                        sbContent.AppendLine(string.Format("<p begin=\"{1}\" id=\"{0}\" end=\"{2}\">{3}</p>", "p" + i, begin, end, content));
                    }
                    sbContent.AppendLine(@"</div>");

                    string document = string.Format(template, sbContent.ToString());

                    // Disposing the writer flushes and closes the file.
                    using (StreamWriter writer = new StreamWriter(ttml))
                    {
                        writer.Write(document);
                    }
                }
            }

    转换实例:CCConverter in GitHub

    转载请注明出处http://www.cnblogs.com/qixue/p/5498396.html

  • 相关阅读:
    python_控制台输出带颜色的文字方法
    模拟数据库作业
    js笔记
    CSS 笔记
    html 笔记
    必备技能-Git 使用规范流程
    python 闭包
    30个python编程技巧!
    python 面向对象
    python 线程
  • 原文地址:https://www.cnblogs.com/qixue/p/5498396.html
Copyright © 2020-2023  润新知