marc文件很复杂,非专业人士很难看懂,我这里就根据它的基本的结构把每一个字段列出来。
每条记录的结构如下:
记录头标 | 地址目次区 | 数据字段区 | 记录分隔符 |
一、记录头标长度和结构
共24位,各部分的长度及含义如下:
5 | 1 | 4 | 1 | 1 | 5 | 3 | 4 |
↑ | ↑ | ↑ | ↑ | ↑ | ↑ | ↑ | ↑ |
记录长度 | 记录状态 | 执行代码 | 指示符长 | 子字段标识符长 | 数据起始地址 | 记录附加定义 | 地址目次区款目结构 |
二、目次区
没有定长,款目数没有规定,每个款目长度为12,款目结构如下:
3 | 4 | 5 |
↑ | ↑ | ↑ |
字段号 | 字段长度 | 起始字符位置 |
目次区即是字段定义区域,知道头标区和目次区就可以读出全部内容了。其中还要注意:记录的分隔符通常写作%,但在iso文件中的表示有所不同,对应关系如下:
$--------chr(31)
@--------chr(30)
%--------chr(29)
另外还要注意文件的编码都是GB2312
现在开始写程序:
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
/// <summary>
/// Code-behind for a page that accepts an uploaded MARC (.iso) file and prints, for every
/// record, the record length and data start address taken from the 24-character header,
/// followed by each directory entry's field tag and the field's decoded value.
/// </summary>
public partial class marcLook : PageBase
{
    // Name of the code page used to decode header, directory and field bytes.
    private const string CODING = "GB2312";

    // 24-character record header: 5-digit record length, 1 status char, 4 exec-code chars,
    // 1 digit, 1 digit, 5-digit data start address, 3 chars, 4 chars.
    private const string HEADER_REG = @"(?<recordLen>\d{5})(?<status>[\w| ]{1})(?<execCode>[\w| ]{4})(?<pCode>\d{1})(\d{1})(?<start>\d{5})([\w| ]{3})(?<addSchema>[\w| ]{4})";

    // 12-character directory entry: 3-char field tag, 4-digit field length, 5-digit start offset.
    private const string ADD_REG = @"(?<name>\w{3})(?<len>\d{4})(?<start>\d{5})";

    // Size in bytes of the record header and of one directory entry.
    private const int HEADER_LENGTH = 24;
    private const int ENTRY_LENGTH = 12;

    // Byte that ends every record (chr(29)).
    private const byte RECORD_TERMINATOR = 0x1D;

    // Built once instead of once per record.
    private static readonly Regex HeaderExp = new Regex(HEADER_REG);
    private static readonly Regex EntryExp = new Regex(ADD_REG);

    /// <summary>Page load handler; nothing is initialised on the first request.</summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
        }
    }

    /// <summary>
    /// Validates the uploaded file (non-empty, ".iso" extension), parses it and writes the
    /// formatted listing into <c>txtMarc</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void btnLoad_Click(object sender, EventArgs e)
    {
        if (File1.PostedFile.ContentLength == 0)
        {
            Alert("文件没有任何内容");
            return;
        }

        // Ordinal, case-insensitive comparison: not culture dependent and safe when
        // GetExtension returns null.
        string extension = Path.GetExtension(File1.PostedFile.FileName);
        if (!string.Equals(extension, ".iso", StringComparison.OrdinalIgnoreCase))
        {
            Alert("文件类型必须是.iso格式");
            return;
        }

        byte[] data = ReadAllBytes(File1.PostedFile.InputStream);
        this.txtMarc.Text = FormatRecords(data);
    }

    /// <summary>Reads the remainder of <paramref name="input"/> into a byte array.</summary>
    /// <param name="input">Stream to drain; it is not closed by this method.</param>
    /// <returns>All bytes read from the stream.</returns>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Splits <paramref name="data"/> into records at <see cref="RECORD_TERMINATOR"/> and
    /// formats each one. Parsing stops at the first fragment that is shorter than a header
    /// or whose header does not match <see cref="HEADER_REG"/>.
    /// </summary>
    /// <param name="data">Raw bytes of the uploaded file.</param>
    /// <returns>The text listing of all records parsed.</returns>
    private static string FormatRecords(byte[] data)
    {
        // The header and directory offsets are byte offsets, so the records are sliced from
        // the raw bytes and only the individual pieces are decoded.
        Encoding encoding = Encoding.GetEncoding(CODING);
        StringBuilder output = new StringBuilder();
        int count = 0;
        int recordStart = 0;

        while (recordStart < data.Length)
        {
            int recordEnd = Array.IndexOf<byte>(data, RECORD_TERMINATOR, recordStart);
            if (recordEnd < 0)
            {
                recordEnd = data.Length;
            }

            int recordLength = recordEnd - recordStart;
            if (recordLength < HEADER_LENGTH)
            {
                // Empty or truncated fragment (e.g. trailing bytes after the last record).
                break;
            }

            Match header = HeaderExp.Match(encoding.GetString(data, recordStart, HEADER_LENGTH));
            if (!header.Success)
            {
                break;
            }

            output.AppendFormat("\n记录{0}\t", ++count);
            output.Append("长度:");
            output.Append(header.Groups["recordLen"].Value);
            output.Append("\t开始位置:");
            output.Append(header.Groups["start"].Value);
            output.Append("\n--------------------------------------------------------------------\n");

            int dataStart = int.Parse(header.Groups["start"].Value);
            AppendFields(output, data, recordStart, recordLength, dataStart, encoding);

            recordStart = recordEnd + 1;
        }

        return output.ToString();
    }

    /// <summary>
    /// Walks the directory that sits between the header and <paramref name="dataStart"/> and
    /// appends "tag:value" lines for each entry. Stops at the first entry that does not match
    /// <see cref="ADD_REG"/>. A record whose data start address lies outside the record
    /// contributes no field lines.
    /// </summary>
    /// <param name="output">Builder receiving the formatted lines.</param>
    /// <param name="data">Raw bytes of the whole file.</param>
    /// <param name="recordStart">Index in <paramref name="data"/> of the record's first byte.</param>
    /// <param name="recordLength">Record length in bytes, excluding the record terminator.</param>
    /// <param name="dataStart">Data start address read from the header, relative to the record.</param>
    /// <param name="encoding">Encoding used to decode directory entries and field values.</param>
    private static void AppendFields(StringBuilder output, byte[] data, int recordStart,
        int recordLength, int dataStart, Encoding encoding)
    {
        if (dataStart < HEADER_LENGTH || dataStart > recordLength)
        {
            return;
        }

        int recordEnd = recordStart + recordLength;
        int entryCount = (dataStart - HEADER_LENGTH) / ENTRY_LENGTH;

        for (int i = 0; i < entryCount; i++)
        {
            int entryOffset = recordStart + HEADER_LENGTH + i * ENTRY_LENGTH;
            Match entry = EntryExp.Match(encoding.GetString(data, entryOffset, ENTRY_LENGTH));
            if (!entry.Success)
            {
                break;
            }

            int fieldLength = int.Parse(entry.Groups["len"].Value);
            int fieldOffset = recordStart + dataStart + int.Parse(entry.Groups["start"].Value);

            // Clamp to the bytes that actually belong to this record so a bad directory
            // entry cannot read past the record or throw.
            int available = Math.Max(0, recordEnd - fieldOffset);
            int byteCount = Math.Min(fieldLength, available);
            string val = byteCount > 0 ? encoding.GetString(data, fieldOffset, byteCount) : "";

            output.AppendFormat("{0}:", entry.Groups["name"].Value);
            output.AppendFormat("{0}\n", val);
        }
    }
}
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
/// <summary>
/// Code-behind for a page that accepts an uploaded MARC (.iso) file and prints, for every
/// record, the record length and data start address taken from the 24-character header,
/// followed by each directory entry's field tag and the field's decoded value.
/// </summary>
public partial class marcLook : PageBase
{
    // Name of the code page used to decode header, directory and field bytes.
    private const string CODING = "GB2312";

    // 24-character record header: 5-digit record length, 1 status char, 4 exec-code chars,
    // 1 digit, 1 digit, 5-digit data start address, 3 chars, 4 chars.
    private const string HEADER_REG = @"(?<recordLen>\d{5})(?<status>[\w| ]{1})(?<execCode>[\w| ]{4})(?<pCode>\d{1})(\d{1})(?<start>\d{5})([\w| ]{3})(?<addSchema>[\w| ]{4})";

    // 12-character directory entry: 3-char field tag, 4-digit field length, 5-digit start offset.
    private const string ADD_REG = @"(?<name>\w{3})(?<len>\d{4})(?<start>\d{5})";

    // Size in bytes of the record header and of one directory entry.
    private const int HEADER_LENGTH = 24;
    private const int ENTRY_LENGTH = 12;

    // Byte that ends every record (chr(29)).
    private const byte RECORD_TERMINATOR = 0x1D;

    // Built once instead of once per record.
    private static readonly Regex HeaderExp = new Regex(HEADER_REG);
    private static readonly Regex EntryExp = new Regex(ADD_REG);

    /// <summary>Page load handler; nothing is initialised on the first request.</summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
        }
    }

    /// <summary>
    /// Validates the uploaded file (non-empty, ".iso" extension), parses it and writes the
    /// formatted listing into <c>txtMarc</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void btnLoad_Click(object sender, EventArgs e)
    {
        if (File1.PostedFile.ContentLength == 0)
        {
            Alert("文件没有任何内容");
            return;
        }

        // Ordinal, case-insensitive comparison: not culture dependent and safe when
        // GetExtension returns null.
        string extension = Path.GetExtension(File1.PostedFile.FileName);
        if (!string.Equals(extension, ".iso", StringComparison.OrdinalIgnoreCase))
        {
            Alert("文件类型必须是.iso格式");
            return;
        }

        byte[] data = ReadAllBytes(File1.PostedFile.InputStream);
        this.txtMarc.Text = FormatRecords(data);
    }

    /// <summary>Reads the remainder of <paramref name="input"/> into a byte array.</summary>
    /// <param name="input">Stream to drain; it is not closed by this method.</param>
    /// <returns>All bytes read from the stream.</returns>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Splits <paramref name="data"/> into records at <see cref="RECORD_TERMINATOR"/> and
    /// formats each one. Parsing stops at the first fragment that is shorter than a header
    /// or whose header does not match <see cref="HEADER_REG"/>.
    /// </summary>
    /// <param name="data">Raw bytes of the uploaded file.</param>
    /// <returns>The text listing of all records parsed.</returns>
    private static string FormatRecords(byte[] data)
    {
        // The header and directory offsets are byte offsets, so the records are sliced from
        // the raw bytes and only the individual pieces are decoded.
        Encoding encoding = Encoding.GetEncoding(CODING);
        StringBuilder output = new StringBuilder();
        int count = 0;
        int recordStart = 0;

        while (recordStart < data.Length)
        {
            int recordEnd = Array.IndexOf<byte>(data, RECORD_TERMINATOR, recordStart);
            if (recordEnd < 0)
            {
                recordEnd = data.Length;
            }

            int recordLength = recordEnd - recordStart;
            if (recordLength < HEADER_LENGTH)
            {
                // Empty or truncated fragment (e.g. trailing bytes after the last record).
                break;
            }

            Match header = HeaderExp.Match(encoding.GetString(data, recordStart, HEADER_LENGTH));
            if (!header.Success)
            {
                break;
            }

            output.AppendFormat("\n记录{0}\t", ++count);
            output.Append("长度:");
            output.Append(header.Groups["recordLen"].Value);
            output.Append("\t开始位置:");
            output.Append(header.Groups["start"].Value);
            output.Append("\n--------------------------------------------------------------------\n");

            int dataStart = int.Parse(header.Groups["start"].Value);
            AppendFields(output, data, recordStart, recordLength, dataStart, encoding);

            recordStart = recordEnd + 1;
        }

        return output.ToString();
    }

    /// <summary>
    /// Walks the directory that sits between the header and <paramref name="dataStart"/> and
    /// appends "tag:value" lines for each entry. Stops at the first entry that does not match
    /// <see cref="ADD_REG"/>. A record whose data start address lies outside the record
    /// contributes no field lines.
    /// </summary>
    /// <param name="output">Builder receiving the formatted lines.</param>
    /// <param name="data">Raw bytes of the whole file.</param>
    /// <param name="recordStart">Index in <paramref name="data"/> of the record's first byte.</param>
    /// <param name="recordLength">Record length in bytes, excluding the record terminator.</param>
    /// <param name="dataStart">Data start address read from the header, relative to the record.</param>
    /// <param name="encoding">Encoding used to decode directory entries and field values.</param>
    private static void AppendFields(StringBuilder output, byte[] data, int recordStart,
        int recordLength, int dataStart, Encoding encoding)
    {
        if (dataStart < HEADER_LENGTH || dataStart > recordLength)
        {
            return;
        }

        int recordEnd = recordStart + recordLength;
        int entryCount = (dataStart - HEADER_LENGTH) / ENTRY_LENGTH;

        for (int i = 0; i < entryCount; i++)
        {
            int entryOffset = recordStart + HEADER_LENGTH + i * ENTRY_LENGTH;
            Match entry = EntryExp.Match(encoding.GetString(data, entryOffset, ENTRY_LENGTH));
            if (!entry.Success)
            {
                break;
            }

            int fieldLength = int.Parse(entry.Groups["len"].Value);
            int fieldOffset = recordStart + dataStart + int.Parse(entry.Groups["start"].Value);

            // Clamp to the bytes that actually belong to this record so a bad directory
            // entry cannot read past the record or throw.
            int available = Math.Max(0, recordEnd - fieldOffset);
            int byteCount = Math.Min(fieldLength, available);
            string val = byteCount > 0 ? encoding.GetString(data, fieldOffset, byteCount) : "";

            output.AppendFormat("{0}:", entry.Groups["name"].Value);
            output.AppendFormat("{0}\n", val);
        }
    }
}