1 识别nutch-1.7的编码,完成
以前在 1.2 中,编码识别是在 org.apache.nutch.parse.html.HtmlParser 中完成的:
// Gather encoding clues for the content and let the detector choose a charset.
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(content, true);
// Feed in the charset sniffed from the raw bytes as an extra clue tagged "sniffed".
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
// NOTE(review): defaultCharEncoding is presumably the fallback used when no
// clue is conclusive -- confirm against EncodingDetector.guessEncoding.
String encoding = detector.guessEncoding(content,
defaultCharEncoding);
// Store the chosen encoding in metadata under both encoding keys.
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
------在这里将识别出来的编码写进 content 的 metadata 中
// Print the detected encoding, then attach it to the Content's own metadata
// under the "charset" key.
System.out.println("set charset in Content " + encoding);
content.getMetadata().add("charset", encoding);
但是在1.7中却不行了。原因不详
目前的策略是在 org.apache.nutch.protocol.http.api.HttpBase 中进行修改
(位置:src/plugin/lib-http/src/java 下的 org.apache.nutch.protocol.http.api 包)。
做法是将HtmlParser中的代码搬过来。
// Gather encoding clues for the fetched Content `c` and let the detector
// choose a charset.
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(c, true);
// Feed in the charset sniffed from the raw bytes as an extra clue tagged "sniffed".
detector
.addClue(sniffCharacterEncoding(c.getContent()), "sniffed");
// "windows-1252" is hard-coded as the second argument here.
// NOTE(review): presumably the fallback encoding -- confirm against
// EncodingDetector.guessEncoding.
String encoding = detector.guessEncoding(c, "windows-1252");
// Print the result and attach it to the Content's metadata under "charset".
System.out.println("set charset in Content " + encoding);
c.getMetadata().add("charset", encoding);
同时将相关函数字段拷贝过来:
// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
"<meta\s+([^>]*http-equiv=("|')?content-type("|')?[^>]*)>",
Pattern.CASE_INSENSITIVE);
private static Pattern charsetPattern = Pattern.compile(
"charset=\s*([a-z][_\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
private static final int CHUNK_SIZE = 2000;
/**
 * Given a <code>byte[]</code> representing an html file of an
 * <em>unknown</em> encoding, read out the 'charset' parameter in the meta
 * tag from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag
 * for Content-Type or no charset is specified, <code>null</code> is
 * returned. <br />
 * FIXME: non-byte oriented character encodings (UTF-16, UTF-32) can't be
 * handled with this. We need to do something similar to what's done by
 * mozilla
 * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser
 * .cpp#1993). See also http://www.w3.org/TR/REC-xml/#sec-guessing <br />
 *
 * @param content
 *          <code>byte[]</code> representation of an html file; may be
 *          <code>null</code>
 * @return the charset name declared in the meta tag, or <code>null</code>
 *         if <code>content</code> is <code>null</code> or no charset is
 *         declared within the inspected chunk
 */
private static String sniffCharacterEncoding(byte[] content) {
  if (content == null) {
    return null;
  }
  int length = Math.min(content.length, CHUNK_SIZE);

  // Only the ASCII part of the document matters for finding the meta tag.
  // Passing the Charset object (rather than its name) means no checked
  // UnsupportedEncodingException has to be handled; bytes that are not
  // valid ASCII are decoded to the charset's replacement character, which
  // the patterns below never match.
  String str = new String(content, 0, length, Charset.forName("ASCII"));

  Matcher metaMatcher = metaPattern.matcher(str);
  if (!metaMatcher.find()) {
    return null;
  }
  // Look for "charset=..." only inside the attributes of the matched tag.
  Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
  return charsetMatcher.find() ? charsetMatcher.group(1) : null;
}
同时修改 SegmentReader 中的 reduce 方法:
public void reduce(Text key, Iterator<NutchWritable> values,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
StringBuffer dump = new StringBuffer();
dump.append("
Recno:: ").append(recNo++).append("
");
dump.append("URL:: " + key.toString() + "
");
while (values.hasNext()) {
Writable value = values.next().get(); // unwrap
if (value instanceof CrawlDatum) {
dump.append("
CrawlDatum::
").append(((CrawlDatum) value).toString());
} else if (value instanceof Content) {
Content ct = (Content)value;
String charset = ct.getMetadata().get("charset");
dump.append("
Content::
").append(ct.toString(charset));
} else if (value instanceof ParseData) {
dump.append("
ParseData::
").append(((ParseData) value).toString());
} else if (value instanceof ParseText) {
dump.append("
ParseText::
").append(((ParseText) value).toString());
} else if (LOG.isWarnEnabled()) {
LOG.warn("Unrecognized type: " + value.getClass());
}
}
output.collect(key, new Text(dump.toString()));
}