由于工作的需要,最近一直在研究Lucene.Net。在测试中我发现,当索引库达到5GB左右的时候,搜索速度会变得奇慢。在网上查找了一些资料,说分词器会影响搜索速度,但又苦于没有好的免费分词器,于是只好改写Java版的CJKAnalyzer,在这里把它共享给大家。虽然我很早就申请了这个Blog,但是一直没有写什么东西,这篇文章也算是我的处女作,希望今后能够和大家多多交流。
1
2/**
3 * Copyright 2004-2005 The Apache Software Foundation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17using System;
18using System.Collections;
19using System.IO;
20
21using Lucene.Net.Analysis;
22
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
{
    /// <summary>
    /// Analyzer that tokenizes its input with <see cref="CJKTokenizer"/> and
    /// then removes stop words with a <see cref="StopFilter"/>.
    /// </summary>
    /// <remarks>Original author: Che, Dong.</remarks>
    public class CJKAnalyzer : Analyzer
    {
        /// <summary>
        /// Default stop words: common English words that are rarely useful for
        /// searching, plus the empty string and "www".
        /// </summary>
        public static string[] STOP_WORDS =
        {
            "a", "and", "are", "as", "at", "be",
            "but", "by", "for", "if", "in",
            "into", "is", "it", "no", "not",
            "of", "on", "or", "s", "such", "t",
            "that", "the", "their", "then",
            "there", "these", "they", "this",
            "to", "was", "will", "with", "",
            "www"
        };

        /// <summary>
        /// Stop set produced by <c>StopFilter.MakeStopSet</c>; handed to every
        /// <see cref="StopFilter"/> this analyzer creates.
        /// </summary>
        private Hashtable stopTable;

        /// <summary>
        /// Creates an analyzer that removes the words in <see cref="STOP_WORDS"/>.
        /// </summary>
        public CJKAnalyzer() : this(STOP_WORDS)
        {
        }

        /// <summary>
        /// Creates an analyzer that removes the words in the supplied array.
        /// </summary>
        /// <param name="stopWords">Stop word array.</param>
        public CJKAnalyzer(string[] stopWords)
        {
            stopTable = StopFilter.MakeStopSet(stopWords);
        }

        /// <summary>
        /// Builds the token stream for the given input: a
        /// <see cref="CJKTokenizer"/> over <paramref name="reader"/>, wrapped in a
        /// <see cref="StopFilter"/> that uses this analyzer's stop set.
        /// </summary>
        /// <param name="fieldName">Lucene field name (not used by this method).</param>
        /// <param name="reader">Input reader.</param>
        /// <returns>The stop-filtered token stream.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            return new StopFilter(new CJKTokenizer(reader), stopTable);
        }
    }
}
2/**
3 * Copyright 2004-2005 The Apache Software Foundation
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17using System;
18using System.Collections;
19using System.IO;
20
21using Lucene.Net.Analysis;
22
namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
{
    /// <summary>
    /// Analyzer that tokenizes its input with <see cref="CJKTokenizer"/> and
    /// then removes stop words with a <see cref="StopFilter"/>.
    /// </summary>
    /// <remarks>Original author: Che, Dong.</remarks>
    public class CJKAnalyzer : Analyzer
    {
        /// <summary>
        /// Default stop words: common English words that are rarely useful for
        /// searching, plus the empty string and "www".
        /// </summary>
        public static string[] STOP_WORDS =
        {
            "a", "and", "are", "as", "at", "be",
            "but", "by", "for", "if", "in",
            "into", "is", "it", "no", "not",
            "of", "on", "or", "s", "such", "t",
            "that", "the", "their", "then",
            "there", "these", "they", "this",
            "to", "was", "will", "with", "",
            "www"
        };

        /// <summary>
        /// Stop set produced by <c>StopFilter.MakeStopSet</c>; handed to every
        /// <see cref="StopFilter"/> this analyzer creates.
        /// </summary>
        private Hashtable stopTable;

        /// <summary>
        /// Creates an analyzer that removes the words in <see cref="STOP_WORDS"/>.
        /// </summary>
        public CJKAnalyzer() : this(STOP_WORDS)
        {
        }

        /// <summary>
        /// Creates an analyzer that removes the words in the supplied array.
        /// </summary>
        /// <param name="stopWords">Stop word array.</param>
        public CJKAnalyzer(string[] stopWords)
        {
            stopTable = StopFilter.MakeStopSet(stopWords);
        }

        /// <summary>
        /// Builds the token stream for the given input: a
        /// <see cref="CJKTokenizer"/> over <paramref name="reader"/>, wrapped in a
        /// <see cref="StopFilter"/> that uses this analyzer's stop set.
        /// </summary>
        /// <param name="fieldName">Lucene field name (not used by this method).</param>
        /// <param name="reader">Input reader.</param>
        /// <returns>The stop-filtered token stream.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            return new StopFilter(new CJKTokenizer(reader), stopTable);
        }
    }
}
1
2
3/**
4 * Copyright 2004-2005 The Apache Software Foundation
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19using System;
20using System.Collections;
21using System.IO;
22
23using Lucene.Net.Analysis;
24
25/**
26 * CJKTokenizer was modified from StopTokenizer which does a decent job for
27 * most European languages. It tokenizes double-byte characters differently:
28 * a token is returned for every two characters, with overlapping matches.<br>
29 * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4". It
30 * also needs the zero-length token "" to be filtered out.<br>
31 * For digits: digit, '+', '#' are tokenized as letters.<br>
32 * For more info on Asian language (Chinese, Japanese, Korean) text segmentation,
33 * please search <a
34 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
35 *
36 * @author Che, Dong
37 */
38namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
39{
40 public class CJKTokenizer:Tokenizer
41 {
42 //~ Static fields/initializers ---------------------------------------------
43
44 /** Max word length; used as the size of the token buffer */
45 private static int MAX_WORD_LEN = 255;
46
47 /** I/O buffer size; the number of chars requested from the reader per read */
48 private static int IO_BUFFER_SIZE = 256;
49
50 //~ Instance fields --------------------------------------------------------
51
52 /** word offset: advanced once per loop iteration in Next(); its value on entry is the start of the returned Token */
53 private int offset = 0;
54
55 /** index of the next unread character in ioBuffer */
56 private int bufferIndex = 0;
57
58 /** number of characters the last read placed in ioBuffer; 0 is treated as end of input */
59 private int dataLen = 0;
60
61 /**
62 * character buffer, stores the characters which are used to compose <br>
63 * the returned Token
64 */
65 private char[] buffer = new char[MAX_WORD_LEN];
66
67 /**
68 * I/O buffer, used to store the content read from the input (one of the <br>
69 * members of Tokenizer)
70 */
71 private char[] ioBuffer = new char[IO_BUFFER_SIZE];
72
73 /** word type: single=>ASCII double=>non-ASCII word=>default */
74 private string tokenType = "word";
75
76 /**
77 * tag: previous character is a cached double-byte character "C1C2C3C4"
78 * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
79 * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
80 */
81 private bool preIsTokened = false;
82
83 //~ Constructors -----------------------------------------------------------
84
85 /**
86 * Construct a token stream processing the given input.
87 *
88 * @param reader I/O reader; stored in the input field (not declared in this class, presumably inherited from Tokenizer)
89 */
90 public CJKTokenizer(TextReader reader)
91 {
92 input = reader;
93 }
94
95 //~ Methods ----------------------------------------------------------------
96
97 /**
98 * Returns the next token in the stream, or null at EOS.
99 * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
100 * for detail.
101 *
102 * @return Token
103 *
104 * @throws java.io.IOException - throw IOException when read error <br>
105 * happened in the InputStream
106 *
107 */
// NOTE(review): this listing of Next() is incomplete. The code between the
// lines numbered 178 and 236, and between 240 and 291, is missing, and the
// line numbered 178 appears to be a comment that lost its comment marker.
// Restore the full method body before compiling or modifying it.
108 public override Token Next()
109 {
110 /** how many character(s) have been stored in buffer */
111 int length = 0;
112
113 /** the position used to create Token */
114 int start = offset;
115
116 while (true)
117 {
118 /** current character */
119 char c;
120
121
122 offset++;
123
// Original Java refill logic, kept commented out for reference.
124 /*
125 if (bufferIndex >= dataLen)
126 {
127 dataLen = input.read(ioBuffer); // author's note: in Java, read does not fail at the end of input, but in .NET it does
128 bufferIndex = 0;
129 }
130 */
131
// Refill ioBuffer once it is exhausted, but only if nothing has been read yet
// or the previous read filled the whole buffer; a shorter previous read is
// treated as end of input and no further read is attempted.
// NOTE(review): if the reader can legitimately return fewer than
// ioBuffer.Length characters before the real end of input, the remaining
// input would be dropped here - confirm against the TextReader.Read contract.
132 if (bufferIndex >= dataLen )
133 {
134 if (dataLen==0 || dataLen>=ioBuffer.Length)// author's note: in Java, read does not fail at the end of input, but in .NET it does, so this check is here to intercept the exception
135 {
136 dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
137 bufferIndex = 0;
138 }
139 else
140 {
141 dataLen=0;
142 }
143 }
144
// dataLen == 0 means end of input.
145 if (dataLen ==0)
146 {
147 if (length > 0)
148 {
// The buffered characters were already emitted as part of the previous
// token, so length is reset and the Token built after the loop has empty text.
149 if (preIsTokened == true)
150 {
151 length = 0;
152 preIsTokened = false;
153 }
154
155 break;
156 }
157 else
158 {
// nothing buffered: end of stream
159 return null;
160 }
161 }
162 else
163 {
164 //get current character
165 c = ioBuffer[bufferIndex++];
166 }
167
168 //if the current character is ASCII or Extended ASCII
169 if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
170 {
171 if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
172 {
173 /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
// 65248 == 0xFEE0, e.g. 0xFF21 - 0xFEE0 == 0x41.
// NOTE(review): the subtraction is applied to every character in
// 0xFF00..0xFFEF - confirm that all of them are meant to be shifted.
174 int i = (int) c;
175 i = i - 65248;
176 c = (char) i;
177 }
178 if the current character is a letter or "_" "+" "#
236
237 }
238 else
239 {
240 // non-ASCII letter, eg."C1C2C3C4"
291 }
292 }
293
294 return new Token(new String(buffer, 0, length), start, start + length,
295 tokenType
296 );
297 }
298
// Returns true when c is below 256. Despite the name, this accepts more than
// 7-bit ASCII. The c>=0 comparison is always true because char is unsigned.
299 public bool IsAscii(char c)
300 {
301 return c<256 && c>=0;
302 }
303
// Returns true when c lies in the inclusive range 0xFF00..0xFFEF.
304 public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
305 {
306 return c<=0xFFEF && c>=0xFF00;
307 }
308 }
309}
2
3/**
4 * Copyright 2004-2005 The Apache Software Foundation
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19using System;
20using System.Collections;
21using System.IO;
22
23using Lucene.Net.Analysis;
24
25/**
26 * CJKTokenizer was modified from StopTokenizer which does a decent job for
27 * most European languages. It tokenizes double-byte characters differently:
28 * a token is returned for every two characters, with overlapping matches.<br>
29 * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4". It
30 * also needs the zero-length token "" to be filtered out.<br>
31 * For digits: digit, '+', '#' are tokenized as letters.<br>
32 * For more info on Asian language (Chinese, Japanese, Korean) text segmentation,
33 * please search <a
34 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
35 *
36 * @author Che, Dong
37 */
38namespace NSharp.SearchEngine.Lucene.Analysis.Cjk
39{
40 public class CJKTokenizer:Tokenizer
41 {
42 //~ Static fields/initializers ---------------------------------------------
43
44 /** Max word length; used as the size of the token buffer */
45 private static int MAX_WORD_LEN = 255;
46
47 /** I/O buffer size; the number of chars requested from the reader per read */
48 private static int IO_BUFFER_SIZE = 256;
49
50 //~ Instance fields --------------------------------------------------------
51
52 /** word offset: advanced once per loop iteration in Next(); its value on entry is the start of the returned Token */
53 private int offset = 0;
54
55 /** index of the next unread character in ioBuffer */
56 private int bufferIndex = 0;
57
58 /** number of characters the last read placed in ioBuffer; 0 is treated as end of input */
59 private int dataLen = 0;
60
61 /**
62 * character buffer, stores the characters which are used to compose <br>
63 * the returned Token
64 */
65 private char[] buffer = new char[MAX_WORD_LEN];
66
67 /**
68 * I/O buffer, used to store the content read from the input (one of the <br>
69 * members of Tokenizer)
70 */
71 private char[] ioBuffer = new char[IO_BUFFER_SIZE];
72
73 /** word type: single=>ASCII double=>non-ASCII word=>default */
74 private string tokenType = "word";
75
76 /**
77 * tag: previous character is a cached double-byte character "C1C2C3C4"
78 * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
79 * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
80 */
81 private bool preIsTokened = false;
82
83 //~ Constructors -----------------------------------------------------------
84
85 /**
86 * Construct a token stream processing the given input.
87 *
88 * @param reader I/O reader; stored in the input field (not declared in this class, presumably inherited from Tokenizer)
89 */
90 public CJKTokenizer(TextReader reader)
91 {
92 input = reader;
93 }
94
95 //~ Methods ----------------------------------------------------------------
96
97 /**
98 * Returns the next token in the stream, or null at EOS.
99 * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
100 * for detail.
101 *
102 * @return Token
103 *
104 * @throws java.io.IOException - throw IOException when read error <br>
105 * happened in the InputStream
106 *
107 */
// NOTE(review): this listing of Next() is incomplete. The code between the
// lines numbered 178 and 236, and between 240 and 291, is missing, and the
// line numbered 178 appears to be a comment that lost its comment marker.
// Restore the full method body before compiling or modifying it.
108 public override Token Next()
109 {
110 /** how many character(s) have been stored in buffer */
111 int length = 0;
112
113 /** the position used to create Token */
114 int start = offset;
115
116 while (true)
117 {
118 /** current character */
119 char c;
120
121
122 offset++;
123
// Original Java refill logic, kept commented out for reference.
124 /*
125 if (bufferIndex >= dataLen)
126 {
127 dataLen = input.read(ioBuffer); // author's note: in Java, read does not fail at the end of input, but in .NET it does
128 bufferIndex = 0;
129 }
130 */
131
// Refill ioBuffer once it is exhausted, but only if nothing has been read yet
// or the previous read filled the whole buffer; a shorter previous read is
// treated as end of input and no further read is attempted.
// NOTE(review): if the reader can legitimately return fewer than
// ioBuffer.Length characters before the real end of input, the remaining
// input would be dropped here - confirm against the TextReader.Read contract.
132 if (bufferIndex >= dataLen )
133 {
134 if (dataLen==0 || dataLen>=ioBuffer.Length)// author's note: in Java, read does not fail at the end of input, but in .NET it does, so this check is here to intercept the exception
135 {
136 dataLen = input.Read(ioBuffer,0,ioBuffer.Length);
137 bufferIndex = 0;
138 }
139 else
140 {
141 dataLen=0;
142 }
143 }
144
// dataLen == 0 means end of input.
145 if (dataLen ==0)
146 {
147 if (length > 0)
148 {
// The buffered characters were already emitted as part of the previous
// token, so length is reset and the Token built after the loop has empty text.
149 if (preIsTokened == true)
150 {
151 length = 0;
152 preIsTokened = false;
153 }
154
155 break;
156 }
157 else
158 {
// nothing buffered: end of stream
159 return null;
160 }
161 }
162 else
163 {
164 //get current character
165 c = ioBuffer[bufferIndex++];
166 }
167
168 //if the current character is ASCII or Extended ASCII
169 if (IsAscii(c) || IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
170 {
171 if (IsHALFWIDTH_AND_FULLWIDTH_FORMS(c))
172 {
173 /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
// 65248 == 0xFEE0, e.g. 0xFF21 - 0xFEE0 == 0x41.
// NOTE(review): the subtraction is applied to every character in
// 0xFF00..0xFFEF - confirm that all of them are meant to be shifted.
174 int i = (int) c;
175 i = i - 65248;
176 c = (char) i;
177 }
178 if the current character is a letter or "_" "+" "#
236
237 }
238 else
239 {
240 // non-ASCII letter, eg."C1C2C3C4"
291 }
292 }
293
294 return new Token(new String(buffer, 0, length), start, start + length,
295 tokenType
296 );
297 }
298
// Returns true when c is below 256. Despite the name, this accepts more than
// 7-bit ASCII. The c>=0 comparison is always true because char is unsigned.
299 public bool IsAscii(char c)
300 {
301 return c<256 && c>=0;
302 }
303
// Returns true when c lies in the inclusive range 0xFF00..0xFFEF.
304 public bool IsHALFWIDTH_AND_FULLWIDTH_FORMS(char c)
305 {
306 return c<=0xFFEF && c>=0xFF00;
307 }
308 }
309}