最近参与了一个小项目(project),需要编写一个针对英文单词的词干提取(stemming)算法。
1. 最为常见的词干提取算法就是 The English (Porter2) stemming algorithm:http://snowball.tartarus.org/algorithms/english/stemmer.html
// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.Among; /** * This class was automatically generated by a Snowball to Java compiler * It implements the stemming algorithm defined by a snowball script. */ public class englishStemmer extends org.tartarus.snowball.SnowballStemmer { private static final long serialVersionUID = 1L; private final static englishStemmer methodObject = new englishStemmer (); private final static Among a_0[] = { new Among ( "arsen", -1, -1, "", methodObject ), new Among ( "commun", -1, -1, "", methodObject ), new Among ( "gener", -1, -1, "", methodObject ) }; private final static Among a_1[] = { new Among ( "'", -1, 1, "", methodObject ), new Among ( "'s'", 0, 1, "", methodObject ), new Among ( "'s", -1, 1, "", methodObject ) }; private final static Among a_2[] = { new Among ( "ied", -1, 2, "", methodObject ), new Among ( "s", -1, 3, "", methodObject ), new Among ( "ies", 1, 2, "", methodObject ), new Among ( "sses", 1, 1, "", methodObject ), new Among ( "ss", 1, -1, "", methodObject ), new Among ( "us", 1, -1, "", methodObject ) }; private final static Among a_3[] = { new Among ( "", -1, 3, "", methodObject ), new Among ( "bb", 0, 2, "", methodObject ), new Among ( "dd", 0, 2, "", methodObject ), new Among ( "ff", 0, 2, "", methodObject ), new Among ( "gg", 0, 2, "", methodObject ), new Among ( "bl", 0, 1, "", methodObject ), new Among ( "mm", 0, 2, "", methodObject ), new Among ( "nn", 0, 2, "", methodObject ), new Among ( "pp", 0, 2, "", methodObject ), new Among ( "rr", 0, 2, "", methodObject ), new Among ( "at", 0, 1, "", methodObject ), new Among ( "tt", 0, 2, "", methodObject ), new Among ( "iz", 0, 1, "", methodObject ) }; private final static Among a_4[] = { new Among ( "ed", -1, 2, "", methodObject ), new Among ( "eed", 0, 1, "", methodObject ), new Among ( "ing", -1, 2, "", methodObject ), new Among ( "edly", -1, 2, "", methodObject ), new 
Among ( "eedly", 3, 1, "", methodObject ), new Among ( "ingly", -1, 2, "", methodObject ) }; private final static Among a_5[] = { new Among ( "anci", -1, 3, "", methodObject ), new Among ( "enci", -1, 2, "", methodObject ), new Among ( "ogi", -1, 13, "", methodObject ), new Among ( "li", -1, 16, "", methodObject ), new Among ( "bli", 3, 12, "", methodObject ), new Among ( "abli", 4, 4, "", methodObject ), new Among ( "alli", 3, 8, "", methodObject ), new Among ( "fulli", 3, 14, "", methodObject ), new Among ( "lessli", 3, 15, "", methodObject ), new Among ( "ousli", 3, 10, "", methodObject ), new Among ( "entli", 3, 5, "", methodObject ), new Among ( "aliti", -1, 8, "", methodObject ), new Among ( "biliti", -1, 12, "", methodObject ), new Among ( "iviti", -1, 11, "", methodObject ), new Among ( "tional", -1, 1, "", methodObject ), new Among ( "ational", 14, 7, "", methodObject ), new Among ( "alism", -1, 8, "", methodObject ), new Among ( "ation", -1, 7, "", methodObject ), new Among ( "ization", 17, 6, "", methodObject ), new Among ( "izer", -1, 6, "", methodObject ), new Among ( "ator", -1, 7, "", methodObject ), new Among ( "iveness", -1, 11, "", methodObject ), new Among ( "fulness", -1, 9, "", methodObject ), new Among ( "ousness", -1, 10, "", methodObject ) }; private final static Among a_6[] = { new Among ( "icate", -1, 4, "", methodObject ), new Among ( "ative", -1, 6, "", methodObject ), new Among ( "alize", -1, 3, "", methodObject ), new Among ( "iciti", -1, 4, "", methodObject ), new Among ( "ical", -1, 4, "", methodObject ), new Among ( "tional", -1, 1, "", methodObject ), new Among ( "ational", 5, 2, "", methodObject ), new Among ( "ful", -1, 5, "", methodObject ), new Among ( "ness", -1, 5, "", methodObject ) }; private final static Among a_7[] = { new Among ( "ic", -1, 1, "", methodObject ), new Among ( "ance", -1, 1, "", methodObject ), new Among ( "ence", -1, 1, "", methodObject ), new Among ( "able", -1, 1, "", methodObject ), new Among ( "ible", 
-1, 1, "", methodObject ), new Among ( "ate", -1, 1, "", methodObject ), new Among ( "ive", -1, 1, "", methodObject ), new Among ( "ize", -1, 1, "", methodObject ), new Among ( "iti", -1, 1, "", methodObject ), new Among ( "al", -1, 1, "", methodObject ), new Among ( "ism", -1, 1, "", methodObject ), new Among ( "ion", -1, 2, "", methodObject ), new Among ( "er", -1, 1, "", methodObject ), new Among ( "ous", -1, 1, "", methodObject ), new Among ( "ant", -1, 1, "", methodObject ), new Among ( "ent", -1, 1, "", methodObject ), new Among ( "ment", 15, 1, "", methodObject ), new Among ( "ement", 16, 1, "", methodObject ) }; private final static Among a_8[] = { new Among ( "e", -1, 1, "", methodObject ), new Among ( "l", -1, 2, "", methodObject ) }; private final static Among a_9[] = { new Among ( "succeed", -1, -1, "", methodObject ), new Among ( "proceed", -1, -1, "", methodObject ), new Among ( "exceed", -1, -1, "", methodObject ), new Among ( "canning", -1, -1, "", methodObject ), new Among ( "inning", -1, -1, "", methodObject ), new Among ( "earring", -1, -1, "", methodObject ), new Among ( "herring", -1, -1, "", methodObject ), new Among ( "outing", -1, -1, "", methodObject ) }; private final static Among a_10[] = { new Among ( "andes", -1, -1, "", methodObject ), new Among ( "atlas", -1, -1, "", methodObject ), new Among ( "bias", -1, -1, "", methodObject ), new Among ( "cosmos", -1, -1, "", methodObject ), new Among ( "dying", -1, 3, "", methodObject ), new Among ( "early", -1, 9, "", methodObject ), new Among ( "gently", -1, 7, "", methodObject ), new Among ( "howe", -1, -1, "", methodObject ), new Among ( "idly", -1, 6, "", methodObject ), new Among ( "lying", -1, 4, "", methodObject ), new Among ( "news", -1, -1, "", methodObject ), new Among ( "only", -1, 10, "", methodObject ), new Among ( "singly", -1, 11, "", methodObject ), new Among ( "skies", -1, 2, "", methodObject ), new Among ( "skis", -1, 1, "", methodObject ), new Among ( "sky", -1, -1, "", 
methodObject ), new Among ( "tying", -1, 5, "", methodObject ), new Among ( "ugly", -1, 8, "", methodObject ) }; private static final char g_v[] = {17, 65, 16, 1 }; private static final char g_v_WXY[] = {1, 17, 65, 208, 1 }; private static final char g_valid_LI[] = {55, 141, 2 }; private boolean B_Y_found; private int I_p2; private int I_p1; private void copy_from(englishStemmer other) { B_Y_found = other.B_Y_found; I_p2 = other.I_p2; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_prelude() { int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 25 // unset Y_found, line 26 B_Y_found = false; // do, line 27 v_1 = cursor; lab0: do { // (, line 27 // [, line 27 bra = cursor; // literal, line 27 if (!(eq_s(1, "'"))) { break lab0; } // ], line 27 ket = cursor; // delete, line 27 slice_del(); } while (false); cursor = v_1; // do, line 28 v_2 = cursor; lab1: do { // (, line 28 // [, line 28 bra = cursor; // literal, line 28 if (!(eq_s(1, "y"))) { break lab1; } // ], line 28 ket = cursor; // <-, line 28 slice_from("Y"); // set Y_found, line 28 B_Y_found = true; } while (false); cursor = v_2; // do, line 29 v_3 = cursor; lab2: do { // repeat, line 29 replab3: while(true) { v_4 = cursor; lab4: do { // (, line 29 // goto, line 29 golab5: while(true) { v_5 = cursor; lab6: do { // (, line 29 if (!(in_grouping(g_v, 97, 121))) { break lab6; } // [, line 29 bra = cursor; // literal, line 29 if (!(eq_s(1, "y"))) { break lab6; } // ], line 29 ket = cursor; cursor = v_5; break golab5; } while (false); cursor = v_5; if (cursor >= limit) { break lab4; } cursor++; } // <-, line 29 slice_from("Y"); // set Y_found, line 29 B_Y_found = true; continue replab3; } while (false); cursor = v_4; break replab3; } } while (false); cursor = v_3; return true; } private boolean r_mark_regions() { int v_1; int v_2; // (, line 32 I_p1 = limit; I_p2 = limit; // do, line 35 v_1 = cursor; lab0: do { // (, line 35 // or, line 41 lab1: do { v_2 = cursor; lab2: do { // among, line 36 if 
(find_among(a_0, 3) == 0) { break lab2; } break lab1; } while (false); cursor = v_2; // (, line 41 // gopast, line 41 golab3: while(true) { lab4: do { if (!(in_grouping(g_v, 97, 121))) { break lab4; } break golab3; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // gopast, line 41 golab5: while(true) { lab6: do { if (!(out_grouping(g_v, 97, 121))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { break lab0; } cursor++; } } while (false); // setmark p1, line 42 I_p1 = cursor; // gopast, line 43 golab7: while(true) { lab8: do { if (!(in_grouping(g_v, 97, 121))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // gopast, line 43 golab9: while(true) { lab10: do { if (!(out_grouping(g_v, 97, 121))) { break lab10; } break golab9; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // setmark p2, line 43 I_p2 = cursor; } while (false); cursor = v_1; return true; } private boolean r_shortv() { int v_1; // (, line 49 // or, line 51 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 50 if (!(out_grouping_b(g_v_WXY, 89, 121))) { break lab1; } if (!(in_grouping_b(g_v, 97, 121))) { break lab1; } if (!(out_grouping_b(g_v, 97, 121))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // (, line 52 if (!(out_grouping_b(g_v, 97, 121))) { return false; } if (!(in_grouping_b(g_v, 97, 121))) { return false; } // atlimit, line 52 if (cursor > limit_backward) { return false; } } while (false); return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_Step_1a() { int among_var; int v_1; int v_2; // (, line 58 // try, line 59 v_1 = limit - cursor; lab0: do { // (, line 59 // [, line 60 ket = cursor; // substring, line 60 among_var = find_among_b(a_1, 3); if (among_var == 0) { cursor = limit - v_1; break lab0; } // ], line 60 bra = 
cursor; switch(among_var) { case 0: cursor = limit - v_1; break lab0; case 1: // (, line 62 // delete, line 62 slice_del(); break; } } while (false); // [, line 65 ket = cursor; // substring, line 65 among_var = find_among_b(a_2, 6); if (among_var == 0) { return false; } // ], line 65 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 66 // <-, line 66 slice_from("ss"); break; case 2: // (, line 68 // or, line 68 lab1: do { v_2 = limit - cursor; lab2: do { // (, line 68 // hop, line 68 { int c = cursor - 2; if (limit_backward > c || c > limit) { break lab2; } cursor = c; } // <-, line 68 slice_from("i"); break lab1; } while (false); cursor = limit - v_2; // <-, line 68 slice_from("ie"); } while (false); break; case 3: // (, line 69 // next, line 69 if (cursor <= limit_backward) { return false; } cursor--; // gopast, line 69 golab3: while(true) { lab4: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab4; } break golab3; } while (false); if (cursor <= limit_backward) { return false; } cursor--; } // delete, line 69 slice_del(); break; } return true; } private boolean r_Step_1b() { int among_var; int v_1; int v_3; int v_4; // (, line 74 // [, line 75 ket = cursor; // substring, line 75 among_var = find_among_b(a_4, 6); if (among_var == 0) { return false; } // ], line 75 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 77 // call R1, line 77 if (!r_R1()) { return false; } // <-, line 77 slice_from("ee"); break; case 2: // (, line 79 // test, line 80 v_1 = limit - cursor; // gopast, line 80 golab0: while(true) { lab1: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab1; } break golab0; } while (false); if (cursor <= limit_backward) { return false; } cursor--; } cursor = limit - v_1; // delete, line 80 slice_del(); // test, line 81 v_3 = limit - cursor; // substring, line 81 among_var = find_among_b(a_3, 13); if (among_var == 0) { return false; } cursor = limit - v_3; switch(among_var) { case 0: return false; case 1: 
// (, line 83 // <+, line 83 { int c = cursor; insert(cursor, cursor, "e"); cursor = c; } break; case 2: // (, line 86 // [, line 86 ket = cursor; // next, line 86 if (cursor <= limit_backward) { return false; } cursor--; // ], line 86 bra = cursor; // delete, line 86 slice_del(); break; case 3: // (, line 87 // atmark, line 87 if (cursor != I_p1) { return false; } // test, line 87 v_4 = limit - cursor; // call shortv, line 87 if (!r_shortv()) { return false; } cursor = limit - v_4; // <+, line 87 { int c = cursor; insert(cursor, cursor, "e"); cursor = c; } break; } break; } return true; } private boolean r_Step_1c() { int v_1; int v_2; // (, line 93 // [, line 94 ket = cursor; // or, line 94 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 94 if (!(eq_s_b(1, "y"))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 94 if (!(eq_s_b(1, "Y"))) { return false; } } while (false); // ], line 94 bra = cursor; if (!(out_grouping_b(g_v, 97, 121))) { return false; } // not, line 95 { v_2 = limit - cursor; lab2: do { // atlimit, line 95 if (cursor > limit_backward) { break lab2; } return false; } while (false); cursor = limit - v_2; } // <-, line 96 slice_from("i"); return true; } private boolean r_Step_2() { int among_var; // (, line 99 // [, line 100 ket = cursor; // substring, line 100 among_var = find_among_b(a_5, 24); if (among_var == 0) { return false; } // ], line 100 bra = cursor; // call R1, line 100 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 101 // <-, line 101 slice_from("tion"); break; case 2: // (, line 102 // <-, line 102 slice_from("ence"); break; case 3: // (, line 103 // <-, line 103 slice_from("ance"); break; case 4: // (, line 104 // <-, line 104 slice_from("able"); break; case 5: // (, line 105 // <-, line 105 slice_from("ent"); break; case 6: // (, line 107 // <-, line 107 slice_from("ize"); break; case 7: // (, line 109 // <-, line 109 slice_from("ate"); break; 
case 8: // (, line 111 // <-, line 111 slice_from("al"); break; case 9: // (, line 112 // <-, line 112 slice_from("ful"); break; case 10: // (, line 114 // <-, line 114 slice_from("ous"); break; case 11: // (, line 116 // <-, line 116 slice_from("ive"); break; case 12: // (, line 118 // <-, line 118 slice_from("ble"); break; case 13: // (, line 119 // literal, line 119 if (!(eq_s_b(1, "l"))) { return false; } // <-, line 119 slice_from("og"); break; case 14: // (, line 120 // <-, line 120 slice_from("ful"); break; case 15: // (, line 121 // <-, line 121 slice_from("less"); break; case 16: // (, line 122 if (!(in_grouping_b(g_valid_LI, 99, 116))) { return false; } // delete, line 122 slice_del(); break; } return true; } private boolean r_Step_3() { int among_var; // (, line 126 // [, line 127 ket = cursor; // substring, line 127 among_var = find_among_b(a_6, 9); if (among_var == 0) { return false; } // ], line 127 bra = cursor; // call R1, line 127 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 128 // <-, line 128 slice_from("tion"); break; case 2: // (, line 129 // <-, line 129 slice_from("ate"); break; case 3: // (, line 130 // <-, line 130 slice_from("al"); break; case 4: // (, line 132 // <-, line 132 slice_from("ic"); break; case 5: // (, line 134 // delete, line 134 slice_del(); break; case 6: // (, line 136 // call R2, line 136 if (!r_R2()) { return false; } // delete, line 136 slice_del(); break; } return true; } private boolean r_Step_4() { int among_var; int v_1; // (, line 140 // [, line 141 ket = cursor; // substring, line 141 among_var = find_among_b(a_7, 18); if (among_var == 0) { return false; } // ], line 141 bra = cursor; // call R2, line 141 if (!r_R2()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 144 // delete, line 144 slice_del(); break; case 2: // (, line 145 // or, line 145 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 145 if (!(eq_s_b(1, "s"))) { 
break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 145 if (!(eq_s_b(1, "t"))) { return false; } } while (false); // delete, line 145 slice_del(); break; } return true; } private boolean r_Step_5() { int among_var; int v_1; int v_2; // (, line 149 // [, line 150 ket = cursor; // substring, line 150 among_var = find_among_b(a_8, 2); if (among_var == 0) { return false; } // ], line 150 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 151 // or, line 151 lab0: do { v_1 = limit - cursor; lab1: do { // call R2, line 151 if (!r_R2()) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // (, line 151 // call R1, line 151 if (!r_R1()) { return false; } // not, line 151 { v_2 = limit - cursor; lab2: do { // call shortv, line 151 if (!r_shortv()) { break lab2; } return false; } while (false); cursor = limit - v_2; } } while (false); // delete, line 151 slice_del(); break; case 2: // (, line 152 // call R2, line 152 if (!r_R2()) { return false; } // literal, line 152 if (!(eq_s_b(1, "l"))) { return false; } // delete, line 152 slice_del(); break; } return true; } private boolean r_exception2() { // (, line 156 // [, line 158 ket = cursor; // substring, line 158 if (find_among_b(a_9, 8) == 0) { return false; } // ], line 158 bra = cursor; // atlimit, line 158 if (cursor > limit_backward) { return false; } return true; } private boolean r_exception1() { int among_var; // (, line 168 // [, line 170 bra = cursor; // substring, line 170 among_var = find_among(a_10, 18); if (among_var == 0) { return false; } // ], line 170 ket = cursor; // atlimit, line 170 if (cursor < limit) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 174 // <-, line 174 slice_from("ski"); break; case 2: // (, line 175 // <-, line 175 slice_from("sky"); break; case 3: // (, line 176 // <-, line 176 slice_from("die"); break; case 4: // (, line 177 // <-, line 177 slice_from("lie"); break; case 5: // (, line 
178 // <-, line 178 slice_from("tie"); break; case 6: // (, line 182 // <-, line 182 slice_from("idl"); break; case 7: // (, line 183 // <-, line 183 slice_from("gentl"); break; case 8: // (, line 184 // <-, line 184 slice_from("ugli"); break; case 9: // (, line 185 // <-, line 185 slice_from("earli"); break; case 10: // (, line 186 // <-, line 186 slice_from("onli"); break; case 11: // (, line 187 // <-, line 187 slice_from("singl"); break; } return true; } private boolean r_postlude() { int v_1; int v_2; // (, line 203 // Boolean test Y_found, line 203 if (!(B_Y_found)) { return false; } // repeat, line 203 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 203 // goto, line 203 golab2: while(true) { v_2 = cursor; lab3: do { // (, line 203 // [, line 203 bra = cursor; // literal, line 203 if (!(eq_s(1, "Y"))) { break lab3; } // ], line 203 ket = cursor; cursor = v_2; break golab2; } while (false); cursor = v_2; if (cursor >= limit) { break lab1; } cursor++; } // <-, line 203 slice_from("y"); continue replab0; } while (false); cursor = v_1; break replab0; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; int v_12; int v_13; // (, line 205 // or, line 207 lab0: do { v_1 = cursor; lab1: do { // call exception1, line 207 if (!r_exception1()) { break lab1; } break lab0; } while (false); cursor = v_1; lab2: do { // not, line 208 { v_2 = cursor; lab3: do { // hop, line 208 { int c = cursor + 3; if (0 > c || c > limit) { break lab3; } cursor = c; } break lab2; } while (false); cursor = v_2; } break lab0; } while (false); cursor = v_1; // (, line 208 // do, line 209 v_3 = cursor; lab4: do { // call prelude, line 209 if (!r_prelude()) { break lab4; } } while (false); cursor = v_3; // do, line 210 v_4 = cursor; lab5: do { // call mark_regions, line 210 if (!r_mark_regions()) { break lab5; } } while (false); cursor = v_4; // backwards, line 211 limit_backward = cursor; cursor 
= limit; // (, line 211 // do, line 213 v_5 = limit - cursor; lab6: do { // call Step_1a, line 213 if (!r_Step_1a()) { break lab6; } } while (false); cursor = limit - v_5; // or, line 215 lab7: do { v_6 = limit - cursor; lab8: do { // call exception2, line 215 if (!r_exception2()) { break lab8; } break lab7; } while (false); cursor = limit - v_6; // (, line 215 // do, line 217 v_7 = limit - cursor; lab9: do { // call Step_1b, line 217 if (!r_Step_1b()) { break lab9; } } while (false); cursor = limit - v_7; // do, line 218 v_8 = limit - cursor; lab10: do { // call Step_1c, line 218 if (!r_Step_1c()) { break lab10; } } while (false); cursor = limit - v_8; // do, line 220 v_9 = limit - cursor; lab11: do { // call Step_2, line 220 if (!r_Step_2()) { break lab11; } } while (false); cursor = limit - v_9; // do, line 221 v_10 = limit - cursor; lab12: do { // call Step_3, line 221 if (!r_Step_3()) { break lab12; } } while (false); cursor = limit - v_10; // do, line 222 v_11 = limit - cursor; lab13: do { // call Step_4, line 222 if (!r_Step_4()) { break lab13; } } while (false); cursor = limit - v_11; // do, line 224 v_12 = limit - cursor; lab14: do { // call Step_5, line 224 if (!r_Step_5()) { break lab14; } } while (false); cursor = limit - v_12; } while (false); cursor = limit_backward; // do, line 227 v_13 = cursor; lab15: do { // call postlude, line 227 if (!r_postlude()) { break lab15; } } while (false); cursor = v_13; } while (false); return true; } public boolean equals( Object o ) { return o instanceof englishStemmer; } public int hashCode() { return englishStemmer.class.getName().hashCode(); } }
然而,Porter stemming 仅仅是一种基于后缀的词干提取技术,它只定义了一些基本的后缀规则,能识别出 "books"->"book" 等规则变化。但对于诸如 "bought"->"buy"、"brought"->"bring" 等不规则变化形式,它并不能识别出来。
2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)
后来发现了上面这个 NLP 处理工具包,其中的 EngLemmatiser 类就是它的 stem 类,能提取出单词的词干。
它首先定义一些基本的后缀规则(只有十几条),然后定义一些独立于这些规则的异常词库(master/slave 的形式),这样就能基本实现单词词干的正确提取,解决了 Porter stemming 存在的问题。
// Dragon Toolkit example: lemmatise one irregular verb form and print the result.
// NOTE(review): the meaning of the two boolean constructor flags is not
// visible here; confirm against the EngLemmatiser documentation.
final String lexiconDir = "lemmatiser";
final EngLemmatiser engine = new EngLemmatiser(lexiconDir, false, true);

final String input = "brought";
System.out.println(engine.lemmatize(input));
然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。
3. Stanford CoreNLP
后来发现了斯坦福大学的一个 NLP 工具,其中提取词干的技术是:针对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。它能很好地解决词干的提取问题,准确率很高。不过它对地名、人名等专有名词识别不出来,但已经能满足基本的需求。
// Stanford CoreNLP example: stem a single token and print the result.
final Morphology analyzer = new Morphology();
final String token = "magnificus";
final String stemmed = analyzer.stem(token);
System.out.println(stemmed);