Levenshtein Distance(LD)算法用于比较两个字符串的相似性。两个字符串之间的距离,就是把一个字符串转换成另一个字符串所需的添加、删除、修改操作的最少次数。设s是源字符串(source),t是目标字符串(target),比较s和t是否相似,如下:
如果s="test",t="test",那么LD(s,t) = 0。没有经过转换。
如果s="test",t="tent",那么LD(s,t) = 1。s中的"s"转换成"n",转换了一个字符,所以是1。它们的距离越大,说明它们越不同。
Levenshtein distance最先是由俄国科学家Vladimir Levenshtein在1965年发明,用他的名字命名。
Levenshtein distance可以用来:
Spell checking(拼写检查)
Speech recognition(语音识别)
DNA analysis(DNA分析)
Plagiarism detection(抄袭检测)
LD算法大概过程:
Steps
Step | Description |
---|---|
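1 | Set n to be the length of s. Set m to be the length of t. If n = 0, return m and exit. If m = 0, return n and exit. Construct a matrix containing 0..m rows and 0..n columns. |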
2 | Initialize the first row to 0..n. Initialize the first column to 0..m. |
3 | Examine each character of s (i from 1 to n). |
4 | Examine each character of t (j from 1 to m). |
5 | If s[i] equals t[j], the cost is 0. If s[i] doesn't equal t[j], the cost is 1. |
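6 | Set cell d[i,j] of the matrix equal to the minimum of: (a) the cell immediately above plus 1: d[i-1,j] + 1; (b) the cell immediately to the left plus 1: d[i,j-1] + 1; (c) the cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost. |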
7 | After the iteration steps (3, 4, 5, 6) are complete, the distance is found in cell d[n,m] |
distance头文件
#ifndef DISTANCE_H
#define DISTANCE_H

// Computes the Levenshtein (edit) distance between two C strings.
//
// The include guard around this declaration lets the header be included
// more than once in a translation unit without redefining the class.
class Distance
{
public:
    // Returns the Levenshtein distance between the NUL-terminated strings
    // s (source) and t (target).
    int LD (char const *s, char const *t);

private:
    // Returns the smallest of a, b and c.
    int Minimum (int a, int b, int c);

    // The helpers below address a matrix stored as a flat int array that
    // starts at pOrigin and holds (nCols + 1) cells per row.

    // Returns the address of the cell at (col, row).
    int *GetCellPointer (int *pOrigin, int col, int row, int nCols);

    // Returns the value stored in the cell at (col, row).
    int GetAt (int *pOrigin, int col, int row, int nCols);

    // Stores x in the cell at (col, row).
    void PutAt (int *pOrigin, int col, int row, int nCols, int x);
};

#endif // DISTANCE_H
distance代码文件
#include "distance.h"
#include <string.h>
#include <malloc.h>
#include <vector>
- //****************************
- // Get minimum of three values
- //****************************
- int Distance::Minimum (int a, int b, int c)
- {
- int mi;
- mi = a;
- if (b < mi) {
- mi = b;
- }
- if (c < mi) {
- mi = c;
- }
- return mi;
- }
- //**************************************************
- // Get a pointer to the specified cell of the matrix
- //**************************************************
- int *Distance::GetCellPointer (int *pOrigin, int col, int row, int nCols)
- {
- return pOrigin + col + (row * (nCols + 1));
- }
- //*****************************************************
- // Get the contents of the specified cell in the matrix
- //*****************************************************
- int Distance::GetAt (int *pOrigin, int col, int row, int nCols)
- {
- int *pCell;
- pCell = GetCellPointer (pOrigin, col, row, nCols);
- return *pCell;
- }
- //*******************************************************
- // Fill the specified cell in the matrix with the value x
- //*******************************************************
- void Distance::PutAt (int *pOrigin, int col, int row, int nCols, int x)
- {
- int *pCell;
- pCell = GetCellPointer (pOrigin, col, row, nCols);
- *pCell = x;
- }
- //*****************************
- // Compute Levenshtein distance
- //*****************************
- int Distance::LD (char const *s, char const *t)
- {
- int *d; // pointer to matrix
- int n; // length of s
- int m; // length of t
- int i; // iterates through s
- int j; // iterates through t
- char s_i; // ith character of s
- char t_j; // jth character of t
- int cost; // cost
- int result; // result
- int cell; // contents of target cell
- int above; // contents of cell immediately above
- int left; // contents of cell immediately to left
- int diag; // contents of cell immediately above and to left
- int sz; // number of cells in matrix
- // Step 1
- n = strlen (s);
- m = strlen (t);
- if (n == 0) {
- return m;
- }
- if (m == 0) {
- return n;
- }
- sz = (n+1) * (m+1) * sizeof (int);
- d = (int *) malloc (sz);
- // Step 2
- for (i = 0; i <= n; i++) {
- PutAt (d, i, 0, n, i);
- }
- for (j = 0; j <= m; j++) {
- PutAt (d, 0, j, n, j);
- }
- // Step 3
- for (i = 1; i <= n; i++) {
- s_i = s[i-1];
- // Step 4
- for (j = 1; j <= m; j++) {
- t_j = t[j-1];
- // Step 5
- if (s_i == t_j) {
- cost = 0;
- }
- else {
- cost = 1;
- }
- // Step 6
- above = GetAt (d,i-1,j, n);
- left = GetAt (d,i, j-1, n);
- diag = GetAt (d, i-1,j-1, n);
- cell = Minimum (above + 1, left + 1, diag + cost);
- PutAt (d, i, j, n, cell);
- }
- }
- // Step 7
- result = GetAt (d, n, m, n);
- free (d);
- return result;
- }