分类问题有时可以当作回归问题来处理:回归器为每个样本输出一个分值,这就需要找到若干阈值,把分值准确地划分到各个类别。
例如一个文本情感分类问题:情感有0(消极)、1(中性)、2(积极)三种类别。回归器返回的情感的分值分别为0.2,0.3,0.4,0.45,0.66,1.2,1.3,1.4,它们对应的类别分别为0,0,1,2,1,1,2,2,需要找到两个阈值x,y,小于x的表示0类别,x和y之间的表示1类别,大于y的表示2类别。
如果通过暴力枚举所有阈值组合来寻找最优解,复杂度为O(样本数^类别数)。
如果使用贪心法,在样本已按分值排序的前提下,复杂度可以降为O(样本数)。
做法是把每次寻找阈值当做一个二分类问题,依次确定各个阈值。这种方法以牺牲准确性为代价换取时间效率的提升,得到的结果不一定是最优解。
import java.util.Arrays;
import java.util.Comparator;
import java.util.Random;
/**
 * Experiment that compares two ways of choosing two thresholds which split a
 * score-sorted list of samples into the classes 0, 1 and 2:
 * an exhaustive search ({@link #bruteforceScore}) and a greedy search
 * ({@link #mine}) that fixes one threshold at a time.
 *
 * <p>Thresholds are expressed as array indices {@code i <= j}: samples at
 * indices {@code [0, i)} are predicted as class 0, {@code [i, j)} as class 1
 * and {@code [j, length)} as class 2.
 */
public class Main {
    /** Source of randomness for {@link #generateProblem()}. */
    final Random r = new Random();

    /** A labelled sample: the regressor's score plus the true class. */
    static class Sample {
        final int score; // score assigned to the sample
        final int type;  // true class of the sample (0, 1 or 2)

        Sample(int score, int type) {
            this.score = score;
            this.type = type;
        }

        @Override
        public String toString() {
            return "(" + score + "," + type + ")";
        }
    }

    /**
     * Creates an instance without running the experiment; call {@link #run()}
     * to start the randomized comparison.
     */
    Main() {
    }

    /**
     * Generates a random problem instance.
     *
     * @return between 2 and 5 samples with scores in {@code [0, 50)} and
     *         classes in {@code [0, 3)}, sorted by ascending score
     */
    Sample[] generateProblem() {
        int n = r.nextInt(4) + 2;
        Sample[] a = new Sample[n];
        for (int i = 0; i < n; i++) {
            a[i] = new Sample(r.nextInt(50), r.nextInt(3));
        }
        Arrays.sort(a, Comparator.comparingInt(x -> x.score));
        return a;
    }

    /**
     * Tries every threshold pair {@code 0 <= i <= j <= a.length}, prints the
     * first pair that reaches the best score, and returns that score.
     *
     * @param a samples sorted by score
     * @return the highest number of correctly classified samples
     */
    int bruteforceScore(Sample[] a) {
        int bestI = 0;
        int bestJ = 0;
        int bestScore = 0;
        for (int i = 0; i <= a.length; i++) { // first threshold
            for (int j = i; j <= a.length; j++) { // second threshold
                int score = getScore(a, i, j);
                if (score > bestScore) {
                    bestScore = score;
                    bestI = i;
                    bestJ = j;
                }
            }
        }
        System.out.println("ans i: " + bestI + " ans j:" + bestJ);
        return bestScore;
    }

    /**
     * Counts the samples classified correctly by the threshold pair
     * {@code (i, j)}. Both callers in this class pass {@code i <= j}.
     *
     * @param a samples sorted by score
     * @param i index of the first sample no longer predicted as class 0
     * @param j index of the first sample predicted as class 2
     * @return number of samples whose true class matches the prediction
     */
    int getScore(Sample[] a, int i, int j) {
        int rightCount = 0;
        for (int k = 0; k < a.length; k++) {
            if (k < i && a[k].type == 0) {
                rightCount++;
            } else if (k >= i && k < j && a[k].type == 1) {
                rightCount++;
            } else if (k >= j && a[k].type == 2) {
                rightCount++;
            }
        }
        return rightCount;
    }

    /**
     * Greedy threshold search: first picks the split between class 0 and
     * everything else, then, keeping that split fixed, picks the split between
     * class 1 and class 2 among the remaining samples. Each step is a single
     * linear sweep, so the result may be worse than {@link #bruteforceScore}.
     *
     * @param a samples sorted by score
     * @return number of samples classified correctly by the greedy thresholds
     */
    int mine(Sample[] a) {
        int bestI = pickFirstThreshold(a);
        if (bestI == a.length) {
            // Everything is predicted as class 0; there is no second split.
            return getScore(a, a.length, a.length);
        }
        int bestJ = pickSecondThreshold(a, bestI);
        System.out.println();
        System.out.println("bestI " + bestI + " bestJ " + bestJ);
        return getScore(a, bestI, bestJ);
    }

    /**
     * Returns the last index that maximizes the "class 0 vs. rest" accuracy.
     */
    private int pickFirstThreshold(Sample[] a) {
        // With the split at 0 every sample is predicted non-zero.
        int correct = 0;
        for (Sample s : a) {
            if (s.type != 0) {
                correct++;
            }
        }
        int best = 0;
        int bestI = 0;
        for (int i = 0; i < a.length; i++) {
            if (correct >= best) { // ">=" keeps the right-most best split
                best = correct;
                bestI = i;
            }
            // Moving the split past a[i] now predicts a[i] as class 0.
            correct += (a[i].type == 0) ? 1 : -1;
        }
        if (correct >= best) {
            bestI = a.length;
        }
        return bestI;
    }

    /**
     * Returns the first index in {@code [first, a.length]} that maximizes the
     * "class 1 vs. class 2" accuracy over the samples from {@code first} on.
     */
    private int pickSecondThreshold(Sample[] a, int first) {
        // Count by index, not by score: samples before `first` are already
        // assigned to class 0 even when they tie with a[first] on score.
        int correct = 0;
        for (int k = first; k < a.length; k++) {
            if (a[k].type == 2) {
                correct++;
            }
        }
        int best = 0;
        int bestJ = first; // never fall back to an index below `first`
        for (int j = first; j < a.length; j++) {
            if (correct > best) { // ">" keeps the left-most best split
                best = correct;
                bestJ = j;
            }
            // Moving the split past a[j] now predicts a[j] as class 1;
            // class-0 samples are wrong on either side of the split.
            if (a[j].type == 1) {
                correct++;
            } else if (a[j].type == 2) {
                correct--;
            }
        }
        if (correct > best) {
            bestJ = a.length;
        }
        return bestJ;
    }

    /**
     * Repeatedly generates random problems, prints each one together with the
     * brute-force and greedy scores, and stops at the first problem on which
     * the two scores differ. Does not return while the scores keep agreeing.
     */
    void run() {
        while (true) {
            Sample[] a = generateProblem();
            for (Sample s : a) {
                System.out.print(s);
            }
            System.out.println();
            int x = bruteforceScore(a);
            int y = mine(a);
            System.out.println(x + " " + y);
            if (x != y) {
                break;
            }
        }
    }

    public static void main(String[] args) {
        new Main().run();
    }
}