需要重采样的数据文件(Libsvm format),如heart_scale
+1 1:0.708333 2:1 3:1 4:-0.320755 5:-0.105023 6:-1 7:1 8:-0.419847 9:-1 10:-0.225806 12:1 13:-1
-1 1:0.583333 2:-1 3:0.333333 4:-0.603774 5:1 6:-1 7:1 8:0.358779 9:-1 10:-0.483871 12:-1 13:1
....
重采样后的数据保存文件(Libsvm format),这里为 heart_scale_balance.txt
Python code:
# Random over-sampling for LIBSVM-format data: every minority class is padded
# with randomly duplicated rows until it is as large as the majority class.
#
# The scikit-learn I/O helpers are imported inside main() so that fit_sample()
# itself only depends on numpy and scipy.
import numpy as np
from scipy.sparse import hstack, issparse, vstack


def _as_random_state(seed):
    """Return a ``numpy.random.RandomState`` for ``seed``.

    An existing ``RandomState`` instance is passed through untouched; any
    other value (``None`` or an int) is used to seed a new generator.
    """
    if isinstance(seed, np.random.RandomState):
        return seed
    return np.random.RandomState(seed)


def fit_sample(X, y, random_state=42):
    """Balance a data set by randomly over-sampling the minority classes.

    The most frequent label (the first one in sorted order on ties) is the
    majority class. Every other class keeps all of its rows and additionally
    receives rows drawn at random, with replacement, from itself until it
    has as many rows as the majority class.

    Parameters
    ----------
    X : scipy sparse matrix or array-like, first dimension n_samples
        Feature matrix. Sparse input must support row indexing (e.g. CSR).
    y : array-like of shape (n_samples,)
        Class label of each row of ``X``.
    random_state : int, None or numpy.random.RandomState, default 42
        Seed or generator used to pick the duplicated rows.

    Returns
    -------
    X_resampled : same kind of container as ``X``
        Majority rows first, then for each remaining class (in sorted label
        order) its original rows followed by its duplicated rows.
    y_resampled : numpy.ndarray
        Labels aligned with the rows of ``X_resampled``.

    Raises
    ------
    ValueError
        If ``y`` is not one-dimensional or its length differs from the
        number of rows in ``X``.
    """
    y = np.asarray(y)
    if y.ndim != 1:
        raise ValueError("y must be one-dimensional, got %d dimensions" % y.ndim)

    sparse_input = issparse(X)
    if not sparse_input:
        X = np.asarray(X)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "X and y have inconsistent lengths: %d != %d"
            % (X.shape[0], y.shape[0])
        )

    # Nothing to balance; avoids looking up a majority class that does not exist.
    if y.shape[0] == 0:
        return X, y

    labels, counts = np.unique(y, return_counts=True)
    majority_pos = int(np.argmax(counts))  # first maximum wins on ties
    majority_count = int(counts[majority_pos])

    # One generator for the whole call so that each class draws from a
    # different part of the random stream.
    rng = _as_random_state(random_state)

    majority_rows = np.flatnonzero(y == labels[majority_pos])
    X_blocks = [X[majority_rows]]
    y_blocks = [y[majority_rows]]

    for pos, (label, count) in enumerate(zip(labels, counts)):
        if pos == majority_pos:
            continue

        rows = np.flatnonzero(y == label)
        X_blocks.append(X[rows])
        y_blocks.append(y[rows])

        n_extra = majority_count - int(count)
        if n_extra > 0:
            # Positions within this class, sampled with replacement.
            picks = rng.randint(low=0, high=int(count), size=n_extra)
            extra_rows = rows[picks]
            X_blocks.append(X[extra_rows])
            y_blocks.append(y[extra_rows])

    # Stack once at the end instead of re-copying the result per class.
    if sparse_input:
        X_resampled = vstack(X_blocks, format=X.format)
    else:
        X_resampled = np.concatenate(X_blocks, axis=0)
    y_resampled = np.concatenate(y_blocks)
    return X_resampled, y_resampled


def main(input_path="heart_scale", output_path="heart_scale_balance.txt"):
    """Load a LIBSVM file, over-sample it and write the balanced copy."""
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file

    X_train, y_train = load_svmlight_file(input_path)
    # Apply the random over-sampling
    X_train, y_train = fit_sample(X_train, y_train)
    dump_svmlight_file(X_train, y_train, output_path, zero_based=False)


if __name__ == "__main__":
    main()