机器学习：PLA

国庆期间的作业：

* 了解掌握线性感知机算法（PLA）的基本原理和算法流程，并使用PLA来解决一个实际的分类问题。

数据集介绍：

data1.csv —— 维度为100x3，包含100个样本，前两列是数据特征，最后一列是输出标签label。该数据集线性可分。

线性可分：采用PLA

线性不可分：采用Pocket Learning Algorithm

#利用Python实现感知机算法的原始形式
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#1、创建数据集
def createdata(path='data1.csv'):
    """Load a two-feature labelled data set from a header-less CSV.

    The CSV columns are read, in order, as ``x``, ``y`` and ``labels``.

    Args:
        path: File path or file-like object handed to ``pandas.read_csv``.
            Defaults to ``'data1.csv'``, the file the original code always read.

    Returns:
        A ``(samples, labels)`` tuple: ``samples`` is the array of the ``x``
        and ``y`` columns (one row per sample), ``labels`` is a plain Python
        list of the values in the ``labels`` column.
    """
    df = pd.read_csv(path, names=['x', 'y', 'labels'])
    samples = df[['x', 'y']].values
    labels = df['labels'].tolist()
    return samples, labels

#训练感知机模型
class Perceptron:
    """Perceptron learning algorithm (PLA) in its primal form.

    Args:
        x: 2-D array of samples, one row per sample and one column per feature.
        y: Sequence of class labels aligned with the rows of ``x``.
            NOTE(review): the update rule multiplies by the label, so labels
            are presumably +1 / -1 -- confirm against the data file.
        a: Learning rate applied to every update.
    """

    def __init__(self, x, y, a=1):
        self.x = x
        self.y = y
        # One weight per feature, stored as a column vector; all start at 0.
        self.w = np.zeros((x.shape[1], 1))
        self.b = 0
        # Honour the caller's learning rate (it used to be overwritten with 1).
        self.a = a
        self.numsamples = self.x.shape[0]
        self.numfeatures = self.x.shape[1]

    def sign(self, w, b, x):
        """Return the sign (-1, 0 or 1) of ``x . w + b`` for one sample.

        The previous implementation returned ``int(x . w + b)``, which
        truncates toward zero: every activation in (-1, 1) became 0 and the
        sample was then treated as misclassified even when it was on the
        correct side of the boundary.
        """
        # .item() extracts the single value without relying on the deprecated
        # implicit conversion of a 1-element array to a Python scalar.
        activation = np.asarray(np.dot(x, w) + b).item()
        if activation > 0:
            return 1
        if activation < 0:
            return -1
        return 0

    def update(self, label_i, data_i):
        """Apply one perceptron update for a misclassified sample.

        Adds ``a * label_i * data_i`` to the weights and ``a * label_i`` to
        the bias.
        """
        step = (label_i * self.a * data_i).reshape(self.w.shape)
        self.w = step + self.w
        self.b = self.b + label_i * self.a

    def train(self, max_iter=None):
        """Sweep over the samples until a full pass makes no mistakes.

        Args:
            max_iter: Optional cap on the number of full passes. ``None``
                (the default) keeps looping until a pass has no
                misclassified sample, which never happens if the data is not
                linearly separable.

        Returns:
            The ``(w, b)`` pair held by the instance when training stops.
        """
        passes = 0
        while max_iter is None or passes < max_iter:
            count = 0
            for sample, target in zip(self.x, self.y):
                # A non-positive product means the sample is on the wrong
                # side of the boundary, or exactly on it.
                if self.sign(self.w, self.b, sample) * target <= 0:
                    print("误分类点为：", sample, "此时的w和b为：", self.w, self.b)
                    count += 1
                    self.update(target, sample)
            passes += 1
            if count == 0:
                print('最终训练得到的w和b为：', self.w, self.b)
                break
        return self.w, self.b

#画图描绘
class Picture:
    """Draw the samples and the learned boundary, then save them to ``2d.png``.

    All drawing happens in the constructor; ``Show`` only displays the figure.

    Args:
        data: 2-D array of samples; column 0 is drawn on the horizontal axis
            and column 1 on the vertical axis.
        labels: Sequence of class labels aligned with the rows of ``data``.
            Samples labelled ``-1`` are drawn with an ``x`` marker, all
            others with the default marker.
        w: Weights of the boundary ``w[0]*x0 + w[1]*x1 + b = 0``.
        b: Bias of the boundary.
    """

    def __init__(self, data, labels, w, b):
        self.b = b
        self.w = w
        self.data = data
        self.labels = labels
        plt.figure(1)
        plt.title('Perceptron Learning Algorithm', size=14)
        plt.xlabel('x0-axis', size=14)
        plt.ylabel('x1-axis', size=14)

        # Draw the boundary across the horizontal extent of the data rather
        # than a fixed 4..7 window; fall back to that window with no samples.
        if data.shape[0] > 0:
            x_low, x_high = data[:, 0].min(), data[:, 0].max()
        else:
            x_low, x_high = 4, 7
        xData = np.linspace(x_low, x_high, 100)
        yData = self.expression(xData)
        plt.plot(xData, yData, color='r', label='sample data')

        # One scatter call per class: a separate call per point makes every
        # point take the next colour of the cycle, so the two classes could
        # not be told apart by colour.
        is_negative = np.asarray(labels) == -1
        plt.scatter(data[~is_negative, 0], data[~is_negative, 1], s=15)
        plt.scatter(data[is_negative, 0], data[is_negative, 1], s=15, marker='x')
        plt.savefig('2d.png', dpi=175)

    def expression(self, x):
        """Return the boundary's vertical coordinate for horizontal values ``x``.

        Solves ``w[0]*x + w[1]*y + b = 0`` for ``y``.
        """
        y = (-self.b - self.w[0] * x) / self.w[1]
        return y

    def Show(self):
        """Display the figure(s) created so far."""
        plt.show()


if __name__ == '__main__':
    # Load the samples, fit the perceptron, then plot the learned boundary.
    samples, labels = createdata()
    myperceptron = Perceptron(x=samples, y=labels)
    weights, bias = myperceptron.train()
    # Use a lowercase name for the instance so the Picture class itself is
    # not rebound to one of its own instances.
    picture = Picture(samples, labels, weights, bias)
    picture.Show()

2. 线性不可分

经过测试，针对该数据集，迭代次数=10000时，效果较好。

#!/usr/bin/python3
# -*- coding: utf-8 -*-

"""
Description :
    pocket algorithm
"""

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the training set from data2.csv: the first two columns are the
# features and the third column is the class label.
# NOTE(review): the update rule multiplies by the label, so labels are
# presumably +1 / -1 -- confirm against data2.csv.
df = pd.read_csv('data2.csv', names=['x', 'y', 'label'])
data = df[['x', 'y']].values
label = df['label'].tolist()
label_arr = np.array(label)  # array view of the labels for vectorised math


def _evaluate(samples, targets, weights, bias):
    """Score one candidate model on the whole training set.

    Returns a ``(margins, wrong, loss)`` tuple where ``margins[i]`` is
    ``targets[i] * (weights . samples[i] + bias)``, ``wrong`` is the list of
    indices whose margin is <= 0, and ``loss`` is the sum over those indices
    of ``-margin / ||weights||``.
    """
    margins = (np.dot(samples, weights) + bias) * targets
    wrong = np.where(margins <= 0)[0].tolist()
    norm = np.sqrt(np.sum(weights ** 2))
    if not wrong:
        loss = 0.0
    elif norm == 0:
        # The distance to the hyperplane is undefined for a zero weight vector.
        loss = float('inf')
    else:
        loss = float(-np.sum(margins[wrong]) / norm)
    return margins, wrong, loss


# Initialize w, b, alpha
w = np.array([0.5, 1])
b = 0
alpha = 0.4
# One loss value per evaluated model (the initial model, then one per
# iteration), so the loss curve below really is indexed by iteration.
trainLoss = []

f, idx, train_loss = _evaluate(data, label_arr, w, b)
trainLoss.append(train_loss)

# The "pocket": the model with the fewest misclassified samples seen so far.
best_w, best_b, best_errors = w.copy(), b, len(idx)

# iteration
max_iter = 10000
iteration = 1
start = time.time()
while iteration <= max_iter and idx:
    # Move the boundary towards every currently misclassified sample, using
    # each sample's own label (the old loop reset its counter on every pass
    # and therefore always used the label of the first misclassified sample).
    w += alpha * np.dot(label_arr[idx], data[idx])
    b += alpha * np.sum(label_arr[idx])
    print('Iteration:%d  w:%s  b:%s' % (iteration, w, b))
    f, idx, train_loss = _evaluate(data, label_arr, w, b)
    trainLoss.append(train_loss)
    if len(idx) < best_errors:
        best_w, best_b, best_errors = w.copy(), b, len(idx)
    iteration = iteration + 1
end = time.time()

# Report and plot the pocketed model rather than whatever the last update
# happened to produce.
w, b = best_w, best_b
# Accuracy is the share of correctly classified samples (the old expression
# reported the share of misclassified ones).
accuracy = (len(label) - best_errors) / len(label) * 100

print('Pocket learning algorithm is over')
print('train time is %f s.' % (end - start))
print('-'*50)
print('min trainLoss: %f' % np.min(trainLoss))
print('Classification accuracy: %.2f%%' % accuracy)

# draw
plt.figure(1)
plt.title('Pocket learning algorithm', size=14)
plt.xlabel('x0-axis', size=14)
plt.ylabel('x1-axis', size=14)

# Boundary w[0]*x0 + w[1]*x1 + b = 0, solved for x1.
xData = np.linspace(5, 8, 100)
yData = (w[0] * xData + b) / (-w[1])
plt.plot(xData, yData, color='r', label='sample data')

# One scatter call per class so each class gets a single colour; samples
# labelled -1 use the 'x' marker.
is_negative = label_arr == -1
plt.scatter(data[~is_negative, 0], data[~is_negative, 1], s=15)
plt.scatter(data[is_negative, 0], data[is_negative, 1], s=15, marker='x')
plt.savefig('PLA(PocketLearning_1.png', dpi=175)


plt.figure()
plt.plot(trainLoss)
plt.ylabel('trainLoss')
plt.xlabel('Iteration')
plt.savefig('PLA(PocketLearning_2.png', dpi=175)
plt.show()

相关阅读:
数组优化 Dijkstra 最短路
 F
树 (p155, 从中序和后续回复二叉树)
矩阵连乘 LRJ白书 p141 栈解析表达式
 Train Problem II HDU 1023 卡特兰数
 codevs 1166 矩阵取数游戏
 BZOJ 2754: [SCOI2012]喵星球上的点名
 2017.6.11 校内模拟赛
 HDU 2896 病毒侵袭
 UvaLive 4670 Dominating Patterns
原文地址：https://www.cnblogs.com/marathoner/p/9772500.html