1- 问题提出
2- 线性回归
3- 理论推导
4- Python/Spark实现
# -*- coding: utf-8 -*-
"""Batch gradient descent for linear regression on a Spark RDD.

Each input record is a whitespace-separated line of numbers. The last number
on a line is the observed target; the numbers before it are the features.
NOTE(review): with two parameters the first column is presumably a constant
1 acting as the intercept term -- confirm against linearRegression.txt.

The script runs unchanged on Python 2 and Python 3.
"""
from pyspark import SparkContext


# Model parameters. The helper functions below read this module-level list,
# so it is updated in place by the training loop.
theta = [0, 0]
# Step size applied to the summed gradient on every update.
alpha = 0.001

sc = SparkContext('local')


def func_theta_x(x):
    """Return the dot product of ``theta`` with the leading entries of ``x``.

    ``zip`` stops at the shorter sequence, so only the first ``len(theta)``
    values of ``x`` take part (the trailing target value is ignored).
    """
    return sum(t * v for t, v in zip(theta, x))


def cost(x):
    """Return the residual for one record: prediction minus ``x[-1]``."""
    return func_theta_x(x) - x[-1]


def partial_theta(x):
    """Return the residual multiplied by every entry of ``x`` but the last.

    For a squared-error loss this is the record's contribution to the
    gradient with respect to each parameter.
    """
    dif = cost(x)
    return [dif * v for v in x[:-1]]


# Parse every line into a real list of floats: the helpers index and slice
# the record, which a lazy Python 3 ``map`` object does not support.
# ``split()`` without an argument tolerates repeated spaces and tabs.
# The parsed RDD is cached because every iteration scans it again.
rdd = (
    sc.textFile('/home/freyr/linearRegression.txt')
    .map(lambda line: [float(v) for v in line.split()])
    .cache()
)

maxiter = 400
for n_iter in range(1, maxiter + 1):
    # Sum the per-record gradients element-wise across the whole data set.
    grad = (
        rdd.map(partial_theta)
        .reduce(lambda a, b: [p + q for p, q in zip(a, b)])
    )

    # Update in place so ``theta`` keeps its length; ``zip`` ignores any
    # gradient entries beyond the number of parameters.
    for k, (t, g) in enumerate(zip(theta, grad)):
        theta[k] = t - alpha * g

    # Converged once the L1 norm of the gradient just applied is small.
    if sum(abs(g) for g in grad) <= 0.01:
        print('I get it!!!')
        print('Iter = %s' % n_iter)
        print('Theta = %s' % theta)
        break
else:
    # The loop ran out of iterations without meeting the tolerance.
    print('Failed...')

sc.stop()
PS: 1. linearRegression.txt