• Spark机器学习读书笔记-CH03


    3.1.获取数据:

    wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

     3.2.探索与可视化数据:

    In [3]: user_data=sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.user")

    In [4]: user_data.first()

    Out[4]: u'1|24|M|technician|85711'

    In [5]: user_fields=user_data.map(lambda line: line.split("|"))

    In [8]: num_users = user_fields.map(lambda fields: fields[0]).count()

    In [10]: num_genders=user_fields.map(lambda fields: fields[2]).distinct().count()

    In [11]: num_occupations=user_fields.map(lambda fields: fields[3]).distinct().count()

    In [12]: num_zIpcodes=user_fields.map(lambda fields: fields[4]).distinct().count()

    In [16]: print "Users: %d, genders: %d, occupations: %d, zip codes: %d" %(num_users, num_genders, num_occupations, num_zipcodes)
    Users: 943, genders: 2, occupations: 21, zip codes: 795

    In [17]: ages = user_fields.map(lambda x: int(x[1])).collect()

    In [18]: hist(ages, bins=20, color='lightblue', normed=True)
    Out[18]:
    (array([ 0.00064269, 0.00192808, 0.00449886, 0.0279572 , 0.02956393,
    0.03374144, 0.04563129, 0.02538642, 0.02088756, 0.01863813,
    0.02088756, 0.01606735, 0.0170314 , 0.01863813, 0.00674829,
    0.00482021, 0.0054629 , 0.00192808, 0.00128539, 0.00128539]),
    array([ 7. , 10.3, 13.6, 16.9, 20.2, 23.5, 26.8, 30.1, 33.4,
    36.7, 40. , 43.3, 46.6, 49.9, 53.2, 56.5, 59.8, 63.1,
    66.4, 69.7, 73. ]),
    <a list of 20 Patch objects>)

    n [19]: fig = matplotlib.pyplot.gcf()

    In [20]: fig.set_size_inches(16, 10)

    In [23]: count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()

    In [24]: import numpy as np

    In [25]: x_axis1 = np.array([c[0] for c in count_by_occupation])

    In [26]: y_axis1 = np.array([c[1] for c in count_by_occupation])

    In [27]: x_axis = x_axis1[np.argsort(x_axis1)]

    In [28]: y_axis = y_axis1[np.argsort(y_axis1)]

    In [29]: pos = np.arange(len(x_axis))

    In [30]: width = 1.0

    In [31]: ax = plt.axes()

    In [32]: ax.set_xticks(pos + (width / 2))
    Out[32]:
    [<matplotlib.axis.XTick at 0x7f1257bc6f50>,
    <matplotlib.axis.XTick at 0x7f1257bc6a10>,
    <matplotlib.axis.XTick at 0x7f1256fa2050>,
    <matplotlib.axis.XTick at 0x7f1256fa2910>,
    <matplotlib.axis.XTick at 0x7f1256fbe090>,
    <matplotlib.axis.XTick at 0x7f1256fbe7d0>,
    <matplotlib.axis.XTick at 0x7f1256fbef10>,
    <matplotlib.axis.XTick at 0x7f1256fc9690>,
    <matplotlib.axis.XTick at 0x7f1256fc9dd0>,
    <matplotlib.axis.XTick at 0x7f124e6033d0>,
    <matplotlib.axis.XTick at 0x7f1257b604d0>,
    <matplotlib.axis.XTick at 0x7f124e603c90>,
    <matplotlib.axis.XTick at 0x7f1257b602d0>,
    <matplotlib.axis.XTick at 0x7f1257b60d90>,
    <matplotlib.axis.XTick at 0x7f124e60f510>,
    <matplotlib.axis.XTick at 0x7f124e60fc50>,
    <matplotlib.axis.XTick at 0x7f124e6183d0>,
    <matplotlib.axis.XTick at 0x7f124e618b10>,
    <matplotlib.axis.XTick at 0x7f124e623290>,
    <matplotlib.axis.XTick at 0x7f124e6239d0>,
    <matplotlib.axis.XTick at 0x7f121c583150>]

    In [34]: ax.set_xticklabels(x_axis)
    Out[34]:
    [<matplotlib.text.Text at 0x7f1257bc6410>,
    <matplotlib.text.Text at 0x7f1257b68350>,
    <matplotlib.text.Text at 0x7f1256fa2790>,
    <matplotlib.text.Text at 0x7f1256fa2ed0>,
    <matplotlib.text.Text at 0x7f1256fbe650>,
    <matplotlib.text.Text at 0x7f1256fbed90>,
    <matplotlib.text.Text at 0x7f1256fc9510>,
    <matplotlib.text.Text at 0x7f1256fc9c50>,
    <matplotlib.text.Text at 0x7f1256fd23d0>,
    <matplotlib.text.Text at 0x7f1257c29ad0>,
    <matplotlib.text.Text at 0x7f124e603f10>,
    <matplotlib.text.Text at 0x7f1257b60510>,
    <matplotlib.text.Text at 0x7f1257b60c10>,
    <matplotlib.text.Text at 0x7f124e60f390>,
    <matplotlib.text.Text at 0x7f124e60fad0>,
    <matplotlib.text.Text at 0x7f124e618250>,
    <matplotlib.text.Text at 0x7f124e618990>,
    <matplotlib.text.Text at 0x7f124e623110>,
    <matplotlib.text.Text at 0x7f124e623850>,
    <matplotlib.text.Text at 0x7f124e623f90>,
    <matplotlib.text.Text at 0x7f121c583710>]

    In [35]: plt.bar(pos, y_axis, width, color='lightblue')
    Out[35]: <Container object of 21 artists>

    In [36]: plt.xticks(rotation=30)
    Out[36]:
    (array([ 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5,
    9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5, 16.5, 17.5,

    18.5, 19.5, 20.5]), <a list of 21 Text xticklabel objects>)

    In [37]: fig = matplotlib.pyplot.gcf()

    In [38]: fig.set_size_inches(16, 10)

    In [39]: count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()

    In [46]: print "Map-reduce approach: "
    Map-reduce approach:

    In [47]: print dict(count_by_occupation)
    {u'administrator': 79, u'writer': 45, u'retired': 14, u'lawyer': 12, u'doctor': 7, u'marketing': 26, u'executive': 32, u'none': 9, u'entertainment': 18, u'healthcare': 16, u'scientist': 31, u'student': 196, u'educator': 95, u'technician': 27, u'librarian': 51, u'programmer': 66, u'artist': 28, u'salesman': 12, u'other': 105, u'homemaker': 7, u'engineer': 67}

    In [48]: print ""


    In [49]: print "countByValue approach:"
    countByValue approach:

    In [50]: print dict(count_by_occupation2)
    {u'administrator': 79, u'retired': 14, u'lawyer': 12, u'healthcare': 16, u'marketing': 26, u'executive': 32, u'scientist': 31, u'student': 196, u'technician': 27, u'librarian': 51, u'programmer': 66, u'salesman': 12, u'homemaker': 7, u'engineer': 67, u'none': 9, u'doctor': 7, u'writer': 45, u'entertainment': 18, u'other': 105, u'educator': 95, u'artist': 28}

    In [51]: movie_data=sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.item")

    In [52]: print movie_data.first()
    1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

    In [53]: num_movies = movie_data.count()

    In [54]: print "Movies: %d " % num_movies
    Movies: 1682

    In [51]: movie_data=sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.item")

    In [52]: print movie_data.first()
    1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

    In [53]: num_movies = movie_data.count()

    In [54]: print "Movies: %d " % num_movies
    Movies: 1682

    In [55]: def convert_year(x):
    ....: try:
    ....: return int(x[-4:])
    ....: except:
    ....: return 1990
    ....:

    In [56]: movie_fields = movie_data.map(lambda lines: lines.split("|"))

    In [57]: years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))

    In [58]: years_filtered = years.filter(lambda x: x != 1900)

    In [59]: movie_ages = years_filtered.map(lambda yr: 1998 - yr).countByValue()

    In [60]: values = movie_ages.values()

    In [61]: bins = movie_ages.keys()

    In [62]: hist(values, bins=bins, color='lightblue', normed=True)
    Out[62]:
    (array([ 0. , 0.07575758, 0.09090909, 0.09090909, 0.18181818,
    0.18181818, 0.04545455, 0.07575758, 0.07575758, 0.03030303,
    0. , 0.01515152, 0.01515152, 0.03030303, 0. ,
    0.03030303, 0. , 0. , 0. , 0. ,
    0. , 0. , 0.01515152, 0. , 0. ,
    0.01515152, 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0.01515152, 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0.01515152, 0. , 0. , 0. , 0. ]),
    array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
    34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
    68, 72, 76]),
    <a list of 70 Patch objects>)

    In [63]: fig = matplotlib.pyplot.gcf()

    In [64]: fig.set_size_inches(16, 10)

    In [65]: rating_data = sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.data")

    In [66]: print rating_data.first()
    196 242 3 881250949

    In [67]: num_ratings = rating_data.count()

    In [68]: print "Ratings: %d " % num_ratings
    Ratings: 100000

    In [76]: rating_data = rating_data.map(lambda line: line.split(" "))

    In [77]: ratings = rating_data.map(lambda fields: int(fields[2]))

    In [78]: max_rating = ratings.reduce(lambda x, y: max(x, y))

    In [79]: min_rating = ratings.reduce(lambda x, y: min(x, y))

    In [80]: mean_rating = ratings.reduce(lambda x, y: x + y)/num_ratings

    In [81]: median_rating = np.median(ratings.collect())

    In [82]: ratings_per_uer = num_ratings / num_users

    In [76]: rating_data = rating_data.map(lambda line: line.split(" "))

    In [77]: ratings = rating_data.map(lambda fields: int(fields[2]))

    In [78]: max_rating = ratings.reduce(lambda x, y: max(x, y))

    In [79]: min_rating = ratings.reduce(lambda x, y: min(x, y))

    In [80]: mean_rating = ratings.reduce(lambda x, y: x + y)/num_ratings

    In [81]: median_rating = np.median(ratings.collect())

    In [82]: ratings_per_uer = num_ratings / num_users

    In [83]: ratings_per_movie = num_ratings / num_movies

    In [84]: print "Min ratings: %d" % min_rating
    Min ratings: 1

    In [85]: print "Max ratings: %d" % max_rating
    Max ratings: 5

    In [86]: print "Average rating: %2.2f" % mean_rating
    Average rating: 3.00

    In [87]: print "Median rating: %d" % mean_rating
    Median rating: 3

    In [88]: print "Average # of ratings per user: %2.2f" % ratings_per_uer
    Average # of ratings per user: 106.00

    In [89]: print "Average # of ratings per movie: %2.2f" % ratings_per_movie
    Average # of ratings per movie: 59.00


    In [90]: ratings.stats()
    Out[90]: (count: 100000, mean: 3.52986, stdev: 1.12566797076, max: 5.0, min: 1.0)

    In [91]: count_by_rating = ratings.countByValue()

    In [92]: x_axis = np.array(count_by_rating.keys())

    In [93]: y_axis = np.array([float(c) for c in count_by_rating.values()])

    In [94]: y_axis_normed = y_axis / y_axis.sum()

    In [95]: pos = np.arange(len(x_axis))

    In [96]: width = 1.0

    In [97]: ax = plt.axes()

    In [98]: ax.set_xticks(pos + (width / 2))
    Out[98]:
    [<matplotlib.axis.XTick at 0x7f121c371250>,
    <matplotlib.axis.XTick at 0x7f121c360d90>,
    <matplotlib.axis.XTick at 0x7f121c2e0e10>,
    <matplotlib.axis.XTick at 0x7f121c2df5d0>,
    <matplotlib.axis.XTick at 0x7f121c2dfd10>]

    In [99]: ax.set_xticklabels(x_axis)
    Out[99]:
    [<matplotlib.text.Text at 0x7f121c290ed0>,
    <matplotlib.text.Text at 0x7f121c298c90>,
    <matplotlib.text.Text at 0x7f121c2df450>,
    <matplotlib.text.Text at 0x7f121c2dfb90>,
    <matplotlib.text.Text at 0x7f121c2fd310>]

    In [100]:

    In [100]: plt.bar(pos, y_axis_normed, width, color='lightblue')
    Out[100]: <Container object of 5 artists>

    In [101]: plt.xticks(rotation=30)
    Out[101]: (array([ 0.5, 1.5, 2.5, 3.5, 4.5]), <a list of 5 Text xticklabel objects>)

    In [102]: fig = matplotlib.pyplot.gcf()

    In [103]: fig.set_size_inches(16, 10)

    In [104]: user_ratings_grouped = rating_data.map(lambda fields: (int(fields[0]), int(fields[2]))).groupByKey()

    In [105]: user_ratings_by_user = user_ratings_grouped.map(lambda (k, v): (k, len(v)))

    In [106]: user_ratings_by_user.take(5)
    Out[106]: [(2, 62), (4, 24), (6, 211), (8, 59), (10, 184)]

    In [107]: user_ratings_by_user_local = user_ratings_by_user.map(lambda (k, v): v).collect()

    In [108]: hist(user_ratings_by_user_local, bins=200, color='lightblue', normed=True)
    Out[108]:
    (array([ 0.02958007, 0.02129765, 0.01212783, 0.01212783, 0.00798662,
    0.00946562, 0.00916982, 0.00739502, 0.00769082, 0.00621181,
    0.00887402, 0.00532441, 0.00562021, 0.00414121, 0.00384541,
    0.00532441, 0.00236641, 0.00354961, 0.0017748 , 0.0017748 ,
    0.00295801, 0.00266221, 0.00325381, 0.00414121, 0.00414121,
    0.00266221, 0.0017748 , 0.00236641, 0.00266221, 0.00295801,
    0.0020706 , 0.0020706 , 0.00354961, 0.0017748 , 0.00236641,
    0.00384541, 0.0017748 , 0.00295801, 0.001479 , 0.00266221,
    0.0011832 , 0.001479 , 0.0017748 , 0.0008874 , 0.001479 ,
    0.00236641, 0.0020706 , 0.001479 , 0.0008874 , 0.001479 ,
    0.0008874 , 0.0020706 , 0.0011832 , 0.0008874 , 0.0020706 ,
    0.0002958 , 0.0017748 , 0.0011832 , 0.0011832 , 0.0017748 ,
    0.001479 , 0.0011832 , 0.0008874 , 0.0002958 , 0.0005916 ,
    0.0002958 , 0.0008874 , 0.0008874 , 0.0002958 , 0.0008874 ,
    0.0017748 , 0.001479 , 0.0008874 , 0.0008874 , 0.0005916 ,
    0. , 0.0011832 , 0.0002958 , 0.0002958 , 0.0011832 ,
    0.0002958 , 0.0005916 , 0.0005916 , 0.0005916 , 0.0005916 ,
    0.0008874 , 0. , 0.0008874 , 0. , 0.0002958 ,
    0. , 0. , 0.0002958 , 0. , 0.0011832 ,
    0.0002958 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
    0.0005916 , 0. , 0.0011832 , 0. , 0. ,
    0.0008874 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
    0. , 0. , 0. , 0. , 0. ,
    0.0005916 , 0. , 0. , 0. , 0.0002958 ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0.0002958 , 0.0002958 ,
    0. , 0.0005916 , 0. , 0. , 0. ,
    0. , 0. , 0. , 0.0002958 , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0.0002958 , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0.0002958 , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0.0002958 , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0.0002958 ]),
    array([ 20. , 23.585, 27.17 , 30.755, 34.34 , 37.925,
    41.51 , 45.095, 48.68 , 52.265, 55.85 , 59.435,
    63.02 , 66.605, 70.19 , 73.775, 77.36 , 80.945,
    84.53 , 88.115, 91.7 , 95.285, 98.87 , 102.455,
    106.04 , 109.625, 113.21 , 116.795, 120.38 , 123.965,
    127.55 , 131.135, 134.72 , 138.305, 141.89 , 145.475,
    149.06 , 152.645, 156.23 , 159.815, 163.4 , 166.985,
    170.57 , 174.155, 177.74 , 181.325, 184.91 , 188.495,
    192.08 , 195.665, 199.25 , 202.835, 206.42 , 210.005,
    213.59 , 217.175, 220.76 , 224.345, 227.93 , 231.515,
    235.1 , 238.685, 242.27 , 245.855, 249.44 , 253.025,
    256.61 , 260.195, 263.78 , 267.365, 270.95 , 274.535,
    278.12 , 281.705, 285.29 , 288.875, 292.46 , 296.045,
    299.63 , 303.215, 306.8 , 310.385, 313.97 , 317.555,
    321.14 , 324.725, 328.31 , 331.895, 335.48 , 339.065,
    342.65 , 346.235, 349.82 , 353.405, 356.99 , 360.575,
    364.16 , 367.745, 371.33 , 374.915, 378.5 , 382.085,
    385.67 , 389.255, 392.84 , 396.425, 400.01 , 403.595,
    407.18 , 410.765, 414.35 , 417.935, 421.52 , 425.105,
    428.69 , 432.275, 435.86 , 439.445, 443.03 , 446.615,
    450.2 , 453.785, 457.37 , 460.955, 464.54 , 468.125,
    471.71 , 475.295, 478.88 , 482.465, 486.05 , 489.635,
    493.22 , 496.805, 500.39 , 503.975, 507.56 , 511.145,
    514.73 , 518.315, 521.9 , 525.485, 529.07 , 532.655,
    536.24 , 539.825, 543.41 , 546.995, 550.58 , 554.165,
    557.75 , 561.335, 564.92 , 568.505, 572.09 , 575.675,
    579.26 , 582.845, 586.43 , 590.015, 593.6 , 597.185,
    600.77 , 604.355, 607.94 , 611.525, 615.11 , 618.695,
    622.28 , 625.865, 629.45 , 633.035, 636.62 , 640.205,
    643.79 , 647.375, 650.96 , 654.545, 658.13 , 661.715,
    665.3 , 668.885, 672.47 , 676.055, 679.64 , 683.225,
    686.81 , 690.395, 693.98 , 697.565, 701.15 , 704.735,
    708.32 , 711.905, 715.49 , 719.075, 722.66 , 726.245,
    729.83 , 733.415, 737. ]),
    <a list of 200 Patch objects>)

    In [109]: fig = matplotlib.pyplot.gcf()

    In [110]: fig.set_size_inches(16, 10)

    In [111]: hist(user_ratings_by_user_local, bins=200, color='lightblue', normed=True)
    Out[111]:
    (array([ 0.02958007, 0.02129765, 0.01212783, 0.01212783, 0.00798662,
    0.00946562, 0.00916982, 0.00739502, 0.00769082, 0.00621181,
    0.00887402, 0.00532441, 0.00562021, 0.00414121, 0.00384541,
    0.00532441, 0.00236641, 0.00354961, 0.0017748 , 0.0017748 ,
    0.00295801, 0.00266221, 0.00325381, 0.00414121, 0.00414121,
    0.00266221, 0.0017748 , 0.00236641, 0.00266221, 0.00295801,
    0.0020706 , 0.0020706 , 0.00354961, 0.0017748 , 0.00236641,
    0.00384541, 0.0017748 , 0.00295801, 0.001479 , 0.00266221,
    0.0011832 , 0.001479 , 0.0017748 , 0.0008874 , 0.001479 ,
    0.00236641, 0.0020706 , 0.001479 , 0.0008874 , 0.001479 ,
    0.0008874 , 0.0020706 , 0.0011832 , 0.0008874 , 0.0020706 ,
    0.0002958 , 0.0017748 , 0.0011832 , 0.0011832 , 0.0017748 ,
    0.001479 , 0.0011832 , 0.0008874 , 0.0002958 , 0.0005916 ,
    0.0002958 , 0.0008874 , 0.0008874 , 0.0002958 , 0.0008874 ,
    0.0017748 , 0.001479 , 0.0008874 , 0.0008874 , 0.0005916 ,
    0. , 0.0011832 , 0.0002958 , 0.0002958 , 0.0011832 ,
    0.0002958 , 0.0005916 , 0.0005916 , 0.0005916 , 0.0005916 ,
    0.0008874 , 0. , 0.0008874 , 0. , 0.0002958 ,
    0. , 0. , 0.0002958 , 0. , 0.0011832 ,
    0.0002958 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
    0.0005916 , 0. , 0.0011832 , 0. , 0. ,
    0.0008874 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
    0. , 0. , 0. , 0. , 0. ,
    0.0005916 , 0. , 0. , 0. , 0.0002958 ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0.0002958 , 0.0002958 ,
    0. , 0.0005916 , 0. , 0. , 0. ,
    0. , 0. , 0. , 0.0002958 , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0.0002958 , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0.0002958 , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0.0002958 , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0. ,
    0. , 0. , 0. , 0. , 0.0002958 ]),
    array([ 20. , 23.585, 27.17 , 30.755, 34.34 , 37.925,
    41.51 , 45.095, 48.68 , 52.265, 55.85 , 59.435,
    63.02 , 66.605, 70.19 , 73.775, 77.36 , 80.945,
    84.53 , 88.115, 91.7 , 95.285, 98.87 , 102.455,
    106.04 , 109.625, 113.21 , 116.795, 120.38 , 123.965,
    127.55 , 131.135, 134.72 , 138.305, 141.89 , 145.475,
    149.06 , 152.645, 156.23 , 159.815, 163.4 , 166.985,
    170.57 , 174.155, 177.74 , 181.325, 184.91 , 188.495,
    192.08 , 195.665, 199.25 , 202.835, 206.42 , 210.005,
    213.59 , 217.175, 220.76 , 224.345, 227.93 , 231.515,
    235.1 , 238.685, 242.27 , 245.855, 249.44 , 253.025,
    256.61 , 260.195, 263.78 , 267.365, 270.95 , 274.535,
    278.12 , 281.705, 285.29 , 288.875, 292.46 , 296.045,
    299.63 , 303.215, 306.8 , 310.385, 313.97 , 317.555,
    321.14 , 324.725, 328.31 , 331.895, 335.48 , 339.065,
    342.65 , 346.235, 349.82 , 353.405, 356.99 , 360.575,
    364.16 , 367.745, 371.33 , 374.915, 378.5 , 382.085,
    385.67 , 389.255, 392.84 , 396.425, 400.01 , 403.595,
    407.18 , 410.765, 414.35 , 417.935, 421.52 , 425.105,
    428.69 , 432.275, 435.86 , 439.445, 443.03 , 446.615,
    450.2 , 453.785, 457.37 , 460.955, 464.54 , 468.125,
    471.71 , 475.295, 478.88 , 482.465, 486.05 , 489.635,
    493.22 , 496.805, 500.39 , 503.975, 507.56 , 511.145,
    514.73 , 518.315, 521.9 , 525.485, 529.07 , 532.655,
    536.24 , 539.825, 543.41 , 546.995, 550.58 , 554.165,
    557.75 , 561.335, 564.92 , 568.505, 572.09 , 575.675,
    579.26 , 582.845, 586.43 , 590.015, 593.6 , 597.185,
    600.77 , 604.355, 607.94 , 611.525, 615.11 , 618.695,
    622.28 , 625.865, 629.45 , 633.035, 636.62 , 640.205,
    643.79 , 647.375, 650.96 , 654.545, 658.13 , 661.715,
    665.3 , 668.885, 672.47 , 676.055, 679.64 , 683.225,
    686.81 , 690.395, 693.98 , 697.565, 701.15 , 704.735,
    708.32 , 711.905, 715.49 , 719.075, 722.66 , 726.245,
    729.83 , 733.415, 737. ]),
    <a list of 200 Patch objects>)

    3.3. 处理与转换数据;

    In [112]: years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()

    In [113]: years_pre_processed_array = np.array(years_pre_processed)

    In [114]: mean_year = np.mean(years_pre_processed_array[years_pre_processed_array != 1900])

    In [115]: median_year = np.median(years_pre_processed_array[years_pre_processed_array != 1900])

    In [122]: index_bad_data = np.where(years_pre_processed_array == 1900)[0]

    In [123]: index_bad_data
    Out[123]: array([], dtype=int64)

    In [124]: years_pre_processed_array[index_bad_data] = median_year

    In [125]: print "Mean year of release: %d" % mean_year
    Mean year of release: 1989

    In [126]: print "Median year of release: %d" % median_year
    Median year of release: 1995

    In [130]: print "Index of '1900' after assigning median: %s" % np.where(years_pre_processed_array == 1900)[0]
    Index of '1900' after assigning median: []

    3.4.从数据中提取有用特征:

    In [131]: all_occupations = user_fields.map(lambda fields: fields[3]).distinct().collect()

    In [132]: all_occupations.sort()

    In [133]:

    In [133]: idx = 0

    In [134]: all_occupations_dict = {}

    In [135]: for o in all_occupations:
    .....: all_occupations_dict[o] = idx
    .....: idx += 1
    .....:

    In [136]: print "Encoding of 'doctor': %d" %all_occupations_dict['doctor']
    Encoding of 'doctor': 2

    In [137]: print "Encoding of 'programmer': %d" %all_occupations_dict['programmer']
    Encoding of 'programmer': 14

    In [139]: k = len(all_occupations_dict)

    In [140]: binary_x = np.zeros(k)

    In [141]: k_programmer = all_occupations_dict['programmer']

    In [142]: binary_x[k_programmer] = 1

    In [143]: print "Binary feature vector: %s" %binary_x
    Binary feature vector: [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
    0. 0. 0.]

    In [144]: print "Length of binary vector: %d" %k
    Length of binary vector: 21

    In [145]: def extract_datetime(ts):
    .....: import datetime
    .....: return datetime.datetime.fromtimestamp(ts)
    .....:

    In [149]: timestamps = rating_data.map(lambda fields: int(fields[3]))

    In [150]: hour_of_day = timestamps.map(lambda ts: extract_datetime(ts).hour)

    In [151]: hour_of_day.take(5)
    Out[151]: [23, 3, 15, 13, 13]

    In [154]: def assign_tod(hr):
    .....: times_of_day = {
    .....: 'morning' : range(7, 12),
    .....: 'lunch' : range(12, 14),
    .....: 'afternoon' : range(14, 18),
    .....: 'evening' : range(18, 23),
    .....: 'night' : range(23, 7)
    .....: }
    .....: for k, v in times_of_day.iteritems():
    .....: if hr in v:
    .....: return k
    .....:

    In [166]: def assign_tod(hr):
    .....: times_of_day = {
    .....: 'morning' : range(7, 12),
    .....: 'lunch' : range(12, 14),
    .....: 'afternoon' : range(14, 18),
    .....: 'evening' : range(18, 23),
    .....: 'night' : range(23, 24) + range(0, 7)
    .....: }
    .....: for k, v in times_of_day.iteritems():
    .....: if hr in v:
    .....: return k
    .....:

    In [167]:

    In [167]: time_of_day = hour_of_day.map(lambda hr: assign_tod(hr))

    In [168]: time_of_day.take(5)
    Out[168]: ['night', 'night', 'afternoon', 'lunch', 'lunch']

    In [170]: def extract_titile(raw):
    .....: import re
    .....: grps = re.search("((w+))", raw)
    .....: if grps:
    .....: return raw[:grps.start()].strip()
    .....: else:
    .....: return raw
    .....:

    In [171]: raw_titles = movie_fields.map(lambda fields: fields[1])

    In [172]: for raw_title in raw_titles.take(5):
    .....: print extract_titile(raw_title)
    .....:
    Toy Story
    GoldenEye
    Four Rooms
    Get Shorty
    Copycat

    In [173]: movie_titles = raw_titles.map(lambda m: extract_titile(m))

    In [174]: title_terms = movie_titles.map(lambda t: t.split(" "))

    In [175]: print title_terms.take(5)
    [[u'Toy', u'Story'], [u'GoldenEye'], [u'Four', u'Rooms'], [u'Get', u'Shorty'], [u'Copycat']]

    In [176]: all_terms = title_terms.flatMap(lambda x: x).distinct().collect()

    In [177]: idx = 0

    In [178]: all_terms_dict = {}

    In [179]: for term in all_terms:
    .....: all_occupations_dict[term] = idx
    .....: idx += 1
    .....:

    In [180]: print "Total number of terms: %d" % len(all_terms_dict)
    Total number of terms: 0

    In [181]: print "Index of term 'Dead': %d" % all_occupations_dict['Dead']
    Index of term 'Dead': 147

    In [182]: print "Index of term 'Rooms': %d" % all_occupations_dict['Rooms']
    Index of term 'Rooms': 1963

    In [184]: %paste

    def create_vector(terms, term_dict):
    from scipy import sparse as sp
    num_terms = len(term_dict)
    x = sp.csc_matrix((1, num_terms))
    for t in terms:
    if t in term_dict:
    idx = term_dict[t]
    x[0, idx] = 1
    return x
    ## -- End pasted text --

    In [185]:

    In [185]: all_terms_bcast = sc.broadcast(all_terms_dict)

    In [186]: term_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value))

    In [187]: term_vectors.take(5)
    Out[187]:
    [<1x0 sparse matrix of type '<type 'numpy.float64'>'
    with 0 stored elements in Compressed Sparse Column format>,
    <1x0 sparse matrix of type '<type 'numpy.float64'>'
    with 0 stored elements in Compressed Sparse Column format>,
    <1x0 sparse matrix of type '<type 'numpy.float64'>'
    with 0 stored elements in Compressed Sparse Column format>,
    <1x0 sparse matrix of type '<type 'numpy.float64'>'
    with 0 stored elements in Compressed Sparse Column format>,
    <1x0 sparse matrix of type '<type 'numpy.float64'>'
    with 0 stored elements in Compressed Sparse Column format>]

    In [188]: np.random.seed(42)

    In [189]: x = np.random.randn(10)

    In [190]: norm_x_2 = np.linalg.norm(x)

    In [191]: normalized_x = x /norm_x_2

    In [192]: print "x: %s" % x
    x:
    [ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
    1.57921282 0.76743473 -0.46947439 0.54256004]

    In [193]: print "Normalized x: %s" % normalized_x
    Normalized x: 
    [ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
    0.60954584 0.29621508 -0.1812081 0.20941776]


    In [194]: print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)
    2-Norm of normalized_x: 1.0000

    In [199]: vector = sc.parallelize([x])

    In [200]: from pyspark.mllib.feature import Normalizer

    In [201]: normalizer = Normalizer()

    In [202]: vector = sc.parallelize([x])

    In [203]: normalized_x_mllib = normalizer.transform(vector).first().toArray()

    In [204]: print "x: %s" % x
    x:
    [ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
    1.57921282 0.76743473 -0.46947439 0.54256004]

    In [205]: print "2-Norm of x: %2.4f" % norm_x_2
    2-Norm of x: 2.5908

    In [206]: print "Normalized x MLlib: %s" % normalized_x_mllib
    Normalized x MLlib:
    [ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
    0.60954584 0.29621508 -0.1812081 0.20941776]

    In [207]: print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)
    2-Norm of normalized_x_mllib: 1.0000

  • 相关阅读:
    java 可变參数列表
    Java -Xms -Xmx -Xss -XX:MaxNewSize -XX:MaxPermSize含义记录
    hdu 4939
    什么是堆和栈,它们在哪儿?
    PPAPI插件与浏览器的通信
    Java&amp;Xml教程(十一)JAXB实现XML与Java对象转换
    Heavy Transportation
    Python学习笔记-小记
    C/C++知识要点5——智能指针原理及自己定义实现
    小米2S电池电量用尽充电无法开机解决方法
  • 原文地址:https://www.cnblogs.com/littlesuccess/p/5155484.html
Copyright © 2020-2023  润新知