• 111


    数据分析 (电影数据)

    import pandas as pd
    
    uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
    fuser  = '//home//yunpiao//data/1M//users.dat'
    fmovie = '/home/yunpiao/data/1M/movies.dat'
    fratings = '/home/yunpiao/data/1M/ratings.dat'
    
    pusers = pd.read_table(fuser, sep='::', header=None, names=uname, engine='python')
    uname = ['user_id','movie_id', 'rating', 'timestamp']
    prating = pd.read_table(fratings, sep='::', header=None, names=uname, engine='python')
    uname = ['movie_id', 'title', 'genres']
    %timeit pmovie = pd.read_table(fmovie, sep='::', header=None, names=uname,engine='python')
    
    
    100 loops, best of 3: 11.5 ms per loop
    

    切片

    # Show the first five user rows.
    pusers.head()
    
    user_id gender age occupation zip
    0 1 F 1 10 48067
    1 2 M 56 16 70072
    2 3 M 25 15 55117
    3 4 M 45 7 02460
    4 5 M 25 20 55455
    # Show the first five rating rows.
    prating.head()
    
    user_id movie_id rating timestamp
    0 1 1193 5 978300760
    1 1 661 3 978302109
    2 1 914 3 978301968
    3 1 3408 4 978300275
    4 1 2355 5 978824291
    # Positional row slice: rows 1, 5 and 9 (start 1, stop 10, step 4).
    pmovie.iloc[1:10:4]
    
    movie_id title genres
    1 2 Jumanji (1995) Adventure|Children's|Fantasy
    5 6 Heat (1995) Action|Crime|Thriller
    9 10 GoldenEye (1995) Action|Adventure|Thriller
    # Join ratings with users (common column: user_id), then join the result
    # with movies (common column: movie_id). pd.merge joins on the columns
    # the two frames share when no key is given.
    data = pd.merge(pd.merge(prating, pusers), pmovie)
    # .ix was removed in pandas 1.0. The merged frame has the default integer
    # index, so the label-based .loc[6] selects the same row.
    print(data.loc[6])
    
    user_id                                           19
    movie_id                                        1193
    rating                                             5
    timestamp                                  982730936
    gender                                             M
    age                                                1
    occupation                                        10
    zip                                            48073
    title         One Flew Over the Cuckoo's Nest (1975)
    genres                                         Drama
    Name: 6, dtype: object
    
    # Mean rating per title, with one column for each gender value.
    mean_ratings = data.pivot_table(
        values='rating',
        index='title',
        columns='gender',
        aggfunc='mean',
    )
    mean_ratings.head(5)
    
    gender F M
    title
    $1,000,000 Duck (1971) 3.375000 2.761905
    'Night Mother (1986) 3.388889 3.352941
    'Til There Was You (1997) 2.675676 2.733333
    'burbs, The (1989) 2.793478 2.962085
    ...And Justice for All (1979) 3.828571 3.689024
    # Number of rating rows recorded for each title.
    rating_by_title = data.groupby(by='title').size()
    rating_by_title.iloc[:4]
    
    title
    $1,000,000 Duck (1971)        37
    'Night Mother (1986)          70
    'Til There Was You (1997)     52
    'burbs, The (1989)           303
    dtype: int64
    
    # Keep only the titles that received at least 250 ratings.
    has_enough_ratings = rating_by_title >= 250
    active_title = rating_by_title[has_enough_ratings].index
    print(active_title)
    
    Index([u"'burbs, The (1989)", u'10 Things I Hate About You (1999)',
           u'101 Dalmatians (1961)', u'101 Dalmatians (1996)',
           u'12 Angry Men (1957)', u'13th Warrior, The (1999)',
           u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)',
           u'2001: A Space Odyssey (1968)', u'2010 (1984)',
           ...
           u'X-Men (2000)', u'Year of Living Dangerously (1982)',
           u'Yellow Submarine (1968)', u"You've Got Mail (1998)",
           u'Young Frankenstein (1974)', u'Young Guns (1988)',
           u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)',
           u'Zero Effect (1998)', u'eXistenZ (1999)'],
          dtype='object', name=u'title', length=1216)
    
    # Restrict the per-gender means to the titles in active_title.
    # .ix was removed in pandas 1.0; .loc does the same label-based selection.
    mean_ratings = mean_ratings.loc[active_title]
    mean_ratings[:3]
    
    gender F M
    title
    'burbs, The (1989) 2.793478 2.962085
    10 Things I Hate About You (1999) 3.646552 3.311966
    101 Dalmatians (1961) 3.791444 3.500000
    # Titles ordered by mean male rating, highest first. The variable name now
    # matches the 'M' column it is sorted by (it was the typo
    # "top_demale_ratings").
    top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
    top_male_ratings['M'][:3]
    
    title
    Godfather, The (1972)                                                  4.583333
    Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)    4.576628
    Shawshank Redemption, The (1994)                                       4.560625
    Name: M, dtype: float64
    
    # Male mean minus female mean: positive values mean men rated the title
    # higher on average.
    mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
    mean_ratings.head()
    
    gender F M diff
    title
    'burbs, The (1989) 2.793478 2.962085 0.168607
    10 Things I Hate About You (1999) 3.646552 3.311966 -0.334586
    101 Dalmatians (1961) 3.791444 3.500000 -0.291444
    101 Dalmatians (1996) 3.240000 2.911215 -0.328785
    12 Angry Men (1957) 4.184397 4.328421 0.144024
    # Titles with the largest male-minus-female gap come first.
    top_diff = mean_ratings.sort_values('diff', ascending=False)
    top_diff.head(4)
    
    gender F M diff
    title
    Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
    Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
    Dumb & Dumber (1994) 2.697987 3.336595 0.638608
    Longest Day, The (1962) 3.411765 4.031447 0.619682
    # Standard deviation of the ratings for each title.
    rating_std_by_title = data.groupby('title')['rating'].std()
    # Keep only the titles in active_title. .ix was removed in pandas 1.0;
    # .loc does the same label-based selection.
    rating_std_by_title = rating_std_by_title.loc[active_title]
    # The ten titles whose ratings vary the most.
    rating_std_by_title.sort_values(ascending=False)[:10]
    
    title
    Dumb & Dumber (1994)                     1.321333
    Blair Witch Project, The (1999)          1.316368
    Natural Born Killers (1994)              1.307198
    Tank Girl (1995)                         1.277695
    Rocky Horror Picture Show, The (1975)    1.260177
    Eyes Wide Shut (1999)                    1.259624
    Evita (1996)                             1.253631
    Billy Madison (1995)                     1.249970
    Fear and Loathing in Las Vegas (1998)    1.246408
    Bicentennial Man (1999)                  1.245533
    Name: rating, dtype: float64
    
  • 相关阅读:
    1. 两数之和
    RabbitMQ主题交换机
    RabbitMQ路由
    RabbitMQ发布订阅
    RabbitMQ简介和安装
    55. 跳跃游戏
    63. 不同路径 II
    62. 不同路径
    6. Z 字形变换
    sql注入攻击与防御第一章(笔记)
  • 原文地址:https://www.cnblogs.com/yunpiao111/p/5840226.html
Copyright © 2020-2023  润新知