• netflix数据处理2(转)


    原始数据:
    $head -10 mv_0006890.txt
    6890:
    1735266,1,2004-04-02
    1008399,1,2004-06-22
    2360117,2,2003-11-08
    1294425,2,2004-03-15
    439931,4,2004-03-27
    1583311,1,2004-03-11
    2431832,3,2005-02-13
    620771,2,2004-03-20
    1110906,1,2004-03-04

    结果数据(每行格式:user_id movie_id rating,以空格分隔):
    $head -10 ratings_0.txt
    499040 9419 3
    2071637 9419 4
    896780 9419 3
    2625420 9419 2
    652121 9419 3
    1003291 9419 4
    818736 9419 3
    332152 9419 2
    2174771 9419 4
    47411 9419 5


    import sys
    import os
    import re

    # When True, main() switches to a new ratings_<n>.txt once the current
    # output file grows past its size threshold; when False, every rating is
    # written to ratings_0.txt.
    CHUNK_FILES = True

    def mkdir(path):
        """Create directory ``path`` (and any missing parents) if it is absent.

        An already existing directory is left untouched, so the call can be
        repeated safely.

        Args:
            path: Directory to create.

        Raises:
            OSError: If the directory cannot be created, including the case
                where ``path`` exists but is not a directory.
        """
        try:
            os.makedirs(path)
        except OSError:
            # Attempting the creation and inspecting the failure avoids the
            # window between an exists() check and makedirs(). An existing
            # directory is the only failure that is acceptable here.
            if not os.path.isdir(path):
                raise

    def main(args):
        """Reformat the movie-title list and flatten per-movie rating files.

        Step 1: ``movie_titles.txt`` in the current working directory is read
        line by line; each line is expected to hold three comma-separated
        fields. The first and third fields are written, tab-separated, to
        ``reformatted_movie_titles.txt``. Lines that do not match are skipped.

        Step 2: every file in ``args[1]`` is read. The text before ``:`` on
        its first line is taken as the movie id; each following
        ``user,rating,date`` line is written as ``user movie rating`` into
        ``ratings_<n>.txt`` inside ``args[2]`` (created if missing). When
        ``CHUNK_FILES`` is true, a new output file is started once the
        current one has received more than roughly 65.5 million characters.

        Args:
            args: Command-line style list; ``args[1]`` is the input directory
                and ``args[2]`` is the output directory.

        Raises:
            IndexError: If ``args`` has fewer than three items.
            IOError/OSError: If a file or directory cannot be opened/created.
            ValueError: If a rating line does not have exactly three
                comma-separated fields.
        """
        movie_title_exp = re.compile(r"([\w]+),([\w]+),(.*)")
        with open('movie_titles.txt', 'r') as movie_title_file:
            with open('reformatted_movie_titles.txt', 'w') as outfile:
                for line in movie_title_file:
                    m = movie_title_exp.match(line.strip())
                    if m is None:
                        # Blank or malformed line: nothing to reformat.
                        continue
                    outfile.write('%s\t%s\n' % (m.group(1), m.group(3)))

        in_dir = args[1]    # input directory
        out_dir = args[2]   # output directory
        filenames = [os.path.join(in_dir, name) for name in os.listdir(in_dir)]
        written = 0         # characters written to the current output file
        outfile_num = 0
        mkdir(out_dir)
        outfilename = os.path.join(out_dir, 'ratings_%d.txt' % outfile_num)
        output_file = open(outfilename, 'w')
        try:
            for i, moviefile in enumerate(filenames):
                print("processing movie %s " % (i + 1))
                movieid = None
                with open(moviefile, 'r') as f:
                    # Iterate the file lazily instead of loading it whole.
                    for j, line in enumerate(f):
                        if j == 0:
                            # Header line looks like "<movie_id>:".
                            movieid = line.split(':')[0]
                            continue
                        line = line.strip()
                        if not line:
                            # Tolerate blank/trailing empty lines.
                            continue
                        userid, rating, _date = line.split(',')
                        nextline = ' '.join([userid, movieid, rating + '\n'])
                        written += len(nextline)
                        # Roll over to a new file once more than 65536
                        # thousand characters have been accumulated; the line
                        # that crossed the limit goes into the new file.
                        if CHUNK_FILES and written // 1000 > 65536:
                            output_file.close()
                            outfile_num += 1
                            outfilename = os.path.join(
                                out_dir, 'ratings_%d.txt' % outfile_num)
                            print("--- starting new file: %s" % outfilename)
                            output_file = open(outfilename, 'w')
                            written = len(nextline)
                        output_file.write(nextline)
        finally:
            # Close whichever chunk is open, even if processing failed.
            output_file.close()
              

    # Script entry point: pass the raw command line to main(), which reads
    # the input directory from argv[1] and the output directory from argv[2].
    if __name__ == '__main__':
        main(sys.argv)

    经过上述处理后,会得到多个用户评分数据文件,可用下面的脚本将它们合并到一个文件中:
    #!/bin/bash
    # Append every ratings chunk to result.txt. The trailing '&' runs the
    # whole loop in the background, so wait for it before reading result.txt.
    for x in netflix-data/ratings_*.txt ;
     do
       # If nothing matches, the pattern is left as a literal string; skip it
       # instead of letting cat fail on a non-existent file.
       [ -e "$x" ] || continue
       # Quote the name so paths with spaces or glob characters stay intact.
       cat "$x" >> result.txt ;
    done &

    $head -10 result.txt
    499040 9419 3
    2071637 9419 4
    896780 9419 3
    2625420 9419 2
    652121 9419 3
    1003291 9419 4
    818736 9419 3
    332152 9419 2
    2174771 9419 4
    47411 9419 5
  • 相关阅读:
    常用css3属性
    jQuery瀑布流
    jQuery事件对象
    jQuery动画
    面向对象复习
    php 面向对象
    git
    存储数据
    ajax
    对象
  • 原文地址:https://www.cnblogs.com/qq78292959/p/2076601.html
Copyright © 2020-2023  润新知