• netflix 推荐算法学习1(转)


    http://www.csie.ntu.edu.tw/~r95007/thesis/svdnetflix/report/report.pdf
    http://eecs.wsu.edu/~vjakkula/MLProject.pdf
    http://michielvanwezel.com/papers/kagie_vdloos_vwezelV2.pdf
    http://cseweb.ucsd.edu/users/elkan/KddNetflixWorkshop.pdf
    http://www.cs.uic.edu/~liub/KDD-cup-2007/proceedings/The-Netflix-Prize-Bennett.pdf

    准备数据集
    1. 用 shell 将所有训练数据集文件合并为一个文件
    #!/bin/bash
    # Concatenate every per-movie training file into ratings.txt.
    # The loop is started in the background (trailing '&'), so control returns
    # before ratings.txt is complete.
    for x in netflix/training_set/mv_*.txt; do
        # Quote the expansion so a path containing spaces or glob characters
        # reaches cat as a single argument.
        cat "$x"
    done >> ratings.txt &   # open the output once instead of once per file

    http://www.netflixprize.com/community/viewtopic.php?id=87
    需要先下载并安装第三方 path 模块(脚本中通过 from path import path 引入)
    #!/usr/bin/env python

    import sys
    import csv
    from path import path

    # Marker written in place of a missing value: a backslash followed by 'N'
    # (presumably chosen because MySQL's LOAD DATA reads \N as NULL -- confirm).
    # The backslash is escaped explicitly: a bare '\N' only produces these two
    # characters on Python 2 and is a syntax error on Python 3.
    NULL = '\\N'

    class Dialect(csv.excel):
        """Tab-separated variant of the Excel dialect used for the dump files.

        Rows end with a bare newline. A field is quoted only when required;
        quote doubling is switched off and no escape character is configured.
        """

        # Field and row separators.
        lineterminator = '\n'
        delimiter = '\t'

        # Quoting policy.
        quoting = csv.QUOTE_MINIMAL
        escapechar = None
        doublequote = False

    def csvDump(iter_rows_func, basename, dir='.', csvdir='csv', dialect=Dialect):
        """Write the rows produced for one input to ``<csvdir>/<name>.csv``.

        iter_rows_func -- callable that takes the input path and yields rows
        basename       -- name of the input file or directory inside ``dir``
        dir            -- directory holding the input (default: current dir)
        csvdir         -- output directory; created when it does not exist
        dialect        -- csv dialect class used for writing

        Nothing is written when the output file already exists.
        """
        dir, csvdir = path(dir), path(csvdir)
        if not csvdir.exists():
            csvdir.mkdir()
        inpath = dir / basename
        # '/' binds tighter than '+': join the directory first, then append
        # the extension.
        outfile = (csvdir / inpath.namebase) + '.csv'
        if outfile.exists():
            return
        # 'wb' is the mode the Python 2 csv module expects. The with-block
        # flushes and closes the file even when the row iterator raises,
        # instead of leaving that to garbage collection.
        with open(outfile, 'wb') as out:
            sys.stderr.write('Writing %s ...\n' % outfile)
            csv.writer(out, dialect).writerows(iter_rows_func(inpath))

    def iterMovieRows(path):
        """Yield ``(movie_id, year, title)`` for every line of the titles file.

        Each line has the form ``id,year,title``. Only the first two commas
        separate fields, so commas inside the title are kept. A year given as
        the literal text ``NULL`` is replaced by the module-level NULL marker;
        any other year is converted to int.
        """
        with open(path) as infile:
            for line in infile:
                movie_id, year, title = line.rstrip().split(',', 2)
                # Conditional expression rather than ``cond and x or NULL``:
                # the and/or form would also turn a numeric year of 0 into NULL.
                year = int(year) if year != 'NULL' else NULL
                yield (int(movie_id), year, title)

    def iterTrainingSetRows(dir):
        """Yield ``(movie_id, user_id, date, rating)`` for every training rating.

        ``dir`` must provide ``walkfiles()`` yielding the per-movie file paths.
        The first line of each file is the movie id followed by one trailing
        character (presumably ``:`` -- confirm against the data set), which is
        dropped. Every further line is ``user_id,rating,date``. Ratings are
        converted to float and user ids to int; the date is passed through
        as text. Files with no lines at all are skipped.
        """
        for filename in dir.walkfiles():
            # Close each file as soon as it has been consumed rather than
            # relying on garbage collection across the many input files.
            with open(filename) as infile:
                iterlines = (line.strip() for line in infile)
                # next() with a default: a bare .next() on an empty file raises
                # StopIteration inside this generator, which silently ends the
                # whole dump on Python 2 and is a RuntimeError on Python 3.7+.
                header = next(iterlines, None)
                if header is None:
                    continue
                movie_id = int(header[:-1])
                for line in iterlines:
                    user_id, rating, date = line.split(',', 2)
                    yield (movie_id, int(user_id), date, float(rating))

    def iterProbeSetRows(path):
        """Yield ``(movie_id, user_id)`` pairs from the probe file.

        A line that parses as an integer is a user id belonging to the most
        recent movie header. Any other non-blank line is treated as a header:
        its last character is dropped and the rest is parsed as the movie id.
        Blank lines are skipped.
        """
        with open(path) as infile:
            for raw in infile:
                line = raw.strip()
                if not line:
                    # e.g. a trailing empty line; int('') would raise otherwise
                    continue
                try:
                    user_id = int(line)
                except ValueError:
                    # Not a plain integer, so this is a movie header line.
                    movie_id = int(line[:-1])
                else:
                    yield (movie_id, user_id)

    def iterQualifyingSetRows(path):
        """Yield ``(movie_id, user_id, date)`` triples from the qualifying file.

        A line that splits into exactly two comma-separated fields is a
        ``user_id,date`` entry for the most recent movie header; both fields
        are passed through as text. Any other non-blank line is treated as a
        header: its last character is dropped and the rest is parsed as the
        movie id. Blank lines are skipped.
        """
        with open(path) as infile:
            for raw in infile:
                line = raw.strip()
                if not line:
                    # e.g. a trailing empty line; it would otherwise be taken
                    # for a header and fail in int('')
                    continue
                try:
                    user_id, date = line.split(',')
                except ValueError:
                    # Not exactly two fields, so this is a movie header line.
                    movie_id = int(line[:-1])
                else:
                    yield (movie_id, user_id, date)


    if __name__ == '__main__':
        # Optional positional arguments: the input directory first, then the
        # output directory. Arguments that are absent leave csvDump's
        # defaults in effect.
        kwds = dict(zip(('dir', 'csvdir'), sys.argv[1:3]))

        # One (row iterator, input name) pair per data set, dumped in order.
        jobs = (
            (iterMovieRows,         'movie_titles.txt'),
            (iterTrainingSetRows,   'training_set'),
            (iterProbeSetRows,      'probe.txt'),
            (iterQualifyingSetRows, 'qualifying.txt'),
        )
        for iterfunc, basename in jobs:
            csvDump(iterfunc, basename, **kwds)
                
    perl脚本     
    #!/usr/bin/perl

    # Flatten every file of the training set into one CSV stream on STDOUT.
    # Each output row is the movie id followed by the comma-separated fields
    # of one input line; every field is double-quoted and followed by a comma
    # (so each row ends with a trailing comma, as before).

    use strict;
    use warnings;

    my $dir = '/path/to/your/training_set';
    opendir(my $dh, $dir) or die("could not open $dir: $!");

    # Explicit defined(): an entry named "0" must not end the loop.
    while (defined(my $entry = readdir $dh)) {
            my $fname = "$dir/$entry";
            # readdir also returns '.', '..' and any sub-directories.
            next unless -f $fname;

            # Three-argument open with a lexical handle: the file name can
            # never be interpreted as an open mode.
            open(my $fh, '<', $fname) or die("could not open $fname: $!");

            # The first line carries the movie id; remove the colon and
            # everything after it, including the newline.
            my $mid = <$fh>;
            if (defined $mid) {
                    $mid =~ s/:.*//s;
                    while (my $line = <$fh>) {
                            chomp $line;
                            print qq("$_",) for $mid, split(/,/, $line);
                            print "\n";
                    }
            }
            close $fh;
    }
    closedir $dh;
    exit;

    $ time ./bigcsv.pl > bigcsv.csv

    real    35m11.521s
    user    10m36.272s
    sys     4m9.940s

    mysql> LOAD DATA INFILE 'bigcsv.csv' INTO TABLE main FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\n';
    Query OK, 100480507 rows affected (5 min 34.39 sec)
    Records: 100480507  Deleted: 0  Skipped: 0  Warnings: 0
  • 相关阅读:
    两个链表的第一个公共结点
    数组中的逆序对
    第一个只出现一次的字符(字符流中第一个只出现一次的字符)
    丑数
    最长不含有重复字符的子字符串
    礼物的最大价值
    把数字翻译成字符串
    把数组排成最小的数
    [CSP-S模拟测试]:赤(red)(WQS二分+DP)
    [CSP-S模拟测试]:斯诺(snow)(数学+前缀和+树状数组)
  • 原文地址:https://www.cnblogs.com/qq78292959/p/2076602.html
Copyright © 2020-2023  润新知