-
数据挖掘-MovieLens数据集_电影推荐_亲和性分析_Apriori算法
-
- import os
- import pandas as pd
- ''
- ''
# Locate the MovieLens 100k ratings file, expected under ~/ml-100k.
data_folder = os.path.join(os.path.expanduser("~"), "ml-100k")
ratings_filename = os.path.join(data_folder, "u.data")

# u.data is tab-separated and has no header row. Splitting on a single space
# would place each whole line into the first column and leave the remaining
# columns empty, so the delimiter must be a tab.
all_ratings = pd.read_csv(
    ratings_filename,
    delimiter="\t",
    header=None,
    names=["UserID", "MovieID", "Rating", "Datetime"],
)
all_ratings[:1]  # notebook-style preview; no effect when run as a script

# Interpret the raw integer column as seconds (unit="s") and convert it.
all_ratings["Datetime"] = pd.to_datetime(all_ratings["Datetime"], unit="s")
all_ratings[:1]

# A rating strictly greater than 3 is treated as a favorable review.
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[:10]

# Keep only the rows whose UserID is in range(200), then their favorable rows.
ratings = all_ratings[all_ratings["UserID"].isin(range(200))]
favorable_ratings = ratings[ratings["Favorable"]]
favorable_ratings[:5]
-
- from collections import defaultdict
# Map every user to the immutable set of movie IDs they reviewed favorably.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
}
print("length: {0}".format(len(favorable_reviews_by_users)))

# Summing the boolean Favorable column counts favorable reviews per movie.
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
num_favorable_by_movie  # notebook-style preview; no effect when run as a script
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]
-
-
- ''
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    """Extend frequent itemsets by one item and keep the frequent results.

    For every user, each itemset in ``k_1_itemsets`` that is contained in the
    user's reviews is extended with each other movie the user reviewed. The
    number of users supporting each resulting superset is counted.

    Args:
        favorable_reviews_by_users: Mapping of user -> set/frozenset of the
            movie IDs that user reviewed favorably.
        k_1_itemsets: Iterable of frozensets (iterating a dict keyed by
            itemset works too) holding the itemsets to extend.
        min_support: Minimum number of supporting users for a superset to
            be kept.

    Returns:
        dict mapping each superset (frozenset) to its number of supporting
        users, restricted to supersets with at least ``min_support`` users.
    """
    counts = defaultdict(int)

    for reviews in favorable_reviews_by_users.values():
        # The same superset can be reached from several of its subsets
        # ({A} + B and {B} + A both give {A, B}). Collect the supersets in a
        # set first so each user contributes at most 1 to any superset;
        # otherwise a k-item superset is counted up to k times per user.
        user_supersets = set()
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    user_supersets.add(itemset | frozenset((other_reviewed_movie,)))

        for current_superset in user_supersets:
            counts[current_superset] += 1

    return {
        itemset: frequency
        for itemset, frequency in counts.items()
        if frequency >= min_support
    }
-
- import sys
-
# Level-wise search for frequent itemsets: seed with single movies, then grow
# one movie at a time until a level comes back empty or max_length is reached.
min_support = 50
max_length = 20

# Level 1: one single-movie itemset per movie whose favorable count is
# strictly greater than min_support.
frequent_itemsets = {
    1: {
        frozenset((movie_id,)): row["Favorable"]
        for movie_id, row in num_favorable_by_movie.iterrows()
        if row["Favorable"] > min_support
    }
}
frequent_itemsets[1]  # notebook-style preview; no effect when run as a script
print(
    "there are {0} movie with more than {1} favorable reviews".format(
        len(frequent_itemsets[1]), min_support
    )
)
sys.stdout.flush()

for k in range(2, max_length):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support
    )
    # An empty level means no larger frequent itemset can be built; stop.
    if not cur_frequent_itemsets:
        print("can not find any frequent itemsets of length {0}".format(k))
        sys.stdout.flush()
        break

    print(" find {0} frequent itemsets of length {1}".format(len(cur_frequent_itemsets), k))
    print(" data as following:")
    sys.stdout.flush()
    frequent_itemsets[k] = cur_frequent_itemsets
-
# Build candidate association rules: for every frequent itemset, each member
# in turn becomes the conclusion and the remaining members form the premise.
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    # A single-movie itemset would yield a rule with an empty premise
    # ("if a person recommends nothing ..."), which is not a usable rule.
    if itemset_length < 2:
        continue

    for itemset in itemset_counts:
        for conclusion in itemset:
            premise = itemset - {conclusion}
            candidate_rules.append((premise, conclusion))
print("there are {0} candidate rules".format(len(candidate_rules)))
-
# Tally, per candidate rule, how many users who satisfy the premise also
# reviewed the conclusion favorably (correct) or did not (incorrect).
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        # The rule only applies to users whose reviews contain the premise.
        if not premise.issubset(reviews):
            continue
        tally = correct_counts if conclusion in reviews else incorrect_counts
        tally[candidate_rule] += 1

# Confidence = correct / (correct + incorrect) for every candidate rule.
all_confidences = {}
for candidate_rule in candidate_rules:
    hits = correct_counts[candidate_rule]
    misses = incorrect_counts[candidate_rule]
    all_confidences[candidate_rule] = hits / float(hits + misses)

# Keep only the rules whose confidence is strictly above the threshold.
min_confidence = 0.9
rule_confidence = {
    rule: confidence
    for rule, confidence in all_confidences.items()
    if confidence > min_confidence
}
print("the total of rules which bigger than min_confidence is {}".format(len(rule_confidence)))
-
- from operator import itemgetter
# Rank the retained rules by confidence, highest first.
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

# Show at most the five best rules. Slicing (instead of indexing 0..4) avoids
# an IndexError when fewer than five rules passed the confidence threshold.
for index, ((premise, conclusion), confidence) in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    print("Rule: if a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}".format(confidence))
    print("")
-
# Load the pipe-delimited u.item file (no header row) and label its columns.
movie_name_filename = os.path.join(data_folder, "u.item")
movie_name_data = pd.read_csv(
    movie_name_filename,
    sep="|",
    header=None,
    encoding="mac-roman",
)
# Five descriptive columns followed by one column per category name
# (presumably genre indicator flags - confirm against the dataset README).
movie_name_data.columns = [
    "MovieID", "Title", "Release Date", "Video Release", "IMDB",
] + [
    "<UNK>", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
def get_movie_name(movie_id):
    """Return the title of the first ``movie_name_data`` row matching *movie_id*.

    Args:
        movie_id: Value compared for equality against the "MovieID" column.

    Returns:
        The "Title" value of the first matching row.

    Raises:
        IndexError: If no row has the given MovieID.
    """
    matching_rows = movie_name_data["MovieID"] == movie_id
    return movie_name_data.loc[matching_rows, "Title"].values[0]
get_movie_name(4)  # notebook-style sanity check; result is discarded in a script

# Print at most the five best rules with movie titles instead of IDs. Slicing
# avoids an IndexError when fewer than five rules are available.
for index, ((premise, conclusion), confidence) in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    premise_names = ",".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: if a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence))
    print("")
-
-
# Held-out data: every rating whose UserID is NOT in range(200).
test_dataset = all_ratings[~all_ratings["UserID"].isin(range(200))]
test_favorable_ratings = test_dataset[test_dataset["Favorable"]]
test_favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favorable_ratings.groupby("UserID")["MovieID"]
}

# Count, per candidate rule, the held-out users for whom the rule applied and
# was right (correct) or wrong (incorrect).
test_correct_counts = defaultdict(int)
test_incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                test_correct_counts[candidate_rule] += 1
            else:
                test_incorrect_counts[candidate_rule] += 1

# A rule whose premise no held-out user satisfies has zero applicable cases;
# report 0.0 for it instead of raising ZeroDivisionError, so every candidate
# rule still has an entry for later lookups.
test_rule_confidence = {}
for candidate_rule in candidate_rules:
    correct = test_correct_counts[candidate_rule]
    applicable = correct + test_incorrect_counts[candidate_rule]
    test_rule_confidence[candidate_rule] = correct / float(applicable) if applicable else 0.0
print(len(test_rule_confidence))
sorted_test_confidence = sorted(test_rule_confidence.items(), key=itemgetter(1), reverse=True)
print(sorted_test_confidence[:5])
-
# Compare training and held-out confidence for at most the ten best training
# rules. Slicing avoids an IndexError when fewer than ten rules are available.
for index, ((premise, conclusion), train_confidence) in enumerate(sorted_confidence[:10]):
    print("Rule #{0}".format(index + 1))
    premise_names = ",".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: if a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Train Confidence: {0:.3f}".format(train_confidence))
    print(" - Test Confidence: {0:.3f}".format(test_rule_confidence[(premise, conclusion)]))
    print("")
-
相关阅读:
【转】PostgreSQL与MySQL比较
HIVE出现Read past end of RLE integer from compressed stream Stream for column 1 kind LENGTH position: 359 length: 359 range: 0错误
HDFS查看各级目录的大小
windows7搜索python java go php等其他文件内容
Tomcat配置https后,并发较大时,频繁超时情况。
Tomcat7配置Https
部分手机浏览器存在将ajax请求当成广告过滤的情况,及解决方案
百度广告联盟api protobuf协议对接
SQL查询时,根据日期范围查询周
执行Hive出现Error running child : java.lang.OutOfMemoryError: Java heap space错误
-
原文地址:https://www.cnblogs.com/wanghuaijun/p/7089227.html
Copyright © 2020-2023
润新知