import os
import pandas as pd
from collections import defaultdict
from operator import itemgetter
# Read the tab-separated ratings file (no header row) from ../data/ml-100k.
ratings_filename = os.path.join('../data/ml-100k', "u.data")
all_ratings = pd.read_csv(
    ratings_filename,
    sep="\t",
    header=None,
    names=["UserID", "MovieID", "Rating", "Datetime"],
)
# The Datetime column is parsed as a count of seconds (unit='s').
all_ratings["Datetime"] = pd.to_datetime(all_ratings["Datetime"], unit='s')
# A rating strictly above 3 is treated as a favorable review.
all_ratings["Favorable"] = all_ratings["Rating"] > 3

# Training subset: rows whose UserID is one of the values in range(200).
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]
favorable_ratings = ratings[ratings["Favorable"]]

# Map every training user to the frozenset of MovieIDs they rated favorably.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
}

# Per-movie number of favorable ratings (summing the boolean column).
num_favorable_by_movie = (
    ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
)
# The five most-favored movies; the result is discarded when run as a script
# and only displays in an interactive session.
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]
def find_frequent_itemsets(
        favorable_reviews_by_users, k_1_itemsets, min_support):
    """Grow frequent itemsets by one item and keep those with enough support.

    For every user, each itemset in ``k_1_itemsets`` that is fully contained
    in the user's favorable reviews is extended with each of the user's other
    favorably reviewed movies. The support of a resulting superset is the
    number of users who produced it.

    Args:
        favorable_reviews_by_users: Mapping of user id to the set (or
            frozenset) of movie ids that user reviewed favorably.
        k_1_itemsets: Iterable of frozensets of movie ids (iterating a dict
            keyed by frozenset works) to extend.
        min_support: Minimum number of users required to keep a superset.

    Returns:
        dict mapping each kept superset (frozenset) to its user count.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # The same superset is reachable from several of its (k-1)-subsets.
        # Collect the supersets in a set first so each user contributes at
        # most one count per superset instead of one per subset.
        supersets_for_user = set()
        for itemset in k_1_itemsets:
            if not itemset.issubset(reviews):
                continue
            for other_reviewed_movie in reviews - itemset:
                supersets_for_user.add(
                    itemset | frozenset((other_reviewed_movie,)))
        for current_superset in supersets_for_user:
            counts[current_superset] += 1
    return {
        itemset: frequency
        for itemset, frequency in counts.items()
        if frequency >= min_support
    }
# frequent_itemsets maps an itemset length to {itemset: count}.
frequent_itemsets = {}
min_support = 50

# Seed with single-movie itemsets whose favorable count exceeds min_support.
frequent_itemsets[1] = {
    frozenset((movie_id,)): row["Favorable"]
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row["Favorable"] > min_support
}
print(u"共有{}电影有超过{}喜欢标注".format(len(frequent_itemsets[1]), min_support))

# Grow itemsets one item at a time until a length yields nothing frequent.
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
    if not cur_frequent_itemsets:
        print(u"未找到长度为{}的频繁项集".format(k))
        break
    print(u"共找到{}条频繁项集对应于长度{}".format(
        len(cur_frequent_itemsets), k))
    frequent_itemsets[k] = cur_frequent_itemsets

# Drop the single-movie itemsets; removing the conclusion from one of them
# would leave an empty premise when rules are generated.
del frequent_itemsets[1]
# Every member of a frequent itemset takes a turn as the conclusion, with the
# remaining members forming the premise.
candidate_rules = []
for itemset_counts in frequent_itemsets.values():
    for itemset in itemset_counts:
        candidate_rules.extend(
            (itemset - {conclusion}, conclusion) for conclusion in itemset)
print(u"共有{}候选规则".format(len(candidate_rules)))

# Among users matching a rule's premise, tally how many also liked the
# conclusion (correct) and how many did not (incorrect).
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in favorable_reviews_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):
            continue
        if conclusion in reviews:
            correct_counts[candidate_rule] += 1
        else:
            incorrect_counts[candidate_rule] += 1

# Confidence = correct / (correct + incorrect) for each rule.
rule_confidence = {}
for candidate_rule in candidate_rules:
    n_correct = correct_counts[candidate_rule]
    n_incorrect = incorrect_counts[candidate_rule]
    rule_confidence[candidate_rule] = n_correct / float(
        n_correct + n_incorrect)

# (rule, confidence) pairs ordered from highest to lowest confidence.
sorted_confidence = sorted(
    rule_confidence.items(), key=itemgetter(1), reverse=True)
# Read the pipe-separated movie file (no header row, mac-roman encoded).
movie_name_filename = os.path.join('../data/ml-100k', "u.item")
movie_name_data = pd.read_csv(
    movie_name_filename, sep="|", header=None, encoding="mac-roman")
# Label the columns: five descriptive fields followed by the remaining
# per-movie columns.
movie_name_data.columns = (
    ["MovieID", "Title", "Release Date", "Video Release", "IMDB"]
    + ["<UNK>", "Action", "Adventure", "Animation", "Children's"]
    + ["Comedy", "Crime", "Documentary", "Drama", "Fantasy"]
    + ["Film-Noir", "Horror", "Musical", "Mystery", "Romance"]
    + ["Sci-Fi", "Thriller", "War", "Western"]
)
def get_movie_name(movie_id):
    """Return the Title of the first ``movie_name_data`` row matching movie_id.

    Raises IndexError when no row has the given MovieID.
    """
    matches = movie_name_data["MovieID"] == movie_id
    return movie_name_data.loc[matches, "Title"].values[0]
# Show the five most confident rules. Slicing (rather than indexing up to a
# fixed count) avoids an IndexError when fewer than five rules were found.
for index, (rule, confidence) in enumerate(sorted_confidence[:5]):
    print(u"规则 #{0}".format(index + 1))
    premise, conclusion = rule
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print(
        u"-规则: 如果某人喜欢{0}他们也会喜欢{1}"
        .format(premise_names, conclusion_name))
    # sorted_confidence pairs each rule with its rule_confidence value.
    print(u"-置信度: {0:.3f}".format(confidence))
    print("")
# Test set: every rating by a user outside the training range(200) subset.
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
# Map every test user to the frozenset of MovieIDs they rated favorably.
test_favorable_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favorable.groupby("UserID")["MovieID"]
}

# Re-tally every candidate rule against the test users.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

# Confidence on the test set. A rule whose premise no test user satisfies has
# no defined confidence; it is left out instead of raising ZeroDivisionError.
test_confidence = {}
for candidate_rule in rule_confidence:
    n_correct = correct_counts[candidate_rule]
    n_matched = n_correct + incorrect_counts[candidate_rule]
    if n_matched:
        test_confidence[candidate_rule] = n_correct / float(n_matched)

sorted_test_confidence = sorted(
    test_confidence.items(),
    key=itemgetter(1),
    reverse=True
)
print(sorted_test_confidence[:5])
# Compare training and test confidence for the ten rules that scored highest
# on the training set. Slicing avoids an IndexError when fewer than ten rules
# exist; a rule missing from either mapping is reported as -1.
for index, (rule, _) in enumerate(sorted_confidence[:10]):
    print(u"规则 #{0}".format(index + 1))
    premise, conclusion = rule
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print(u"规则: 如果某人喜欢{0}, 他们也会喜欢{1}".format(
        premise_names, conclusion_name))
    print(u"- 训练集: {0:.3f}".format(
        rule_confidence.get((premise, conclusion), -1)))
    print(u"- 测试集: {0:.3f}".format(
        test_confidence.get((premise, conclusion), -1)))
    print("")