# User-User Collaborative Filtering

## Importing libraries

In [175]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, KNNBasic

## Reading and preprocessing data

Data source: https://www.kaggle.com/code/rounakbanik/movie-recommender-systems/notebook

We have already preprocessed the original dataset by selecting a subset of users that rated more than 690 movies and movies that were rated by more than 180 users. Additionally, we have added titles to the rated movies.  The resulting dataset is an intersection of these user and movie subsets.

In [348]:
# Load the preprocessed ratings table (one row per user-movie rating) and preview it
data = pd.read_excel(io='movies.xlsx')
data.head(n=5)

Unnamed: 0,userId,movieId,rating,original_title
0,15,1,2.0,Toy Story
1,23,1,3.0,Toy Story
2,30,1,4.0,Toy Story
3,73,1,5.0,Toy Story
4,212,1,3.0,Toy Story


In [362]:
# lookup table mapping each movie id to its title (one row per distinct pair)
movie_title = data.loc[:, ['movieId', 'original_title']].drop_duplicates()

In [349]:
# Pivot the ratings into a user x movie matrix. A user-movie pair without a rating
# is filled with 0, so that these pairs can later be used as the ones to predict.
full_data = (
    data
    .pivot_table(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
full_data

movieId,1,32,47,50,110,150,260,296,318,356,...,1210,1270,1580,2028,2571,2762,2858,2959,4993,5952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,2.0,4.0,5.0,5.0,3.0,3.0,5.0,5.0,2.0,1.0,...,5.0,5.0,4.0,3.0,5.0,1.0,4.0,5.0,5.0,5.0
23,3.0,4.0,4.5,4.0,3.5,3.5,4.5,4.5,5.0,4.5,...,4.0,4.5,3.5,4.0,4.0,4.0,3.5,3.5,4.0,4.0
30,4.0,2.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,...,4.0,5.0,4.0,5.0,3.0,5.0,5.0,4.0,3.0,0.0
73,5.0,5.0,5.0,5.0,4.0,3.5,4.5,5.0,5.0,5.0,...,5.0,5.0,3.0,4.5,4.5,4.0,4.5,5.0,5.0,5.0
212,3.0,3.5,3.5,3.5,5.0,4.0,4.0,4.0,4.5,4.0,...,0.0,3.0,1.5,4.0,5.0,3.5,4.0,5.0,5.0,5.0
213,3.0,1.5,2.5,0.0,2.5,1.5,5.0,0.0,0.0,2.0,...,5.0,3.0,4.0,0.0,4.0,2.5,0.0,0.0,4.5,4.0
294,4.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,3.0,4.0,...,4.0,4.0,4.5,0.0,4.5,0.0,0.0,0.0,4.0,4.0
311,3.0,0.0,0.5,3.0,3.0,5.0,4.0,3.0,4.5,5.0,...,3.5,4.5,3.0,5.0,4.0,4.0,0.0,0.0,0.0,0.0
380,4.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,...,4.0,3.0,3.0,5.0,5.0,5.0,5.0,5.0,4.5,4.0
388,0.0,4.0,5.0,5.0,5.0,4.5,4.5,5.0,5.0,4.0,...,5.0,4.0,3.0,5.0,5.0,4.0,4.0,3.0,5.0,4.5


In [350]:
# Restore the long (userId, movieId, rating) layout, now also containing the
# user-movie pairs with rating 0 (meaning the movie is unrated by that user).
# The matrix is rebuilt from `data` here instead of reading `full_data` back, so
# this cell does not overwrite its own input and can be re-run safely.
full_data = (
    data
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
    .reset_index()
    .melt(id_vars='userId', var_name='movieId', value_name='rating')
)

In [351]:
# preview the first five rows of the long-format table
full_data.iloc[:5]

Unnamed: 0,userId,movieId,rating
0,15,1,2.0
1,23,1,3.0
2,30,1,4.0
3,73,1,5.0
4,212,1,3.0


In [352]:
# Split by whether a rating exists: pairs with a real rating form the train set,
# pairs carrying the 0 placeholder (unrated movies) form the test set.
is_rated = full_data['rating'] != 0
train = full_data[is_rated]
test = full_data[~is_rated]

## Building a recommender system

### scikit surprise

In [353]:
# Convert the train and test frames into the structures used by the scikit-surprise library.
# The reader declares that ratings lie on a 0.5 to 5 scale.
reader = Reader(rating_scale=(0.5, 5))
# Train set: all rated (userId, movieId, rating) triples; no hold-out part is kept.
train_data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
trainset = train_data.build_full_trainset()
# Test set: the unrated pairs, whose rating column holds the 0 placeholder.
test_data = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)
# One [userId, movieId, rating] list per row, later passed to algo.test().
# NOTE(review): presumably test_data.df is the frame passed to load_from_df above - confirm.
# NOTE(review): .loc[i].to_list() goes through a row Series, which may upcast the ids
# to a common dtype - confirm the ids still match the ones in the train set.
testset = [test_data.df.loc[i].to_list() for i in test_data.df.index]

In [354]:
# we apply a basic nearest neighbors approach, i.e., a basic collaborative filtering algorithm.
# Similarities are computed between users ('user_based': True) with the 'pearson_baseline'
# measure - note that this is not a cosine distance, unlike the manual example further below.
sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x200226ebdf0>

In [355]:
# Estimate a rating for every (user, movie) pair of the test set.
pred = algo.test(testset)
# Collect the estimates into a frame in a single step. Growing a DataFrame with
# pd.concat inside a loop copies it on every iteration (quadratic time), gives
# every row the index label 0 and starts from an empty object-dtype frame.
# Building it once yields a clean 0..n-1 index and properly inferred dtypes.
predictions = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in pred],
    columns=['userId', 'movieId', 'rating'],
)

In [366]:
# attach the movie title to every predicted rating (movieId is the only shared column)
pd.merge(predictions, movie_title, how='inner', on='movieId')

Unnamed: 0,userId,movieId,rating,original_title
0,388,1,3.498589,Toy Story
1,294,32,3.399455,Twelve Monkeys
2,311,32,2.033549,Twelve Monkeys
3,452,32,2.597558,Twelve Monkeys
4,461,32,3.928357,Twelve Monkeys
...,...,...,...,...
94,30,5952,2.152863,The Lord of the Rings: The Two Towers
95,311,5952,1.924435,The Lord of the Rings: The Two Towers
96,518,5952,3.638191,The Lord of the Rings: The Two Towers
97,547,5952,2.814763,The Lord of the Rings: The Two Towers


### Simple average of K-nearest neighbours

In [185]:
# Manual walkthrough: the ratings of user 294 are predicted as the average rating
# of its 3 closest neighbours.
# Step one: a single-row matrix with the movies that user 294 has actually rated.
non_zero_test_data = (
    train
    .loc[train['userId'].eq(294)]
    .pivot_table(values='rating', index='userId', columns='movieId')
)
non_zero_test_data

movieId,1,110,260,318,356,364,377,380,457,480,...,595,780,1196,1198,1210,1270,1580,2571,4993,5952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
294,4.0,3.0,4.0,3.0,4.0,4.5,4.0,4.0,3.5,4.0,...,5.0,4.5,4.0,4.5,4.0,4.0,4.5,4.5,4.0,4.0


In [186]:
# Ratings given by all other users to the movies the test user has rated,
# as a user x movie matrix; movies a user did not rate are filled with 0.
train_data_for_similarity = (
    train
    .loc[train['movieId'].isin(non_zero_test_data.columns) & train['userId'].ne(294)]
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)

In [187]:
# Cosine similarity between the test user (a single row) and every other user of the train set;
# the columns of `sim` are labelled with the ids of those other users.
other_users_ratings = train_data_for_similarity.fillna(0)
similarity = cosine_similarity(non_zero_test_data, other_users_ratings)
sim = pd.DataFrame(similarity, columns=train_data_for_similarity.index)

In [188]:
# Rank the other users by decreasing similarity to the test user and keep the 3 best
best_positions = np.argsort(-sim.values, axis=1)[:, :3]
top3_neighbours = pd.DataFrame(sim.columns.values[best_positions],
                               columns=['1st Max', '2nd Max', '3rd Max'])
# show the ratings these three neighbours gave to the movies rated by the test user
train_data_for_similarity.loc[top3_neighbours.iloc[0].tolist(), :]

movieId,1,110,260,318,356,364,377,380,457,480,...,595,780,1196,1198,1210,1270,1580,2571,4993,5952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
73,5.0,4.0,4.5,5.0,5.0,5.0,3.0,3.0,4.0,4.0,...,4.5,3.5,5.0,5.0,5.0,5.0,3.0,4.5,5.0,5.0
30,4.0,5.0,4.0,5.0,5.0,3.0,4.0,4.0,5.0,4.0,...,5.0,4.0,4.0,5.0,4.0,5.0,4.0,3.0,3.0,0.0
461,3.5,4.0,4.5,5.0,4.0,1.5,4.0,2.5,2.5,5.0,...,4.5,1.5,5.0,5.0,4.5,5.0,2.0,4.5,4.5,4.5


In [189]:
# Predict the ratings of the movies not yet rated by the test user (294) as the
# average rating given by its 3 nearest neighbours.
# Missing ratings are deliberately kept as NaN instead of being filled with 0:
# 0 only marks "unrated", so averaging it in would pull the prediction down for
# every neighbour who has not rated the movie. .mean() skips NaN by default.

# movies the test user has not rated (these are its rows in the test set)
unrated_movie_ids = (test.loc[test['userId'] == 294, 'movieId']
                     .drop_duplicates()
                     .sort_values()
                     .tolist())
# ratings of the 3 neighbours for exactly those movies (NaN where a neighbour has no rating)
neighbour_ratings = train.pivot_table(index='userId',
                                      columns='movieId',
                                      values='rating').loc[top3_neighbours.values.tolist()[0],
                                                           unrated_movie_ids]
# per-movie average over the neighbours who rated it; NaN if none of them did
predicted_rating = neighbour_ratings.mean()

In [367]:
# Show the predicted ratings for the movies the test user has not rated yet,
# best first, together with the movie titles
(
    pd.DataFrame(predicted_rating, columns=['rating'])
    .sort_values(by='rating', ascending=False)
    .reset_index()
    .merge(movie_title, how='inner', on='movieId')
)

Unnamed: 0,movieId,rating,original_title
0,858,5.0,The Godfather
1,296,4.833333,Pulp Fiction
2,47,4.666667,Se7en
3,50,4.666667,The Usual Suspects
4,2959,4.666667,Fight Club
5,593,4.5,The Silence of the Lambs
6,2762,4.5,The Sixth Sense
7,608,4.333333,Fargo
8,2028,4.333333,Saving Private Ryan
9,150,3.833333,Apollo 13


We can compare the predictions above with the predictions produced by the KNNBasic algorithm:

In [368]:
# KNNBasic estimates for user 294, best first, with the movie titles attached
(
    predictions
    .loc[predictions['userId'] == 294]
    .sort_values(by='rating', ascending=False)
    .merge(movie_title, how='inner', on='movieId')
)

Unnamed: 0,userId,movieId,rating,original_title
0,294,296,4.878825,Pulp Fiction
1,294,593,4.608507,The Silence of the Lambs
2,294,47,4.289277,Se7en
3,294,50,4.289116,The Usual Suspects
4,294,2762,4.233439,The Sixth Sense
5,294,608,4.071758,Fargo
6,294,858,4.061348,The Godfather
7,294,2028,3.933848,Saving Private Ryan
8,294,2959,3.479836,Fight Club
9,294,2858,3.422756,American Beauty
