{ "cells": [ { "cell_type": "markdown", "id": "7675567f", "metadata": {}, "source": [ "# Item-Item Collaborative Filtering" ] }, { "cell_type": "markdown", "id": "807dfbd0", "metadata": {}, "source": [ "## Importing libraries" ] }, { "cell_type": "code", "execution_count": 303, "id": "dc37970f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from scipy import sparse\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "from surprise import KNNWithMeans, SVD\n", "from surprise import Dataset\n", "from surprise import Reader" ] }, { "cell_type": "markdown", "id": "09ebce79", "metadata": {}, "source": [ "## Reading and preprocessing data" ] }, { "cell_type": "markdown", "id": "17cbca49", "metadata": {}, "source": [ "Data source: https://www.kaggle.com/code/rounakbanik/movie-recommender-systems/notebook\n", "\n", "We selected a subset of the original dataset: users who rated more than 690 movies, and movies that were rated by more than 180 users. Additionally, we added titles to the rated movies. The resulting dataset is the intersection of these user and movie subsets." ] }, { "cell_type": "code", "execution_count": 458, "id": "f9434d84", "metadata": {}, "outputs": [], "source": [ "data = pd.read_excel('movies.xlsx')" ] }, { "cell_type": "code", "execution_count": 459, "id": "219ff497", "metadata": {}, "outputs": [], "source": [ "# the correspondence between movies ids and titles\n", "movie_title = data[['movieId', 'original_title']].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 460, "id": "8915ef40", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
movieId | \n", "1 | \n", "32 | \n", "47 | \n", "50 | \n", "110 | \n", "150 | \n", "260 | \n", "296 | \n", "318 | \n", "356 | \n", "... | \n", "1210 | \n", "1270 | \n", "1580 | \n", "2028 | \n", "2571 | \n", "2762 | \n", "2858 | \n", "2959 | \n", "4993 | \n", "5952 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
userId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
15 | \n", "2.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "3.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "2.0 | \n", "1.0 | \n", "... | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "3.0 | \n", "5.0 | \n", "1.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "
23 | \n", "3.0 | \n", "4.0 | \n", "4.5 | \n", "4.0 | \n", "3.5 | \n", "3.5 | \n", "4.5 | \n", "4.5 | \n", "5.0 | \n", "4.5 | \n", "... | \n", "4.0 | \n", "4.5 | \n", "3.5 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "3.5 | \n", "3.5 | \n", "4.0 | \n", "4.0 | \n", "
30 | \n", "4.0 | \n", "2.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "... | \n", "4.0 | \n", "5.0 | \n", "4.0 | \n", "5.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "3.0 | \n", "0.0 | \n", "
73 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "3.5 | \n", "4.5 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "... | \n", "5.0 | \n", "5.0 | \n", "3.0 | \n", "4.5 | \n", "4.5 | \n", "4.0 | \n", "4.5 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "
212 | \n", "3.0 | \n", "3.5 | \n", "3.5 | \n", "3.5 | \n", "5.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "4.5 | \n", "4.0 | \n", "... | \n", "0.0 | \n", "3.0 | \n", "1.5 | \n", "4.0 | \n", "5.0 | \n", "3.5 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "
213 | \n", "3.0 | \n", "1.5 | \n", "2.5 | \n", "0.0 | \n", "2.5 | \n", "1.5 | \n", "5.0 | \n", "0.0 | \n", "0.0 | \n", "2.0 | \n", "... | \n", "5.0 | \n", "3.0 | \n", "4.0 | \n", "0.0 | \n", "4.0 | \n", "2.5 | \n", "0.0 | \n", "0.0 | \n", "4.5 | \n", "4.0 | \n", "
294 | \n", "4.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "3.0 | \n", "0.0 | \n", "4.0 | \n", "0.0 | \n", "3.0 | \n", "4.0 | \n", "... | \n", "4.0 | \n", "4.0 | \n", "4.5 | \n", "0.0 | \n", "4.5 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "4.0 | \n", "4.0 | \n", "
311 | \n", "3.0 | \n", "0.0 | \n", "0.5 | \n", "3.0 | \n", "3.0 | \n", "5.0 | \n", "4.0 | \n", "3.0 | \n", "4.5 | \n", "5.0 | \n", "... | \n", "3.5 | \n", "4.5 | \n", "3.0 | \n", "5.0 | \n", "4.0 | \n", "4.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
380 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "4.0 | \n", "5.0 | \n", "4.0 | \n", "5.0 | \n", "... | \n", "4.0 | \n", "3.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "4.5 | \n", "4.0 | \n", "
388 | \n", "0.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "4.5 | \n", "4.5 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "... | \n", "5.0 | \n", "4.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "4.0 | \n", "3.0 | \n", "5.0 | \n", "4.5 | \n", "
452 | \n", "3.5 | \n", "0.0 | \n", "2.5 | \n", "0.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "... | \n", "0.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "2.0 | \n", "5.0 | \n", "5.0 | \n", "2.0 | \n", "0.0 | \n", "1.0 | \n", "
457 | \n", "1.5 | \n", "3.0 | \n", "4.5 | \n", "3.0 | \n", "3.0 | \n", "0.0 | \n", "0.5 | \n", "5.0 | \n", "5.0 | \n", "3.5 | \n", "... | \n", "0.5 | \n", "4.0 | \n", "0.0 | \n", "0.0 | \n", "3.0 | \n", "3.0 | \n", "3.5 | \n", "4.0 | \n", "0.0 | \n", "0.5 | \n", "
461 | \n", "3.5 | \n", "0.0 | \n", "5.0 | \n", "4.0 | \n", "4.0 | \n", "3.0 | \n", "4.5 | \n", "4.5 | \n", "5.0 | \n", "4.0 | \n", "... | \n", "4.5 | \n", "5.0 | \n", "2.0 | \n", "3.5 | \n", "4.5 | \n", "4.5 | \n", "2.0 | \n", "5.0 | \n", "4.5 | \n", "4.5 | \n", "
468 | \n", "4.0 | \n", "3.0 | \n", "3.5 | \n", "3.0 | \n", "3.0 | \n", "3.0 | \n", "3.5 | \n", "3.5 | \n", "3.5 | \n", "3.0 | \n", "... | \n", "3.5 | \n", "3.0 | \n", "3.0 | \n", "3.0 | \n", "3.0 | \n", "3.0 | \n", "3.5 | \n", "3.0 | \n", "3.5 | \n", "3.5 | \n", "
472 | \n", "5.0 | \n", "3.0 | \n", "4.0 | \n", "4.0 | \n", "0.0 | \n", "3.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "... | \n", "3.0 | \n", "3.0 | \n", "5.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "2.0 | \n", "2.0 | \n", "3.0 | \n", "3.0 | \n", "
509 | \n", "3.0 | \n", "4.0 | \n", "4.0 | \n", "5.0 | \n", "5.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "4.0 | \n", "... | \n", "4.0 | \n", "3.0 | \n", "2.0 | \n", "3.0 | \n", "4.5 | \n", "4.0 | \n", "4.5 | \n", "4.0 | \n", "4.5 | \n", "5.0 | \n", "
518 | \n", "5.0 | \n", "0.0 | \n", "5.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "... | \n", "5.0 | \n", "4.0 | \n", "5.0 | \n", "4.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
547 | \n", "3.5 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "0.0 | \n", "4.0 | \n", "0.0 | \n", "5.0 | \n", "5.0 | \n", "2.0 | \n", "... | \n", "2.5 | \n", "3.5 | \n", "2.0 | \n", "0.0 | \n", "3.5 | \n", "4.0 | \n", "5.0 | \n", "3.5 | \n", "2.5 | \n", "0.0 | \n", "
564 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "5.0 | \n", "1.0 | \n", "3.0 | \n", "2.0 | \n", "5.0 | \n", "0.0 | \n", "3.0 | \n", "... | \n", "3.0 | \n", "3.0 | \n", "4.0 | \n", "5.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "5.0 | \n", "0.0 | \n", "0.0 | \n", "
580 | \n", "4.0 | \n", "4.5 | \n", "4.0 | \n", "4.0 | \n", "4.5 | \n", "0.0 | \n", "4.0 | \n", "4.5 | \n", "4.0 | \n", "3.5 | \n", "... | \n", "3.5 | \n", "3.0 | \n", "2.5 | \n", "4.0 | \n", "4.5 | \n", "4.0 | \n", "4.5 | \n", "5.0 | \n", "4.5 | \n", "4.0 | \n", "
624 | \n", "5.0 | \n", "2.0 | \n", "4.0 | \n", "4.0 | \n", "0.0 | \n", "3.0 | \n", "5.0 | \n", "5.0 | \n", "0.0 | \n", "3.0 | \n", "... | \n", "5.0 | \n", "5.0 | \n", "4.0 | \n", "0.0 | \n", "2.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "4.0 | \n", "
21 rows × 37 columns
\n", "\n", " | userId | \n", "movieId | \n", "rating | \n", "original_title | \n", "
---|---|---|---|---|
0 | \n", "388 | \n", "1 | \n", "4.178314 | \n", "Toy Story | \n", "
1 | \n", "294 | \n", "32 | \n", "3.845652 | \n", "Twelve Monkeys | \n", "
2 | \n", "311 | \n", "32 | \n", "3.56767 | \n", "Twelve Monkeys | \n", "
3 | \n", "452 | \n", "32 | \n", "3.679041 | \n", "Twelve Monkeys | \n", "
4 | \n", "461 | \n", "32 | \n", "3.703987 | \n", "Twelve Monkeys | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
94 | \n", "30 | \n", "5952 | \n", "4.410817 | \n", "The Lord of the Rings: The Two Towers | \n", "
95 | \n", "311 | \n", "5952 | \n", "3.608098 | \n", "The Lord of the Rings: The Two Towers | \n", "
96 | \n", "518 | \n", "5952 | \n", "4.36384 | \n", "The Lord of the Rings: The Two Towers | \n", "
97 | \n", "547 | \n", "5952 | \n", "3.356063 | \n", "The Lord of the Rings: The Two Towers | \n", "
98 | \n", "564 | \n", "5952 | \n", "3.788141 | \n", "The Lord of the Rings: The Two Towers | \n", "
99 rows × 4 columns
\n", "\n", " | userId | \n", "movieId | \n", "rating | \n", "original_title | \n", "
---|---|---|---|---|
0 | \n", "388 | \n", "1 | \n", "4.037184 | \n", "Toy Story | \n", "
1 | \n", "294 | \n", "32 | \n", "3.728931 | \n", "Twelve Monkeys | \n", "
2 | \n", "311 | \n", "32 | \n", "3.426616 | \n", "Twelve Monkeys | \n", "
3 | \n", "452 | \n", "32 | \n", "3.554053 | \n", "Twelve Monkeys | \n", "
4 | \n", "461 | \n", "32 | \n", "3.630588 | \n", "Twelve Monkeys | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
94 | \n", "30 | \n", "5952 | \n", "4.299921 | \n", "The Lord of the Rings: The Two Towers | \n", "
95 | \n", "311 | \n", "5952 | \n", "3.709705 | \n", "The Lord of the Rings: The Two Towers | \n", "
96 | \n", "518 | \n", "5952 | \n", "4.398882 | \n", "The Lord of the Rings: The Two Towers | \n", "
97 | \n", "547 | \n", "5952 | \n", "3.399342 | \n", "The Lord of the Rings: The Two Towers | \n", "
98 | \n", "564 | \n", "5952 | \n", "3.821427 | \n", "The Lord of the Rings: The Two Towers | \n", "
99 rows × 4 columns
\n", "