실제 영화데이터를 이용한 추천시스템 구현실습

2019-01-29

추천시스템

그림, 실습코드 등 학습자료 출처 : https://gitlab.com/radajin

[ 실제 영화데이터를 이용한 추천시스템 구현 ]

데이터 컨텐츠 : https://www.kaggle.com/rounakbanik/the-movies-dataset/home
활용데이터 : links_small.csv, movies_metadata.csv, ratings_small.csv

import numpy as np
import pandas as pd
from scipy import spatial

1. 데이터 로드

# Load the ratings_small dataset and preview its last rows.
rating_df = pd.read_csv("ratings_small.csv")
rating_df.tail()

	userId	movieId	rating	timestamp
99999	671	6268	2.5	1065579370
100000	671	6269	4.0	1065149201
100001	671	6365	4.0	1070940363
100002	671	6385	2.5	1070979663
100003	671	6565	3.5	1074784724

# The timestamp column is not needed here, so remove it in place.
rating_df.drop(columns=['timestamp'], inplace=True)

rating_df.tail()

	userId	movieId	rating
99999	671	6268	2.5
100000	671	6269	4.0
100001	671	6365	4.0
100002	671	6385	2.5
100003	671	6565	3.5

# Load the id-mapping table (links_small.csv) and preview its last rows.
links_df = pd.read_csv("links_small.csv")
links_df.tail()

	movieId	imdbId	tmdbId
9120	162672	3859980	402672.0
9121	163056	4262980	315011.0
9122	163949	2531318	391698.0
9123	164977	27660	137608.0
9124	164979	3447228	410803.0

# Drop rows with missing values, then store tmdbId as int64 instead of float.
links_df = links_df.dropna().astype({'tmdbId': 'int64'})

# Pandas display option: render floats without a fractional part.
pd.set_option('display.float_format', '{:.0f}'.format)

links_df.tail()

	movieId	imdbId	tmdbId
9120	162672	3859980	402672
9121	163056	4262980	315011
9122	163949	2531318	391698
9123	164977	27660	137608
9124	164979	3447228	410803

metadata_df = pd.read_csv("movies_metadata.csv", low_memory = False)
# low_memory=False makes pandas parse the file in one piece instead of in
# internal chunks; it uses more memory but avoids mixed dtype inference
# within a column.
metadata_df.tail()

	adult	belongs_to_collection	genres	homepage	id	imdb_id	original_language	original_title	overview	...	release_date	runtime	spoken_languages	status	tagline	title	video	vote_average	vote_count
45461	False	NaN	[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...	http://www.imdb.com/title/tt6209470/	439050	tt6209470	fa	رگ خواب	Rising and falling between a man and woman.	...	NaN	90	[{'iso_639_1': 'fa', 'name': 'فارسی'}]	Released	Rising and falling between a man and woman	Subdue	False	4	1
45462	False	NaN	[{'id': 18, 'name': 'Drama'}]	NaN	111109	tt2028550	tl	Siglo ng Pagluluwal	An artist struggles to finish his work while a...	...	2011-11-17	360	[{'iso_639_1': 'tl', 'name': ''}]	Released	NaN	Century of Birthing	False	9	3
45463	False	NaN	[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...	NaN	67758	tt0303758	en	Betrayal	When one of her hits goes wrong, a professiona...	...	2003-08-01	90	[{'iso_639_1': 'en', 'name': 'English'}]	Released	A deadly game of wits.	Betrayal	False	4	6
45464	False	NaN	[]	NaN	227506	tt0008536	en	Satana likuyushchiy	In a small town live two brothers, one a minis...	...	1917-10-21	87	[]	Released	NaN	Satan Triumphant	False	0	0
45465	False	NaN	[]	NaN	461257	tt6980792	en	Queerama	50 years after decriminalisation of homosexual...	...	2017-06-09	75	[{'iso_639_1': 'en', 'name': 'English'}]	Released	NaN	Queerama	False	0	0

5 rows × 24 columns

# Keep only the columns that are used later on.
metadata_df = metadata_df.loc[:, ['id', 'original_title', 'title', 'runtime']]
metadata_df.tail()

	id	original_title	title	runtime
45461	439050	رگ خواب	Subdue	90
45462	111109	Siglo ng Pagluluwal	Century of Birthing	360
45463	67758	Betrayal	Betrayal	90
45464	227506	Satana likuyushchiy	Satan Triumphant	87
45465	461257	Queerama	Queerama	75

# links_small(links_df)의 movieId ↔ tmdbId 매핑을 이용하면 rating_df의 영화를 metadata_df에서 찾을 수 있다.

2. movieId 값으로 영화정보를 출력

def id_to_movie(id_num):
    """Return the metadata row(s) for a ``movieId``.

    The id is translated to a TMDB id through the module-level ``links_df``
    and the matching row(s) of the module-level ``metadata_df`` are selected.

    Parameters
    ----------
    id_num : int
        Value to look up in ``links_df["movieId"]``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``metadata_df`` whose ``id`` equals the TMDB id rendered as a
        string. The frame is empty when no metadata row has that id.

    Raises
    ------
    IndexError
        If ``id_num`` does not occur in ``links_df["movieId"]``.
    """
    # Pandas display option: render floats without a fractional part.
    pd.options.display.float_format = '{:.0f}'.format

    try:
        tmdb_ids = links_df.loc[links_df["movieId"] == id_num, "tmdbId"].values
        if len(tmdb_ids) == 0:
            # Same exception type as the bare `.values[0]` used to raise,
            # but with a message that names the missing id.
            raise IndexError(f"movieId {id_num!r} not found in links_df")

        tmdb_id = tmdb_ids[0]
        # The metadata id is matched as text, so a float-typed tmdbId such as
        # 604.0 must become "604" rather than "604.0".
        if isinstance(tmdb_id, (float, np.floating)) and float(tmdb_id).is_integer():
            tmdb_id = int(tmdb_id)

        matches = metadata_df.loc[metadata_df['id'] == str(tmdb_id)]
    finally:
        # Undo the option set above, even when the lookup fails.
        # NOTE(review): this resets *all* pandas display options, including
        # any that were set elsewhere in the session.
        pd.reset_option('display')

    return matches

id_to_movie(6365)

	id	original_title	title	runtime
6221	604	The Matrix Reloaded	The Matrix Reloaded	138.0

3. 데이터 탐색

[unique count]

rating
user
movie

u_user = rating_df['userId'].unique()
len(u_user)

# 671 users produced roughly 100,000 ratings.

u_movie = rating_df['movieId'].unique()
len(u_movie)
# Roughly 9,000 distinct movies.

u_rating = rating_df['rating'].unique()
len(u_rating)
# Ten distinct rating values.

rating_df.groupby('rating').size().reset_index(name='rating_count')
# This per-rating count table can guide the criteria chosen for the recommender.
## 이 테이블을 보고 추천시스템 개발에 대한 기준을 잡을 수 있다.

	rating	rating_count
0	0.5	1101
1	1.0	3326
2	1.5	1687
3	2.0	7271
4	2.5	4449
5	3.0	20064
6	3.5	10538
7	4.0	28750
8	4.5	7723
9	5.0	15095

# Number of ratings per user, most active users first.
user_counts_df = (
    rating_df.groupby("userId")
    .size()
    .reset_index(name='user_rating_count')
    .sort_values(by="user_rating_count", ascending=False)
)
user_counts_df.tail()

	userId	user_rating_count
295	296	20
288	289	20
248	249	20
220	221	20
0	1	20

user_counts_df.head()

	userId	user_rating_count
546	547	2391
563	564	1868
623	624	1735
14	15	1700
72	73	1610

# Number of ratings per movie, most rated movies first.
movie_counts_df = (
    rating_df.groupby("movieId")
    .size()
    .reset_index(name='movie_rating_count')
    .sort_values(by="movie_rating_count", ascending=False)
)
movie_counts_df.tail()

	movieId	movie_rating_count
6045	31956	1
6046	31963	1
6047	31973	1
6050	32022	1
9065	163949	1

movie_counts_df.head()

	movieId	movie_rating_count
321	356	341
266	296	324
284	318	311
525	593	304
232	260	291

4. 데이터 전처리

데이터 set 감축

user_limit, movie_limit = 100, 100
# Keep only users who rated more than `user_limit` movies.
# Keep only movies that received more than `movie_limit` ratings.

filtered_userId = user_counts_df.loc[
    user_counts_df['user_rating_count'] > user_limit, 'userId'
].tolist()
len(filtered_userId)

# The movie threshold must come from `movie_limit`; the earlier code compared
# against `user_limit`, which only worked because both limits were equal.
filtered_movieId = movie_counts_df.loc[
    movie_counts_df['movie_rating_count'] > movie_limit, 'movieId'
].tolist()
len(filtered_movieId)

# First keep only the ratings made by the retained users ...
filtered_df = rating_df.loc[rating_df['userId'].isin(filtered_userId)]
len(filtered_df)

# ... then, of those, only the ratings of the retained movies.
filtered_df = filtered_df.loc[filtered_df['movieId'].isin(filtered_movieId)]
len(filtered_df)

filtered_df.tail()

	userId	movieId	rating
99982	671	4993	5.0
99983	671	4995	4.0
99987	671	5349	4.0
99989	671	5445	4.5
99994	671	5952	5.0

5. pivot을 이용하여 user_base로 데이터 프레임 생성

# Build the user-by-movie rating table: one row per userId, one column per
# movieId. Duplicate (userId, movieId) entries are averaged and cells without
# a rating are filled with 0.
user_df = filtered_df.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
    aggfunc=np.average,
    fill_value=0,
    dropna=False,
)

user_df.tail()

movieId	1	2	6	10	25	32	34	36	39	47	...	6377	6539	6874	7153	7361	7438	8961	33794	58559	79132
userId
656	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
659	0.0	0.0	3.0	0.0	5.0	4.0	0.0	4.0	0.0	4.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
664	3.5	0.0	4.0	0.0	0.0	5.0	0.0	0.0	0.0	4.5	...	0.0	4.0	4.0	4.0	4.0	4.0	4.0	4.0	4.5	5.0
665	0.0	3.0	0.0	0.0	0.0	4.0	2.0	0.0	2.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
671	5.0	0.0	0.0	0.0	0.0	0.0	0.0	4.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

5 rows × 149 columns

6. 유사도 행렬 생성

유사도 측정함수 구현

def Euclidean_Distance_Similarity(vector_1, vector_2):
    """Euclidean distance between two rating vectors over co-rated positions.

    Only positions where *both* vectors are non-zero are compared (0 marks a
    missing rating).

    Parameters
    ----------
    vector_1, vector_2 : array-like
        One-dimensional rating vectors of equal length (NumPy arrays, lists
        or pandas Series; Series are compared positionally).

    Returns
    -------
    float or int
        The L2 norm of the difference over the co-rated positions, or ``0``
        when the vectors share no non-zero position.

    Notes
    -----
    Despite its name this returns a *distance*: smaller means more alike, and
    the "no overlap" result ``0`` is indistinguishable from identical ratings.
    """
    # Convert first: `pandas.Series.nonzero()` no longer exists in pandas 1.0+,
    # so calling `.nonzero()` directly on a Series argument fails there.
    ratings_1 = np.asarray(vector_1)
    ratings_2 = np.asarray(vector_2)

    both_rated = (ratings_1 != 0) & (ratings_2 != 0)
    if not both_rated.any():
        return 0

    return np.linalg.norm(ratings_1[both_rated] - ratings_2[both_rated])

## 임시테스트
Euclidean_Distance_Similarity(user_df.loc[4], user_df.loc[8])

4.8218253804964775

def Cosine_Similarity(vector_1, vector_2):
    """Cosine similarity between two rating vectors over co-rated positions.

    Only positions where *both* vectors are non-zero are compared (0 marks a
    missing rating).

    Parameters
    ----------
    vector_1, vector_2 : array-like
        One-dimensional rating vectors of equal length (NumPy arrays, lists
        or pandas Series; Series are compared positionally).

    Returns
    -------
    float or int
        ``1 - scipy.spatial.distance.cosine`` of the co-rated sub-vectors, or
        ``0`` when the vectors share no non-zero position.
    """
    # Convert first: `pandas.Series.nonzero()` no longer exists in pandas 1.0+,
    # so calling `.nonzero()` directly on a Series argument fails there.
    ratings_1 = np.asarray(vector_1)
    ratings_2 = np.asarray(vector_2)

    both_rated = (ratings_1 != 0) & (ratings_2 != 0)
    if not both_rated.any():
        return 0

    return 1 - spatial.distance.cosine(ratings_1[both_rated], ratings_2[both_rated])

## 임시테스트
Cosine_Similarity(user_df.loc[4], user_df.loc[8])

0.9911164579376771

유사도 행렬(매트릭스) 생성 함수 구현

def similarity_matrix(user_df, similarity_func):
    """Build the pairwise similarity table between all rows of ``user_df``.

    Parameters
    ----------
    user_df : pandas.DataFrame
        One row per entity to compare (here: one row of ratings per user).
    similarity_func : callable
        Called as ``similarity_func(row_a, row_b)`` with two row Series and
        expected to return a scalar.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are both ``user_df.index``;
        cell ``[a, b]`` holds ``similarity_func(row a, row b)``.
    """
    labels = user_df.index

    # Materialise each row Series once instead of re-running iterrows() for
    # the inner loop on every outer iteration.
    rows = [row for _, row in user_df.iterrows()]

    # Every ordered pair is evaluated, so the function need not be symmetric.
    matrix = [[similarity_func(row_a, row_b) for row_b in rows] for row_a in rows]

    return pd.DataFrame(matrix, columns=labels, index=labels)

유사도 행렬 생성

%%time
sm_df = similarity_matrix(user_df, Cosine_Similarity)

Wall time: 11.8 s

sm_df.head(5)

userId	4	8	15	17	19	21	22	23	26	30	...	647	648	652	654	655	656	659	664	665	671
userId
4	1.000000	0.991116	0.956762	0.948457	0.985932	0.980286	0.981591	0.982744	0.986789	0.979119	...	0.979131	0.951088	0.986368	0.991149	0.983037	0.997707	0.970241	0.994377	0.968998	0.985579
8	0.991116	1.000000	0.914253	0.966828	0.972568	0.985269	0.964117	0.982010	0.984022	0.971471	...	0.974777	0.947942	0.970261	0.988689	0.979823	0.998645	0.972875	0.990196	0.974638	0.982713
15	0.956762	0.914253	1.000000	0.914953	0.950125	0.950927	0.906975	0.923247	0.888292	0.920392	...	0.957841	0.856947	0.893839	0.917356	0.900642	0.873927	0.938017	0.930106	0.903008	0.892096
17	0.948457	0.966828	0.914953	1.000000	0.949537	0.933276	0.939038	0.961024	0.966644	0.942020	...	0.963750	0.933889	0.869626	0.947757	0.964055	0.960849	0.932213	0.964792	0.933463	0.952986
19	0.985932	0.972568	0.950125	0.949537	1.000000	0.963805	0.955135	0.980127	0.954985	0.962846	...	0.971151	0.966500	0.980166	0.979269	0.957911	0.977106	0.962211	0.979273	0.954240	0.971782

5 rows × 258 columns

7. 예측 행렬(매트릭스) 구현

예측 행렬(매트릭스) 생성 함수 구현

def mean_score(df, sm_df, target, closer_count):
    """Predict ratings for ``target`` as the mean rating of its nearest users.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating table with one row per user and one column per movie.
    sm_df : pandas.DataFrame
        Square similarity table indexed and labelled by the users of ``df``.
    target : label
        User to predict for; must be a row label of ``df`` and ``sm_df``.
    closer_count : int
        Number of most similar users to average over.

    Returns
    -------
    pandas.DataFrame
        Two rows over the columns of ``df``: ``"User"`` holds the target's own
        ratings and ``"Mean"`` the column-wise mean of the selected users'
        rows (zero entries are included in that mean).
    """
    # Rank every other user by their value in the target's column, highest
    # first, and keep the labels of the first `closer_count` of them.
    neighbour_ids = (
        sm_df.drop(target)
        .sort_values(target, ascending=False)
        .index[:closer_count]
    )
    neighbour_ratings = df.loc[neighbour_ids]

    # Assemble the result with one row per series.
    stacked = pd.concat(
        [df.loc[target], neighbour_ratings.mean()],
        axis=1,
        keys=["User", "Mean"],
    )
    return stacked.T

## 작동테스트
pred_df = mean_score(user_df, sm_df, 4, 5)
pred_df

movieId	1	2	6	10	25	32	34	36	39	47	...	6377	6539	6874	7153	7361	7438	8961	33794	58559	79132
User	0.0	0.0	0.0	4.0	0.0	0.0	5.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
Mean	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.8	...	1.8	0.0	1.7	2.0	1.8	0.8	1.5	0.8	1.9	1.0

2 rows × 149 columns

예측 행렬(매트릭스) 내 상위 평점 10개 추출 함수 (추천 리스트 산출 함수) 구현

def recommand(pred_df, r_count=10):
    """Pick the best-scoring columns that the user has not rated yet.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Frame with a ``"User"`` row (own ratings, 0 = not rated) and a
        ``"Mean"`` row (predicted scores) over the same columns.
    r_count : int, default 10
        Maximum number of column labels to return.

    Returns
    -------
    list
        Labels of the columns where ``"User"`` is 0, ordered by ``"Mean"``
        from highest to lowest and truncated to ``r_count`` entries.
    """
    own_ratings = pred_df.loc["User"]
    predicted = pred_df.loc["Mean"]

    # Only columns the user has not rated are candidates.
    ranked = predicted[own_ratings == 0].sort_values(ascending=False)
    return list(ranked.index[:r_count])

## 작동테스트
movie_ids = recommand(pred_df)
movie_ids

[4226, 2858, 2959, 4973, 912, 50, 5952, 4306, 3996, 4993]

8. 위에서 구한 movieId 값을 이용해서 영화정보를 가져오기

def movie_info(movie_ids):
    """Collect the metadata of several movies into one DataFrame.

    For each id the first row returned by ``id_to_movie`` is taken as a dict;
    the dicts are then assembled into a frame with one row per id, in the
    order given.

    Parameters
    ----------
    movie_ids : iterable
        Ids accepted by ``id_to_movie``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``movie_ids``.
    """
    records = [
        id_to_movie(movie_id).to_dict('records')[0]
        for movie_id in movie_ids
    ]
    return pd.DataFrame(records)

df = movie_info(movie_ids)
df

	id	original_title	runtime	title
0	77	Memento	113.0	Memento
1	14	American Beauty	122.0	American Beauty
2	550	Fight Club	139.0	Fight Club
3	194	Le fabuleux destin d'Amélie Poulain	122.0	Amélie
4	289	Casablanca	102.0	Casablanca
5	629	The Usual Suspects	106.0	The Usual Suspects
6	121	The Lord of the Rings: The Two Towers	179.0	The Lord of the Rings: The Two Towers
7	808	Shrek	90.0	Shrek
8	146	卧虎藏龙	120.0	Crouching Tiger, Hidden Dragon
9	120	The Lord of the Rings: The Fellowship of the Ring	178.0	The Lord of the Rings: The Fellowship of the Ring

9. 위에서 구현한 코드들을 실행하는 함수생성

def run(df, similarity_func, target, closer_count, r_count):
    """Run the whole pipeline and return the recommended movies' metadata.

    Steps: build the similarity table for ``df`` with ``similarity_func``,
    predict scores for ``target`` from its ``closer_count`` most similar
    users, pick up to ``r_count`` unrated movies, and look up their metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating table with one row per user and one column per movie.
    similarity_func : callable
        Pairwise function handed to ``similarity_matrix``.
    target : label
        User to recommend for.
    closer_count : int
        Number of similar users used for the prediction.
    r_count : int
        Number of movies to recommend.

    Returns
    -------
    pandas.DataFrame
        Result of ``movie_info`` for the recommended movie ids.
    """
    # The similarity table is rebuilt from scratch on every call.
    similarities = similarity_matrix(df, similarity_func)
    predictions = mean_score(df, similarities, target, closer_count)
    recommended_ids = recommand(predictions, r_count)
    return movie_info(recommended_ids)

# 작동테스트
result_df = run(user_df, Cosine_Similarity, 8, 5, 10)
result_df

	id	original_title	runtime	title
0	601	E.T. the Extra-Terrestrial	115.0	E.T. the Extra-Terrestrial
1	954	Mission: Impossible	110.0	Mission: Impossible
2	62	2001: A Space Odyssey	149.0	2001: A Space Odyssey
3	602	Independence Day	145.0	Independence Day
4	808	Shrek	90.0	Shrek
5	2164	Stargate	121.0	Stargate
6	812	Aladdin	90.0	Aladdin
7	863	Toy Story 2	92.0	Toy Story 2
8	9487	A Bug's Life	95.0	A Bug's Life
9	329	Jurassic Park	127.0	Jurassic Park

10. MAE를 이용한 모델성능 평가

MAE 함수구현

def mae(value, pred):
    """Mean absolute error over positions where both inputs are non-zero.

    Parameters
    ----------
    value : array-like
        Actual ratings; 0 marks "no rating".
    pred : array-like
        Predicted ratings of the same length; 0 marks "no prediction".

    Returns
    -------
    float
        ``mean(|value - pred|)`` over the positions where both are non-zero,
        or ``nan`` when there is no such position.
    """
    # Convert first so pandas Series work too (`Series.nonzero()` no longer
    # exists in pandas 1.0+) and are compared positionally.
    actual = np.asarray(value, dtype=float)
    predicted = np.asarray(pred, dtype=float)

    comparable = (actual != 0) & (predicted != 0)
    if not comparable.any():
        # Nothing to compare: report "undefined" instead of dividing by zero.
        return float(np.nan)

    # Take the absolute value per element *before* averaging; taking it after
    # summing lets positive and negative errors cancel out.
    return float(np.mean(np.abs(actual[comparable] - predicted[comparable])))

모델에 대한 성능평가

def evaluate(df, sm_df, closer_count):
    """Average the per-user MAE of the neighbour-mean predictions.

    For every user in ``df.index`` the prediction frame is built with
    ``mean_score`` and its ``"User"`` row is compared with its ``"Mean"`` row
    through ``mae``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating table with one row per user and one column per movie.
    sm_df : pandas.DataFrame
        Similarity table for the users of ``df``.
    closer_count : int
        Number of similar users used for each prediction.

    Returns
    -------
    float
        Mean of the per-user errors, ignoring users whose error is NaN.
    """
    per_user_errors = []

    for target in df.index:
        pred_df = mean_score(df, sm_df, target, closer_count)
        per_user_errors.append(mae(pred_df.loc["User"], pred_df.loc["Mean"]))

    # A NaN error means that user had nothing comparable; skip those users so
    # one of them cannot turn the overall score into NaN.
    return np.nanmean(per_user_errors)

## 작동테스트
evaluate(user_df, sm_df, 5)

2.7123888840303985

 추천시스템 작동원리 이해를 위한 구현실습 scipy, sympy를 이용한 미적분 기초학습 