June 21 2021
13 loader = Loader() # Load Datas
19 self.df_neg, self.users, self.movies, movie_lookup = loader.load_dataset()
20 user_input, movie_input, labels = loader.get_train_instances(uids, mids, num_neg, len(self.movies))
27 df = pd.read_csv(file_path + '/ratings.csv', header = None) # userId,movieId,rating,timestamp
28 df = df.drop(df.columns[3], axis = 1)
29 df.columns = ['userId', 'movieId', 'rating']
30 df = df.dropna()
31 df = df.loc[df.rating != 0]
TEST CODE
import pandas as pd data = [[1,2,3,4], [5,6,7,8], [9,10,11,12]] df = pd.DataFrame(data) print(df)RESULT
0 1 2 3 0 1 2 3 4 1 5 6 7 8 2 9 10 11 12
TEST CODE
df = df.drop(df.columns[3], axis = 1) print(df)RESULT
0 1 2 0 1 2 3 1 5 6 7 2 9 10 11
TEST CODE
df.columns = ['userId', 'movieId', 'rating'] print(df)RESULT
userId movieId rating 0 1 2 3 1 5 6 7 2 9 10 11
TEST CODE
df = df.dropna() print(df)RESULT
userId movieId rating 0 1 2 3 1 5 6 7 2 9 10 11
TEST CODE
df = df.loc[df.rating != 0] print(df)RESULT
userId movieId rating 0 1 2 3 1 5 6 7 2 9 10 11
TEST CODE
data = [[1,2,0,4],[5,6,0,8],[9,10,11,12]] # There are datas whose 'rating' value is 0. df = pd.DataFrame(data) df = df.drop(df.columns[3], axis = 1) df.columns = ['userId', 'movieId', 'rating'] df = df.dropna() df = df.loc[df.rating != 0] print(df)RESULT
userId movieId rating 2 9 10 11
34 sample_num = 100000
35 unique_user_lst = list(np.unique(df['userId']))
36 sample_user_idx = np.random.choice(len(unique_user_lst), sample_num, replace=True)
37 sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx]
38 df = df[df['userId'].isin(sample_user_lst)]
39 df = df.reset_index(drop=True)
- unique_user_lst : A list of userId(There is no duplication)
- sample_user_idx : 100,000 (= the value of 'sample_num') indices into 'unique_user_lst', drawn at random with replacement (so the same index can appear more than once)
- sample_user_lst : The list of userIds found at the sampled indices of 'unique_user_lst'. 'df' is then filtered to keep only the rows whose 'userId' is in 'sample_user_lst'
- df.reset_index(drop=True) : Reset the index of the dataframe to 0, 1, 2, ...; because of 'drop=True' the old index is discarded instead of being added as a column
If data is
[[1,2,0,4], [5,6,7,8], [9,10,11,12], [13,14,15,16], [17,18,19,20]]
,'unique_user_lst' will be
[5,9,13,17]
. ('userId' 1 is not included because its row has a 'rating' of 0 and was dropped.) TEST CODE sample_user_idx = np.random.choice(len(unique_user_lst), 2, replace=True) print(sample_user_idx)
- Result : [3 3]
TEST CODE
sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx] print(sample_user_lst)
- Result : [17, 17]
TEST CODE
df = df[df['userId'].isin(sample_user_lst)] print(df)
- Result
userId movieId rating 4 17 18 19
TEST CODE
df = df.reset_index(drop=True) print(df)
- Result
userId movieId rating 0 17 18 19
42 df_count = df.groupby(['userId']).count()
43 df['count'] = df.groupby('userId')['userId'].transform('count')
44 df = df[df['count'] > 1]
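Group by 'userId' and count the number of rows (ratings) each user has, store that number in a new 'count' column, then keep only the users that have more than one rating.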
TEST CODE
df_count = df.groupby(['userId']).count() print(df_count)RESULT
movieId rating userId 17 1 1
TEST CODE
df['count'] = df.groupby('userId')['userId'].transform('count') print(df)RESULT
userId movieId rating count 4 17 18 19 1
TEST CODE
df = df[df['count'] > 1]RESULT
Empty DataFrame Columns: [userId, movieId, rating, count] Index: []
TOTAL TEST CODE
import pandas as pd
import numpy as np

# Toy ratings table; each row is [userId, movieId, rating, timestamp].
data = [[1,2,0,4], [5,6,7,8,], [9,10,11,12], [17,18,19,20], [5,21,22,23]]
df = pd.DataFrame(data)
# Drop the 4th column (timestamp) and name the remaining three.
df = df.drop(df.columns[3], axis = 1)
df.columns = ['userId', 'movieId', 'rating']
df = df.dropna()
# Remove rows whose rating is 0 (here: the row of userId 1).
df = df.loc[df.rating != 0]
print(df) #1
print('\n')
# Sorted list of distinct userIds that survived the filtering.
unique_user_lst = list(np.unique(df['userId']))
print(unique_user_lst) #2
print('\n')
# Draw 3 positions into unique_user_lst at random, with replacement,
# so the same user can be picked more than once.
sample_user_idx = np.random.choice(len(unique_user_lst), 3, replace = True)
print(sample_user_idx) #3
print('\n')
sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx]
print(sample_user_lst) #4
print('\n')
# Keep only the rows that belong to a sampled user.
df = df[df['userId'].isin(sample_user_lst)]
print(df) #5
print('\n')
# Renumber the rows 0..n-1; drop=True discards the old index.
df = df.reset_index(drop=True)
print(df) #6
print('\n')
df_count = df.groupby(['userId']).count()
print(df_count) #7
print('\n')
# Attach to every row the number of rows its user has.
df['count'] = df.groupby('userId')['userId'].transform('count')
print(df) #8
print('\n')
# NOTE: the loader excerpt filters with `> 1`; this walkthrough uses
# `>= 1`, which keeps every row, so the printed result #9 equals #8.
df = df[df['count'] >= 1]
print(df) #9
RESULT
#1
userId movieId rating
1 5 6 7
2 9 10 11
3 17 18 19
4 5 21 22
#2
[5, 9, 17]
#3
[0 1 1]
#4
[5, 9, 9]
#5
userId movieId rating
1 5 6 7
2 9 10 11
4 5 21 22
#6
userId movieId rating
0 5 6 7
1 9 10 11
2 5 21 22
#7
movieId rating
userId
5 2 2
9 1 1
#8
userId movieId rating count
0 5 6 7 2
1 9 10 11 1
2 5 21 22 2
#9
userId movieId rating count
0 5 6 7 2
1 9 10 11 1
2 5 21 22 2
47 df['user_id'] = df['userId'].astype("category").cat.codes
48 df['movie_id'] = df['movieId'].astype("category").cat.codes
51 movie_lookup = df[['movie_id', 'movieId']].drop_duplicates()
52 movie_lookup['movie_id'] = movie_lookup.movie_id.astype(str)
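What is this for? I don't know yet. What the code visibly does: it builds a lookup table that pairs each encoded 'movie_id' (converted to a string) with its original 'movieId'.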
133 user_input, movie_input, labels = [], [], []
134 zipped = set(zip(uids, mids)) # A set (user, movie)
136 for (u, i) in zip(uids, mids):
139 user_input.append(u)
140 movie_input.append(i)
141 labels.append(1)
144 for t in range(num_neg):
145 j = np.random.randint(num_movies)
147 while(u, j) in zipped:
148 j = np.random.randint(num_movies)
150 user_input.append(u)
151 movie_input.append(j)
152 labels.append(0)
154 return user_input, movie_input, labels
?