Skip to content

Latest commit

ย 

History

History
321 lines (304 loc) · 7.18 KB

File metadata and controls

321 lines (304 loc) · 7.18 KB

NCF: Data Distribution Type

June 21 2021

./Run.py

def __init__

13  loader = Loader() # Load Datas
19  self.df_neg, self.users, self.movies, movie_lookup = loader.load_dataset()
20  user_input, movie_input, labels = loader.get_train_instances(uids, mids, num_neg, len(self.movies))

./Loader.py

def load_dataset - Load Data

27  df = pd.read_csv(file_path + '/ratings.csv', header = None) # userId,movieId,rating,timestamp
28  df = df.drop(df.columns[3], axis = 1)
29  df.columns = ['userId', 'movieId', 'rating']
30  df = df.dropna()
31  df = df.loc[df.rating != 0]

TEST CODE

import pandas as pd
data = [[1,2,3,4], [5,6,7,8], [9,10,11,12]]
df = pd.DataFrame(data)
print(df)

RESULT

   0   1   2   3
0  1   2   3   4
1  5   6   7   8
2  9  10  11  12

TEST CODE

df = df.drop(df.columns[3], axis = 1)
print(df)

RESULT

   0   1   2
0  1   2   3
1  5   6   7
2  9  10  11

TEST CODE

df.columns = ['userId', 'movieId', 'rating']
print(df)

RESULT

   userId  movieId  rating
0       1        2       3
1       5        6       7
2       9       10      11

TEST CODE

df = df.dropna()
print(df)

RESULT

   userId  movieId  rating
0       1        2       3
1       5        6       7
2       9       10      11

TEST CODE

df = df.loc[df.rating != 0]
print(df)

RESULT

   userId  movieId  rating
0       1        2       3
1       5        6       7
2       9       10      11

TEST CODE

data = [[1,2,0,4],[5,6,0,8],[9,10,11,12]] # Some rows have a 'rating' value of 0.
df = pd.DataFrame(data)
df = df.drop(df.columns[3], axis = 1)
df.columns = ['userId', 'movieId', 'rating']
df = df.dropna()
df = df.loc[df.rating != 0]
print(df)

RESULT

   userId  movieId  rating
2       9       10      11

def load_dataset - User Sampling

34  sample_num = 100000
35  unique_user_lst = list(np.unique(df['userId']))
36  sample_user_idx = np.random.choice(len(unique_user_lst), sample_num, replace=True)
37  sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx]
38  df = df[df['userId'].isin(sample_user_lst)]
39  df = df.reset_index(drop=True)
  • unique_user_lst : A list of the unique userIds (no duplicates)
  • sample_user_idx : Indices of 100,000 (= 'sample_num') users drawn at random, with replacement, from 'unique_user_lst'
  • sample_user_lst : The userIds found at those sampled indices (may contain duplicates); 'df' is then filtered to the rows whose 'userId' is in this list
  • df.reset_index(drop=True) : Reset the dataframe's index to 0, 1, 2, ...; with drop=True the old index is discarded instead of being added as a column

If data is [[1,2,0,4], [5,6,7,8], [9,10,11,12], [13,14,15,16], [17,18,19,20]],

'unique_user_lst' will be [5, 9, 13, 17]. ('userId' 1 is not included because its row was dropped for having a 'rating' of 0.)

TEST CODE

sample_user_idx = np.random.choice(len(unique_user_lst), 2, replace=True)
print(sample_user_idx)
  • Result : [3 3]

TEST CODE

sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx]
print(sample_user_lst)
  • Result : [17, 17]

TEST CODE

df = df[df['userId'].isin(sample_user_lst)]
print(df)
  • Result
   userId  movieId  rating
4      17       18      19

TEST CODE

df = df.reset_index(drop=True)
print(df)
  • Result
   userId  movieId  rating
0      17       18      19

def load_dataset - Keep only users with more than one rating

42  df_count = df.groupby(['userId']).count()
43  df['count'] = df.groupby('userId')['userId'].transform('count')
44  df = df[df['count'] > 1]

Group the rows by 'userId' and count how many rows (ratings) each user has, then keep only the users whose count is greater than 1.

TEST CODE

df_count = df.groupby(['userId']).count()
print(df_count)

RESULT

        movieId  rating
userId                 
17            1       1

TEST CODE

df['count'] = df.groupby('userId')['userId'].transform('count')
print(df)

RESULT

   userId  movieId  rating  count
0      17       18      19      1

TEST CODE

df = df[df['count'] > 1]
print(df)

RESULT

Empty DataFrame
Columns: [userId, movieId, rating, count]
Index: []

TOTAL TEST CODE

import pandas as pd
import numpy as np

data = [[1,2,0,4], [5,6,7,8,], [9,10,11,12], [17,18,19,20], [5,21,22,23]]
df = pd.DataFrame(data)
df = df.drop(df.columns[3], axis = 1)
df.columns = ['userId', 'movieId', 'rating']
df = df.dropna()
df = df.loc[df.rating != 0]
print(df) #1
print('\n')
unique_user_lst = list(np.unique(df['userId']))
print(unique_user_lst) #2
print('\n')
sample_user_idx = np.random.choice(len(unique_user_lst), 3, replace = True)
print(sample_user_idx) #3
print('\n')
sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx]
print(sample_user_lst) #4
print('\n')
df = df[df['userId'].isin(sample_user_lst)]
print(df) #5
print('\n')
df = df.reset_index(drop=True)
print(df) #6
print('\n')
df_count = df.groupby(['userId']).count()
print(df_count) #7
print('\n')
df['count'] = df.groupby('userId')['userId'].transform('count')
print(df) #8
print('\n')
df = df[df['count'] >= 1] # NOTE: Loader.py line 44 uses '> 1'; '>= 1' keeps every row here
print(df) #9

RESULT

#1
   userId  movieId  rating
1       5        6       7
2       9       10      11
3      17       18      19
4       5       21      22

#2
[5, 9, 17]

#3
[0 1 1]

#4
[5, 9, 9]

#5
   userId  movieId  rating
1       5        6       7
2       9       10      11
4       5       21      22

#6
   userId  movieId  rating
0       5        6       7
1       9       10      11
2       5       21      22

#7
        movieId  rating
userId                 
5             2       2
9             1       1

#8
   userId  movieId  rating  count
0       5        6       7      2
1       9       10      11      1
2       5       21      22      2

#9
   userId  movieId  rating  count
0       5        6       7      2
1       9       10      11      1
2       5       21      22      2

./Loader.py - Allocate userId and movieId

47 df['user_id'] = df['userId'].astype("category").cat.codes
48 df['movie_id'] = df['movieId'].astype("category").cat.codes

./Loader.py - Create Lookup Table

51 movie_lookup = df[['movie_id', 'movieId']].drop_duplicates()
52 movie_lookup['movie_id'] = movie_lookup.movie_id.astype(str)

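What is this lookup table for? Not sure yet; presumably it maps each encoded 'movie_id' back to the original 'movieId' (to be confirmed).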

def get_train_instances

133   user_input, movie_input, labels = [], [], []
134   zipped = set(zip(uids, mids)) # A set (user, movie)
136   for (u, i) in zip(uids, mids):
139      user_input.append(u)
140      movie_input.append(i)
141      labels.append(1)
144         for t in range(num_neg):
145            j = np.random.randint(num_movies) 
147            while(u, j) in zipped:
148               j = np.random.randint(num_movies)
150            user_input.append(u)
151            movie_input.append(j)
152            labels.append(0)
154   return user_input, movie_input, labels

?