- Twitch 스트리머의 팔로워 증가수를 회귀 모델로 분석 및 예측
- 데이터 전처리
- 모델링
- 성능 평가
- 추가 데이터 예측
[kaggle] https://www.kaggle.com/aayushmishra1512/twitchdata
- twitchdata-update.csv
[twitchtracker] https://twitchtracker.com/
- BeautifulSoup을 활용한 크롤링으로 방송 시작일(Date) feature 추가
- 김성준 : twitchtracker 크롤링, EDA, 모델링, 성능 평가, 새로운 데이터 예측, README 작성
- 김종찬 : EDA, 모델링, 성능 평가, 새로운 데이터 예측
- 정하윤 : 회귀 모델과 매개 변수 조사
# Load the data; the raw frame is kept as-is and all work happens on a copy.
twitch_df = pd.read_csv("twitch.csv", parse_dates=["Date"])
twitch = twitch_df.copy()
# Scatter plot with a fitted regression line drawn in red.
# Rows labelled 13, 14 and 25 are left out of the plot.
sns.regplot(
    data=twitch.drop(index=[13, 14, 25]),
    x="Peak viewers",
    y="Followers gained",
    line_kws={"color": "red"},
);
# Baseline OLS: regress 'Followers gained' on the remaining columns.
target = twitch['Followers gained']
feature = twitch.drop(columns=['Channel', 'Language', 'Followers gained', 'Desc'])
# The scaler has to be fitted before it can transform; calling transform()
# on a fresh MinMaxScaler raises NotFittedError.
scaler = MinMaxScaler()
feature = pd.DataFrame(
    scaler.fit_transform(feature),
    columns=feature.columns,
    index=feature.index,  # keep the rows aligned with `target`
)
# Add the intercept column only after scaling: min-max scaling a constant
# column maps it to all zeros, which would remove the intercept from the fit.
feature = sm.add_constant(feature, has_constant='add')
model = sm.OLS(target, feature).fit()
"VIF Factor": [variance_inflation_factor(feature.values, idx) for idx in range(feature.shape[1])],
"features": feature.columns
# Correlation heatmap (lower triangle only).
# Select numeric and boolean columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns inside DataFrame.corr() and raises on string columns such
# as 'Channel' / 'Language'. The matrix is computed once and reused.
corr = twitch.select_dtypes(include=['number', 'bool']).corr()
# The np.bool alias was removed in NumPy 1.24; the builtin bool is the same dtype.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle (including the diagonal) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(18, 15))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});
# Convert text / flag columns into numeric indicators
twitch['English'] = (twitch['Language'] == "English").astype('int')  # 1 if English otherwise 0
twitch['Partnered'] = twitch['Partnered'].astype('int')  # 1 if Partnered otherwise 0
twitch['Mature'] = twitch['Mature'].astype('int')  # 1 if Mature otherwise 0
# Convert the broadcast start date into the broadcasting period in days,
# measured back from the reference date 2020-09-01.
time = datetime(2020, 9, 1)
twitch['Date'] = (time - twitch['Date']).dt.days  # missing dates stay NaN
# Fill missing periods with the median. A direct column assignment is used
# because chained indexing (twitch['Date'][mask] = ...) may write to a
# temporary copy and leave the frame unchanged.
twitch['Date'] = twitch['Date'].fillna(twitch['Date'].median())
# Outlier removal
def get_outlier(df=None, column=None, weight=1.5):
    """Return the index labels of the IQR outliers in ``df[column]``.

    A value is an outlier when it lies below ``Q1 - weight * IQR`` or above
    ``Q3 + weight * IQR``, where Q1/Q3 are the 25th/75th percentiles.

    Args:
        df: DataFrame holding the data.
        column: Label of the numeric column to inspect.
        weight: Multiplier applied to the IQR to build the fences.

    Returns:
        Index of the rows whose value falls outside the fences. Missing
        values are never reported as outliers.

    Raises:
        TypeError: If ``df`` or ``column`` is not supplied.
    """
    if df is None or column is None:
        raise TypeError("get_outlier() requires both 'df' and 'column'")

    series = df[column]
    # Percentiles are taken over the non-missing values only: a single NaN
    # would otherwise turn both quartiles into NaN and hide every outlier.
    valid = series.dropna()
    if valid.empty:
        return series.index[:0]

    quantile_25 = np.percentile(valid.values, 25)
    quantile_75 = np.percentile(valid.values, 75)
    iqr_weight = (quantile_75 - quantile_25) * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight

    is_outlier = (series < lowest_val) | (series > highest_val)
    return series[is_outlier].index
# 24-hour channels
# Order the channels by stream time (longest first) and renumber the rows from 0.
twitch_str_time = twitch.sort_values(by="Stream time(minutes)", ascending=False).reset_index(drop=True)
twitch_str_time['Desc'] = '24hr channels'
# Among the first 30 positions, every row is dropped except the ones listed here.
index = [x for x in range(30) if x not in (7, 11, 13, 15, 17, 21, 27, 28)]
twitch_cln = twitch_str_time.drop(index=index).sort_values(by="Watch time(Minutes)", ascending=False)
df_2 = []
def lin_regr(data, drop_cols=None):
    """Fit every regressor/scaler combination and return the five best runs.

    For each entry of ``drop_cols`` the listed columns are removed from the
    feature set, the data is split 80/20 (``random_state=10``) and each
    regressor is fitted once per scaler inside a ``Pipeline``. All metrics are
    computed on the test split.

    Args:
        data: DataFrame containing 'Channel', 'Language', 'Desc' and
            'Followers gained' plus the feature columns. The 'Desc' value of
            the first row is used as the dataset label in the result.
        drop_cols: Iterable of column-name lists; each list is one extra set
            of columns to drop. Defaults to
            ``[[], ['Date'], ['English'], ['Partnered'],
            ['Date', 'English', 'Partnered']]``.

    Returns:
        DataFrame with one row per run (columns 'Dataset', 'Drop Columns',
        'scaler', 'rgr', 'RMSE', 'MAE', 'MAPE', 'R2_Score'), sorted by
        'R2_Score' descending and truncated to the top five rows.
    """
    # A None sentinel replaces the former mutable list default, which was a
    # single object shared by every call.
    if drop_cols is None:
        drop_cols = [[], ['Date'], ['English'], ['Partnered'], ['Date', 'English', 'Partnered']]

    # Loop-invariant pieces are computed once.
    base_drop = ['Channel', 'Language', 'Desc', 'Followers gained']
    y = data['Followers gained']
    dataset_name = data.iloc[0]['Desc']

    rows = []
    for column in drop_cols:
        X = data.drop(columns=base_drop + list(column))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
        # Fresh estimator instances for every feature set.
        rgr_list = [
            LinearRegression(),
            ElasticNet(alpha=0.1, l1_ratio=0.5),
            SVR(kernel='poly', degree=2, C=100, epsilon=0.1),
            RandomForestRegressor(max_depth=2, random_state=10),
        ]
        scaler_list = [StandardScaler(), MinMaxScaler()]
        for rgr in rgr_list:
            for scaler in scaler_list:
                pipe = Pipeline([('scaler', scaler), ('rgr', rgr)])
                pipe.fit(X_train, y_train)
                y_pred_test = pipe.predict(X_test)

                rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
                lin_mae = mean_absolute_error(y_test, y_pred_test)
                r2 = r2_score(y_test, y_pred_test)
                # NOTE: MAPE divides by the true value, so it is not finite
                # when a test target equals 0.
                mape = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100

                rows.append({
                    'Dataset': dataset_name,
                    'Drop Columns': ', '.join(column),
                    'scaler': scaler,
                    'rgr': rgr,
                    'RMSE': int(round(rmse_test)),
                    'MAE': int(round(lin_mae)),
                    'MAPE': int(round(mape)),
                    'R2_Score': round(r2, 5),
                })

    df = pd.DataFrame(rows)
    return df.sort_values(by='R2_Score', ascending=False).head()
# Compare performance across models
table_df.sort_values(by='R2_Score', ascending=False).head(20)
# Box plots of the R2 score grouped, in turn, by dropped columns, dataset,
# regressor and scaler.
for group_col, title in (
    ('Drop Columns', "R2 Score by Columns dropped"),
    ('Dataset', "R2 Score by Dataset"),
    ('rgr', "R2 Score by Rgr"),
    ('scaler', "R2 Score by Scaler"),
):
    sns.boxplot(x=group_col, y='R2_Score', data=table_df).set_title(title)
# Predict for arbitrary data
def predict_followers_gained(channel):
    """Predict 'Followers gained' for the first row of ``channel``.

    A StandardScaler + LinearRegression pipeline is fitted on the training
    part of an 80/20 split (``random_state=10``) of the module-level
    ``twitch_outlier_3`` frame, with 'Partnered' excluded from the features.
    The model is refitted on every call.

    Args:
        channel: 2-D array-like of feature rows; only the prediction for the
            first row is returned.

    Returns:
        The predicted value rounded to the nearest integer.
    """
    excluded = ['Channel', 'Language', 'Desc', 'Followers gained', 'Partnered']
    features = twitch_outlier_3.drop(columns=excluded)
    labels = twitch_outlier_3['Followers gained']
    # Only the training portion of the split is used here.
    train_X, _, train_y, _ = train_test_split(features, labels, test_size=0.2, random_state=10)

    model = Pipeline([('scaler', StandardScaler()), ('rgr', LinearRegression())])
    model.fit(train_X, train_y)

    first_prediction = model.predict(channel)[0]
    return int(round(first_prediction))
# A single sample row wrapped in an outer list.
# NOTE(review): presumably the input for predict_followers_gained, with values
# ordered like that function's feature columns — confirm against the caller.
dhtekkz = [
    [59378100, 21180, 18364, 2821, 179851, 1626543, 0, 1235, 1],
]
# Real Followers gained : 179,851 Predict Followers gained : 182,147
# In fact, there are too many external variables that affect the prediction.
# But the prediction for this channel was pretty good :)
# Adjust RandomForestRegressor's max_depth to 4
# Tuning max_depth and dropping Date column makes the best performance
y = twitch_outlier_3['Followers gained']
X = twitch_outlier_3.drop(columns=['Channel', 'Language', 'Desc', 'Followers gained', 'Date'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
estimators = [
    ('scaler', StandardScaler()),
    ('rgr', RandomForestRegressor(max_depth=4, random_state=10)),
]
pipe = Pipeline(estimators)
pipe.fit(X_train, y_train)
y_pred_test = pipe.predict(X_test)
# Metrics on the held-out 20 % test split
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
lin_mae = mean_absolute_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)
mape = np.abs((y_test - y_pred_test) / y_test).mean() * 100
print(f'RMSE : {int(round(rmse_test))}', f'MAE : {int(round(lin_mae))}',
      f'MAPE : {int(round(mape))}', f'R2_Score : {round(r2, 5)}')
### RMSE : 200823 MAE : 111958 MAPE : 172 R2_Score : 0.71019
1. Linear Regression에서 StandardScaler와 MinMaxScaler의 성능 차이가 없다는 의문점 -> 추가 검색 및 학습 예정
2. 회귀 모델들의 parameter값에 따라 성능이 크게 개선됨 -> 추가 학습 후 적용 예정
3. Support Vector Regressor(SVR)가 가장 좋은 성능을 보였음
4. 주어진 features만으로 종속 변수를 예측하기에는 종속 변수에 영향을 끼치는 외부 변수가 너무 많다는 한계점 존재