[Week 8 - Day 3] ML_basics - Practice
Model training
Assignment work
df_dataset = df_del[['total_items', 'subtotal', 'num_distinct_items',
'min_item_price', 'max_item_price', 'total_onshift', 'total_busy',
'total_outstanding_orders', 'delivery_time',
'estimated_store_to_consumer_driving_duration']]
df_dataset
from sklearn.model_selection import train_test_split
train_dataset, val_dataset = train_test_split(df_dataset, test_size=0.1)
X_train = train_dataset.drop(labels='delivery_time',axis=1)
y_train = train_dataset['delivery_time']
X_val = val_dataset.drop(labels='delivery_time',axis=1)
y_val = val_dataset['delivery_time']
print('Train features:', X_train.shape)
print('Train labels:', y_train.shape)
print('Validation features:', X_val.shape)
print('Validation labels:', y_val.shape)
X_train
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import time
start_time = time.time()  # start time
model = LinearRegression()
model.fit(X_train, y_train)
print("Elapsed time (seconds):", time.time() - start_time)  # training time
# - Evaluate on the training data
score = model.score(X_train, y_train)
print(f'Train : {score}')
# - Evaluate on the validation data
score = model.score(X_val, y_val)
print(f'Validation : {score}')
y_predict = model.predict(X_val)
plt.scatter(y_val, y_predict, alpha=0.4)
plt.xlabel("Actual time")
plt.ylabel("Predicted time")
plt.title("MULTIPLE LINEAR REGRESSION")
plt.show()
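# Side note (sketch, not part of the original notebook): R-squared alone is hard to
# interpret for delivery times, so also report the validation error as an RMSE in minutes.
# This assumes delivery_time is recorded in seconds, as the 7200-second cutoff used below suggests.
import numpy as np
from sklearn.metrics import mean_squared_error
rmse_minutes = np.sqrt(mean_squared_error(y_val, y_predict)) / 60
print(f"Validation RMSE: {rmse_minutes:.1f} minutes")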
plt.figure(figsize=(20,8))
df_del.boxplot()
plt.xticks(rotation=45)
plt.show()
df_del = df_del[df_del['delivery_time'] <= 7200] # preprocessing applied along the way
plt.figure(figsize=(20,8))
df_del.boxplot()
plt.xticks(rotation=45)
plt.show()
df_del = df_del[df_del['subtotal'] <= 20000]
df_del = df_del[df_del['min_item_price'] <= 10000]
df_del = df_del[df_del['max_item_price'] <= 10000]
plt.figure(figsize=(20,8))
df_del.boxplot()
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(20,8))
df_del[['subtotal', 'min_item_price', 'max_item_price']].boxplot()
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(20,8))
df_del[['total_items', 'num_distinct_items', 'total_onshift', 'total_busy',
'total_outstanding_orders']].boxplot()
plt.xticks(rotation=45)
plt.show()
df_del = df_del[df_del['total_items'] <= 100]
plt.figure(figsize=(20,8))
df_del[['total_items', 'num_distinct_items', 'total_onshift', 'total_busy',
'total_outstanding_orders']].boxplot()
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(20,8))
df_del[['estimated_order_place_duration',
'estimated_store_to_consumer_driving_duration']].boxplot()
plt.xticks(rotation=45)
plt.show()
df_del = df_del[df_del['estimated_order_place_duration'] <= 2500]
plt.figure(figsize=(20,8))
df_del[['estimated_order_place_duration',
'estimated_store_to_consumer_driving_duration']].boxplot()
plt.xticks(rotation=45)
plt.show()
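# Side note (sketch, assumption): the cutoffs above were read off the boxplots by eye.
# An IQR-based rule is a common alternative if the thresholds should come from the data;
# the column list and the 1.5 multiplier here are illustrative, not from the assignment.
def filter_iqr(df, cols, k=1.5):
    out = df.copy()
    for col in cols:
        q1, q3 = out[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        out = out[(out[col] >= q1 - k * iqr) & (out[col] <= q3 + k * iqr)]
    return out
# Example usage (hypothetical column subset):
# df_del = filter_iqr(df_del, ['subtotal', 'min_item_price', 'max_item_price'])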
df_del
df_del.reset_index(drop=True, inplace=True)  # re-number the rows after filtering
df_del
df_dataset = df_del[['total_items', 'subtotal', 'num_distinct_items',
'min_item_price', 'max_item_price', 'total_onshift',
'total_busy', 'total_outstanding_orders',
'estimated_store_to_consumer_driving_duration', 'delivery_time']]
df_dataset
from sklearn.model_selection import train_test_split
train_dataset, val_dataset = train_test_split(df_dataset, test_size=0.1)
X_train = train_dataset.drop(labels='delivery_time',axis=1)
y_train = train_dataset['delivery_time']
X_val = val_dataset.drop(labels='delivery_time',axis=1)
y_val = val_dataset['delivery_time']
print('Train features:', X_train.shape)
print('Train labels:', y_train.shape)
print('Validation features:', X_val.shape)
print('Validation labels:', y_val.shape)
X_train
from sklearn.linear_model import LinearRegression
import time
start_time = time.time()  # start time
model = LinearRegression()
model.fit(X_train, y_train)
print("Elapsed time (seconds):", time.time() - start_time)  # training time
# - Evaluate on the training data
score = model.score(X_train, y_train)
print(f'Train : {score}')
# - Evaluate on the validation data
score = model.score(X_val, y_val)
print(f'Validation : {score}')
y_predict = model.predict(X_val)
plt.scatter(y_val, y_predict, alpha=0.4)
plt.xlabel("Actual time")
plt.ylabel("Predicted time")
plt.title("MULTIPLE LINEAR REGRESSION")
plt.show()
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# Build the feature matrix and target
X = df_dataset.drop(labels='delivery_time', axis=1)
y = df_dataset['delivery_time']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
# Create the SGDRegressor model
model = SGDRegressor(alpha=0.01, max_iter=1000, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Predict with the trained model
y_pred = model.predict(X_val)
# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print("MSE:", mse)
print("R-squared:", r2)
# Average performance over repeated random splits
num_iterations = 100
mse_scores = []
r2_scores = []
for i in range(num_iterations):
    # Use a different split each iteration; a fixed random_state would repeat the same split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    # Collect the metrics
    mse_scores.append(mse)
    r2_scores.append(r2)
# Print the average performance
average_mse = np.mean(mse_scores)
average_r2 = np.mean(r2_scores)
print("Average MSE:", average_mse)
print("Average R-squared:", average_r2)
#
from sklearn.linear_model import Ridge
# Apply polynomial regression (degree-2 features)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)
# Predict with the trained model
y_pred = model.predict(X_val)
# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print("MSE:", mse)
print("R-squared:", r2)
#
from sklearn.linear_model import Lasso
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
model = Lasso(alpha=1.0)
model.fit(X_train, y_train)
# Predict with the trained model
y_pred = model.predict(X_val)
# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print("MSE:", mse)
print("R-squared:", r2)
#
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
model = Lasso(alpha=0.01)
model.fit(X_train, y_train)
# Predict with the trained model
y_pred = model.predict(X_val)
# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print("MSE:", mse)
print("R-squared:", r2)
#
from sklearn.model_selection import GridSearchCV
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
# Create the Lasso model
model = Lasso()
# Hyperparameter grid to search
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}
# Find the best hyperparameter with GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Print the best hyperparameter and refit the model (Lasso here, matching the grid search)
best_alpha = grid_search.best_params_['alpha']
best_model = Lasso(alpha=best_alpha)
best_model.fit(X_train, y_train)
print("Best alpha:", best_alpha)
print("Best model:", best_model)
#
from sklearn.model_selection import GridSearchCV
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
# Create the Ridge model
model = Ridge()
# Hyperparameter grid to search
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}
# Find the best hyperparameter with GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Print the best hyperparameter and refit the model
best_alpha = grid_search.best_params_['alpha']
best_model = Ridge(alpha=best_alpha)
best_model.fit(X_train, y_train)
print("Best alpha:", best_alpha)
print("Best model:", best_model)
#
from sklearn.model_selection import GridSearchCV
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
# Create the Ridge model
model = Ridge()
# Hyperparameter grid to search (larger alphas this time)
param_grid = {
    'alpha': [100, 1000, 10000],
}
# Find the best hyperparameter with GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Print the best hyperparameter and refit the model
best_alpha = grid_search.best_params_['alpha']
best_model = Ridge(alpha=best_alpha)
best_model.fit(X_train, y_train)
print("Best alpha:", best_alpha)
print("Best model:", best_model)
#
# Lasso with alpha=0.1 on the polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
model = Lasso(alpha=0.1)
model.fit(X_train, y_train)
# Predict with the trained model
y_pred = model.predict(X_val)
# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print("MSE:", mse)
print("R-squared:", r2)
print('Train score : ', model.score(X_train, y_train))
print('Validation score : ', model.score(X_val, y_val))
#
# Ridge with a large alpha on the polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.1, random_state=42)
# Create the Ridge model
#model = Lasso()
model = Ridge(alpha=1000)
model.fit(X_train, y_train)
# Predict with the trained model
y_pred = model.predict(X_val)
# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print("MSE:", mse)
print("R-squared:", r2)
print('Train score : ', model.score(X_train, y_train))
print('Validation score : ', model.score(X_val, y_val))
# Next step: inspect the learned weights to improve accuracy
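# Sketch for that weight inspection: pair the Ridge coefficients with the polynomial
# feature names (get_feature_names_out needs a reasonably recent scikit-learn version).
import pandas as pd
feature_names = poly.get_feature_names_out(X.columns)  # names of the degree-2 features
coef = pd.Series(model.coef_, index=feature_names)
print(coef.sort_values(key=abs, ascending=False).head(10))  # largest-magnitude weights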